mm/memory.c at v6.19-rc8 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / memory.c
at v6.19-rc8 7404 lines 211 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/mm/memory.c
   4 *
   5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6 */
   7
   8/*
   9 * demand-loading started 01.12.91 - seems it is high on the list of
  10 * things wanted, and it should be easy to implement. - Linus
  11 */
  12
  13/*
  14 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
  15 * pages started 02.12.91, seems to work. - Linus.
  16 *
  17 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
  18 * would have taken more than the 6M I have free, but it worked well as
  19 * far as I could see.
  20 *
  21 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
  22 */
  23
  24/*
  25 * Real VM (paging to/from disk) started 18.12.91. Much more work and
  26 * thought has to go into this. Oh, well..
  27 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
  28 *		Found it. Everything seems to work now.
  29 * 20.12.91  -  Ok, making the swap-device changeable like the root.
  30 */
  31
  32/*
  33 * 05.04.94  -  Multi-page memory management added for v1.1.
  34 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  35 *
  36 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
  37 *		(Gerhard.Wichert@pdb.siemens.de)
  38 *
  39 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
  40 */
  41
  42#include <linux/kernel_stat.h>
  43#include <linux/mm.h>
  44#include <linux/mm_inline.h>
  45#include <linux/sched/mm.h>
  46#include <linux/sched/numa_balancing.h>
  47#include <linux/sched/task.h>
  48#include <linux/hugetlb.h>
  49#include <linux/mman.h>
  50#include <linux/swap.h>
  51#include <linux/highmem.h>
  52#include <linux/pagemap.h>
  53#include <linux/memremap.h>
  54#include <linux/kmsan.h>
  55#include <linux/ksm.h>
  56#include <linux/rmap.h>
  57#include <linux/export.h>
  58#include <linux/delayacct.h>
  59#include <linux/init.h>
  60#include <linux/writeback.h>
  61#include <linux/memcontrol.h>
  62#include <linux/mmu_notifier.h>
  63#include <linux/leafops.h>
  64#include <linux/elf.h>
  65#include <linux/gfp.h>
  66#include <linux/migrate.h>
  67#include <linux/string.h>
  68#include <linux/shmem_fs.h>
  69#include <linux/memory-tiers.h>
  70#include <linux/debugfs.h>
  71#include <linux/userfaultfd_k.h>
  72#include <linux/dax.h>
  73#include <linux/oom.h>
  74#include <linux/numa.h>
  75#include <linux/perf_event.h>
  76#include <linux/ptrace.h>
  77#include <linux/vmalloc.h>
  78#include <linux/sched/sysctl.h>
  79#include <linux/pgalloc.h>
  80#include <linux/uaccess.h>
  81
  82#include <trace/events/kmem.h>
  83
  84#include <asm/io.h>
  85#include <asm/mmu_context.h>
  86#include <asm/tlb.h>
  87#include <asm/tlbflush.h>
  88
  89#include "pgalloc-track.h"
  90#include "internal.h"
  91#include "swap.h"
  92
  93#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
  94#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
  95#endif
  96
  97static vm_fault_t do_fault(struct vm_fault *vmf);
  98static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
  99static bool vmf_pte_changed(struct vm_fault *vmf);
 100
 101/*
 102 * Return true if the original pte was a uffd-wp pte marker (so the pte was
 103 * wr-protected).
 104 */
 105static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
 106{
 107	if (!userfaultfd_wp(vmf->vma))
 108		return false;
 109	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
 110		return false;
 111
 112	return pte_is_uffd_wp_marker(vmf->orig_pte);
 113}
 114
 115/*
 116 * Randomize the address space (stacks, mmaps, brk, etc.).
 117 *
 118 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 119 *   as ancient (libc5 based) binaries can segfault. )
 120 */
 121int randomize_va_space __read_mostly =
 122#ifdef CONFIG_COMPAT_BRK
 123					1;
 124#else
 125					2;
 126#endif
 127
 128static const struct ctl_table mmu_sysctl_table[] = {
 129	{
 130		.procname	= "randomize_va_space",
 131		.data		= &randomize_va_space,
 132		.maxlen		= sizeof(int),
 133		.mode		= 0644,
 134		.proc_handler	= proc_dointvec,
 135	},
 136};
 137
 138static int __init init_mm_sysctl(void)
 139{
 140	register_sysctl_init("kernel", mmu_sysctl_table);
 141	return 0;
 142}
 143
 144subsys_initcall(init_mm_sysctl);
 145
 146#ifndef arch_wants_old_prefaulted_pte
 147static inline bool arch_wants_old_prefaulted_pte(void)
 148{
 149	/*
 150	 * Transitioning a PTE from 'old' to 'young' can be expensive on
 151	 * some architectures, even if it's performed in hardware. By
 152	 * default, "false" means prefaulted entries will be 'young'.
 153	 */
 154	return false;
 155}
 156#endif
 157
 158static int __init disable_randmaps(char *s)
 159{
 160	randomize_va_space = 0;
 161	return 1;
 162}
 163__setup("norandmaps", disable_randmaps);
 164
 165unsigned long zero_pfn __read_mostly;
 166EXPORT_SYMBOL(zero_pfn);
 167
 168unsigned long highest_memmap_pfn __read_mostly;
 169
 170/*
 171 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 172 */
 173static int __init init_zero_pfn(void)
 174{
 175	zero_pfn = page_to_pfn(ZERO_PAGE(0));
 176	return 0;
 177}
 178early_initcall(init_zero_pfn);
 179
 180void mm_trace_rss_stat(struct mm_struct *mm, int member)
 181{
 182	trace_rss_stat(mm, member);
 183}
 184
 185/*
 186 * Note: this doesn't free the actual pages themselves. That
 187 * has been handled earlier when unmapping all the memory regions.
 188 */
 189static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 190			   unsigned long addr)
 191{
 192	pgtable_t token = pmd_pgtable(*pmd);
 193	pmd_clear(pmd);
 194	pte_free_tlb(tlb, token, addr);
 195	mm_dec_nr_ptes(tlb->mm);
 196}
 197
 198static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 199				unsigned long addr, unsigned long end,
 200				unsigned long floor, unsigned long ceiling)
 201{
 202	pmd_t *pmd;
 203	unsigned long next;
 204	unsigned long start;
 205
 206	start = addr;
 207	pmd = pmd_offset(pud, addr);
 208	do {
 209		next = pmd_addr_end(addr, end);
 210		if (pmd_none_or_clear_bad(pmd))
 211			continue;
 212		free_pte_range(tlb, pmd, addr);
 213	} while (pmd++, addr = next, addr != end);
 214
 215	start &= PUD_MASK;
 216	if (start < floor)
 217		return;
 218	if (ceiling) {
 219		ceiling &= PUD_MASK;
 220		if (!ceiling)
 221			return;
 222	}
 223	if (end - 1 > ceiling - 1)
 224		return;
 225
 226	pmd = pmd_offset(pud, start);
 227	pud_clear(pud);
 228	pmd_free_tlb(tlb, pmd, start);
 229	mm_dec_nr_pmds(tlb->mm);
 230}
 231
 232static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 233				unsigned long addr, unsigned long end,
 234				unsigned long floor, unsigned long ceiling)
 235{
 236	pud_t *pud;
 237	unsigned long next;
 238	unsigned long start;
 239
 240	start = addr;
 241	pud = pud_offset(p4d, addr);
 242	do {
 243		next = pud_addr_end(addr, end);
 244		if (pud_none_or_clear_bad(pud))
 245			continue;
 246		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 247	} while (pud++, addr = next, addr != end);
 248
 249	start &= P4D_MASK;
 250	if (start < floor)
 251		return;
 252	if (ceiling) {
 253		ceiling &= P4D_MASK;
 254		if (!ceiling)
 255			return;
 256	}
 257	if (end - 1 > ceiling - 1)
 258		return;
 259
 260	pud = pud_offset(p4d, start);
 261	p4d_clear(p4d);
 262	pud_free_tlb(tlb, pud, start);
 263	mm_dec_nr_puds(tlb->mm);
 264}
 265
 266static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
 267				unsigned long addr, unsigned long end,
 268				unsigned long floor, unsigned long ceiling)
 269{
 270	p4d_t *p4d;
 271	unsigned long next;
 272	unsigned long start;
 273
 274	start = addr;
 275	p4d = p4d_offset(pgd, addr);
 276	do {
 277		next = p4d_addr_end(addr, end);
 278		if (p4d_none_or_clear_bad(p4d))
 279			continue;
 280		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
 281	} while (p4d++, addr = next, addr != end);
 282
 283	start &= PGDIR_MASK;
 284	if (start < floor)
 285		return;
 286	if (ceiling) {
 287		ceiling &= PGDIR_MASK;
 288		if (!ceiling)
 289			return;
 290	}
 291	if (end - 1 > ceiling - 1)
 292		return;
 293
 294	p4d = p4d_offset(pgd, start);
 295	pgd_clear(pgd);
 296	p4d_free_tlb(tlb, p4d, start);
 297}
 298
 299/**
 300 * free_pgd_range - Unmap and free page tables in the range
 301 * @tlb: the mmu_gather containing pending TLB flush info
 302 * @addr: virtual address start
 303 * @end: virtual address end
 304 * @floor: lowest address boundary
 305 * @ceiling: highest address boundary
 306 *
 307 * This function tears down all user-level page tables in the
 308 * specified virtual address range [@addr..@end). It is part of
 309 * the memory unmap flow.
 310 */
 311void free_pgd_range(struct mmu_gather *tlb,
 312			unsigned long addr, unsigned long end,
 313			unsigned long floor, unsigned long ceiling)
 314{
 315	pgd_t *pgd;
 316	unsigned long next;
 317
 318	/*
 319	 * The next few lines have given us lots of grief...
 320	 *
 321	 * Why are we testing PMD* at this top level?  Because often
 322	 * there will be no work to do at all, and we'd prefer not to
 323	 * go all the way down to the bottom just to discover that.
 324	 *
 325	 * Why all these "- 1"s?  Because 0 represents both the bottom
 326	 * of the address space and the top of it (using -1 for the
 327	 * top wouldn't help much: the masks would do the wrong thing).
 328	 * The rule is that addr 0 and floor 0 refer to the bottom of
 329	 * the address space, but end 0 and ceiling 0 refer to the top
 330	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
 331	 * that end 0 case should be mythical).
 332	 *
 333	 * Wherever addr is brought up or ceiling brought down, we must
 334	 * be careful to reject "the opposite 0" before it confuses the
 335	 * subsequent tests.  But what about where end is brought down
 336	 * by PMD_SIZE below? no, end can't go down to 0 there.
 337	 *
 338	 * Whereas we round start (addr) and ceiling down, by different
 339	 * masks at different levels, in order to test whether a table
 340	 * now has no other vmas using it, so can be freed, we don't
 341	 * bother to round floor or end up - the tests don't need that.
 342	 */
 343
 344	addr &= PMD_MASK;
 345	if (addr < floor) {
 346		addr += PMD_SIZE;
 347		if (!addr)
 348			return;
 349	}
 350	if (ceiling) {
 351		ceiling &= PMD_MASK;
 352		if (!ceiling)
 353			return;
 354	}
 355	if (end - 1 > ceiling - 1)
 356		end -= PMD_SIZE;
 357	if (addr > end - 1)
 358		return;
 359	/*
 360	 * We add page table cache pages with PAGE_SIZE,
 361	 * (see pte_free_tlb()), flush the tlb if we need
 362	 */
 363	tlb_change_page_size(tlb, PAGE_SIZE);
 364	pgd = pgd_offset(tlb->mm, addr);
 365	do {
 366		next = pgd_addr_end(addr, end);
 367		if (pgd_none_or_clear_bad(pgd))
 368			continue;
 369		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
 370	} while (pgd++, addr = next, addr != end);
 371}
 372
 373void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 374		   struct vm_area_struct *vma, unsigned long floor,
 375		   unsigned long ceiling, bool mm_wr_locked)
 376{
 377	struct unlink_vma_file_batch vb;
 378
 379	tlb_free_vmas(tlb);
 380
 381	do {
 382		unsigned long addr = vma->vm_start;
 383		struct vm_area_struct *next;
 384
 385		/*
 386		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
 387		 * be 0.  This will underflow and is okay.
 388		 */
 389		next = mas_find(mas, ceiling - 1);
 390		if (unlikely(xa_is_zero(next)))
 391			next = NULL;
 392
 393		/*
 394		 * Hide vma from rmap and truncate_pagecache before freeing
 395		 * pgtables
 396		 */
 397		if (mm_wr_locked)
 398			vma_start_write(vma);
 399		unlink_anon_vmas(vma);
 400
 401		unlink_file_vma_batch_init(&vb);
 402		unlink_file_vma_batch_add(&vb, vma);
 403
 404		/*
 405		 * Optimization: gather nearby vmas into one call down
 406		 */
 407		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 408			vma = next;
 409			next = mas_find(mas, ceiling - 1);
 410			if (unlikely(xa_is_zero(next)))
 411				next = NULL;
 412			if (mm_wr_locked)
 413				vma_start_write(vma);
 414			unlink_anon_vmas(vma);
 415			unlink_file_vma_batch_add(&vb, vma);
 416		}
 417		unlink_file_vma_batch_final(&vb);
 418
 419		free_pgd_range(tlb, addr, vma->vm_end,
 420			floor, next ? next->vm_start : ceiling);
 421		vma = next;
 422	} while (vma);
 423}
 424
 425void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 426{
 427	spinlock_t *ptl = pmd_lock(mm, pmd);
 428
 429	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 430		mm_inc_nr_ptes(mm);
 431		/*
 432		 * Ensure all pte setup (eg. pte page lock and page clearing) are
 433		 * visible before the pte is made visible to other CPUs by being
 434		 * put into page tables.
 435		 *
 436		 * The other side of the story is the pointer chasing in the page
 437		 * table walking code (when walking the page table without locking;
 438		 * ie. most of the time). Fortunately, these data accesses consist
 439		 * of a chain of data-dependent loads, meaning most CPUs (alpha
 440		 * being the notable exception) will already guarantee loads are
 441		 * seen in-order. See the alpha page table accessors for the
 442		 * smp_rmb() barriers in page table walking code.
 443		 */
 444		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 445		pmd_populate(mm, pmd, *pte);
 446		*pte = NULL;
 447	}
 448	spin_unlock(ptl);
 449}
 450
 451int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
 452{
 453	pgtable_t new = pte_alloc_one(mm);
 454	if (!new)
 455		return -ENOMEM;
 456
 457	pmd_install(mm, pmd, &new);
 458	if (new)
 459		pte_free(mm, new);
 460	return 0;
 461}
 462
 463int __pte_alloc_kernel(pmd_t *pmd)
 464{
 465	pte_t *new = pte_alloc_one_kernel(&init_mm);
 466	if (!new)
 467		return -ENOMEM;
 468
 469	spin_lock(&init_mm.page_table_lock);
 470	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 471		smp_wmb(); /* See comment in pmd_install() */
 472		pmd_populate_kernel(&init_mm, pmd, new);
 473		new = NULL;
 474	}
 475	spin_unlock(&init_mm.page_table_lock);
 476	if (new)
 477		pte_free_kernel(&init_mm, new);
 478	return 0;
 479}
 480
 481static inline void init_rss_vec(int *rss)
 482{
 483	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
 484}
 485
 486static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 487{
 488	int i;
 489
 490	for (i = 0; i < NR_MM_COUNTERS; i++)
 491		if (rss[i])
 492			add_mm_counter(mm, i, rss[i]);
 493}
 494
 495static bool is_bad_page_map_ratelimited(void)
 496{
 497	static unsigned long resume;
 498	static unsigned long nr_shown;
 499	static unsigned long nr_unshown;
 500
 501	/*
 502	 * Allow a burst of 60 reports, then keep quiet for that minute;
 503	 * or allow a steady drip of one report per second.
 504	 */
 505	if (nr_shown == 60) {
 506		if (time_before(jiffies, resume)) {
 507			nr_unshown++;
 508			return true;
 509		}
 510		if (nr_unshown) {
 511			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
 512				 nr_unshown);
 513			nr_unshown = 0;
 514		}
 515		nr_shown = 0;
 516	}
 517	if (nr_shown++ == 0)
 518		resume = jiffies + 60 * HZ;
 519	return false;
 520}
 521
 522static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr)
 523{
 524	unsigned long long pgdv, p4dv, pudv, pmdv;
 525	p4d_t p4d, *p4dp;
 526	pud_t pud, *pudp;
 527	pmd_t pmd, *pmdp;
 528	pgd_t *pgdp;
 529
 530	/*
 531	 * Although this looks like a fully lockless pgtable walk, it is not:
 532	 * see locking requirements for print_bad_page_map().
 533	 */
 534	pgdp = pgd_offset(mm, addr);
 535	pgdv = pgd_val(*pgdp);
 536
 537	if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) {
 538		pr_alert("pgd:%08llx\n", pgdv);
 539		return;
 540	}
 541
 542	p4dp = p4d_offset(pgdp, addr);
 543	p4d = p4dp_get(p4dp);
 544	p4dv = p4d_val(p4d);
 545
 546	if (!p4d_present(p4d) || p4d_leaf(p4d)) {
 547		pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
 548		return;
 549	}
 550
 551	pudp = pud_offset(p4dp, addr);
 552	pud = pudp_get(pudp);
 553	pudv = pud_val(pud);
 554
 555	if (!pud_present(pud) || pud_leaf(pud)) {
 556		pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
 557		return;
 558	}
 559
 560	pmdp = pmd_offset(pudp, addr);
 561	pmd = pmdp_get(pmdp);
 562	pmdv = pmd_val(pmd);
 563
 564	/*
 565	 * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
 566	 * because the table should already be mapped by the caller and
 567	 * doing another map would be bad. print_bad_page_map() should
 568	 * already take care of printing the PTE.
 569	 */
 570	pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
 571		 p4dv, pudv, pmdv);
 572}
 573
 574/*
 575 * This function is called to print an error when a bad page table entry (e.g.,
 576 * corrupted page table entry) is found. For example, we might have a
 577 * PFN-mapped pte in a region that doesn't allow it.
 578 *
 579 * The calling function must still handle the error.
 580 *
 581 * This function must be called during a proper page table walk, as it will
 582 * re-walk the page table to dump information: the caller MUST prevent page
 583 * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
 584 * page table lock.
 585 */
 586static void print_bad_page_map(struct vm_area_struct *vma,
 587		unsigned long addr, unsigned long long entry, struct page *page,
 588		enum pgtable_level level)
 589{
 590	struct address_space *mapping;
 591	pgoff_t index;
 592
 593	if (is_bad_page_map_ratelimited())
 594		return;
 595
 596	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
 597	index = linear_page_index(vma, addr);
 598
 599	pr_alert("BUG: Bad page map in process %s  %s:%08llx", current->comm,
 600		 pgtable_level_to_str(level), entry);
 601	__print_bad_page_map_pgtable(vma->vm_mm, addr);
 602	if (page)
 603		dump_page(page, "bad page map");
 604	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
 605		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
 606	pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
 607		 vma->vm_file,
 608		 vma->vm_ops ? vma->vm_ops->fault : NULL,
 609		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
 610		 vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
 611		 mapping ? mapping->a_ops->read_folio : NULL);
 612	dump_stack();
 613	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 614}
 615#define print_bad_pte(vma, addr, pte, page) \
 616	print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
 617
 618/**
 619 * __vm_normal_page() - Get the "struct page" associated with a page table entry.
 620 * @vma: The VMA mapping the page table entry.
 621 * @addr: The address where the page table entry is mapped.
 622 * @pfn: The PFN stored in the page table entry.
 623 * @special: Whether the page table entry is marked "special".
 624 * @level: The page table level for error reporting purposes only.
 625 * @entry: The page table entry value for error reporting purposes only.
 626 *
 627 * "Special" mappings do not wish to be associated with a "struct page" (either
 628 * it doesn't exist, or it exists but they don't want to touch it). In this
 629 * case, NULL is returned here. "Normal" mappings do have a struct page and
 630 * are ordinarily refcounted.
 631 *
 632 * Page mappings of the shared zero folios are always considered "special", as
 633 * they are not ordinarily refcounted: neither the refcount nor the mapcount
 634 * of these folios is adjusted when mapping them into user page tables.
 635 * Selected page table walkers (such as GUP) can still identify mappings of the
 636 * shared zero folios and work with the underlying "struct page".
 637 *
 638 * There are 2 broad cases. Firstly, an architecture may define a "special"
 639 * page table entry bit, such as pte_special(), in which case this function is
 640 * trivial. Secondly, an architecture may not have a spare page table
 641 * entry bit, which requires a more complicated scheme, described below.
 642 *
 643 * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
 644 * page table entries that actually map "normal" pages: however, that page
 645 * cannot be looked up through the PFN stored in the page table entry, but
 646 * instead will be looked up through vm_ops->find_normal_page(). So far, this
 647 * only applies to PTEs.
 648 *
 649 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 650 * special mapping (even if there are underlying and valid "struct pages").
 651 * COWed pages of a VM_PFNMAP are always normal.
 652 *
 653 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 654 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 655 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 656 * mapping will always honor the rule
 657 *
 658 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 659 *
 660 * And for normal mappings this is false.
 661 *
 662 * This restricts such mappings to be a linear translation from virtual address
 663 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 664 * as the vma is not a COW mapping; in that case, we know that all ptes are
 665 * special (because none can have been COWed).
 666 *
 667 *
 668 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 669 *
 670 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 671 * page" backing, however the difference is that _all_ pages with a struct
 672 * page (that is, those where pfn_valid is true, except the shared zero
 673 * folios) are refcounted and considered normal pages by the VM.
 674 *
 675 * The disadvantage is that pages are refcounted (which can be slower and
 676 * simply not an option for some PFNMAP users). The advantage is that we
 677 * don't have to follow the strict linearity rule of PFNMAP mappings in
 678 * order to support COWable mappings.
 679 *
 680 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 681 *	   NULL if this is a "special" mapping.
 682 */
 683static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
 684		unsigned long addr, unsigned long pfn, bool special,
 685		unsigned long long entry, enum pgtable_level level)
 686{
 687	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
 688		if (unlikely(special)) {
 689#ifdef CONFIG_FIND_NORMAL_PAGE
 690			if (vma->vm_ops && vma->vm_ops->find_normal_page)
 691				return vma->vm_ops->find_normal_page(vma, addr);
 692#endif /* CONFIG_FIND_NORMAL_PAGE */
 693			if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
 694				return NULL;
 695			if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
 696				return NULL;
 697
 698			print_bad_page_map(vma, addr, entry, NULL, level);
 699			return NULL;
 700		}
 701		/*
 702		 * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
 703		 * mappings (incl. shared zero folios) are marked accordingly.
 704		 */
 705	} else {
 706		if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) {
 707			if (vma->vm_flags & VM_MIXEDMAP) {
 708				/* If it has a "struct page", it's "normal". */
 709				if (!pfn_valid(pfn))
 710					return NULL;
 711			} else {
 712				unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
 713
 714				/* Only CoW'ed anon folios are "normal". */
 715				if (pfn == vma->vm_pgoff + off)
 716					return NULL;
 717				if (!is_cow_mapping(vma->vm_flags))
 718					return NULL;
 719			}
 720		}
 721
 722		if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
 723			return NULL;
 724	}
 725
 726	if (unlikely(pfn > highest_memmap_pfn)) {
 727		/* Corrupted page table entry. */
 728		print_bad_page_map(vma, addr, entry, NULL, level);
 729		return NULL;
 730	}
 731	/*
 732	 * NOTE! We still have PageReserved() pages in the page tables.
 733	 * For example, VDSO mappings can cause them to exist.
 734	 */
 735	VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn));
 736	return pfn_to_page(pfn);
 737}
 738
 739/**
 740 * vm_normal_page() - Get the "struct page" associated with a PTE
 741 * @vma: The VMA mapping the @pte.
 742 * @addr: The address where the @pte is mapped.
 743 * @pte: The PTE.
 744 *
 745 * Get the "struct page" associated with a PTE. See __vm_normal_page()
 746 * for details on "normal" and "special" mappings.
 747 *
 748 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 749 *	   NULL if this is a "special" mapping.
 750 */
 751struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 752			    pte_t pte)
 753{
 754	return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte),
 755				pte_val(pte), PGTABLE_LEVEL_PTE);
 756}
 757
 758/**
 759 * vm_normal_folio() - Get the "struct folio" associated with a PTE
 760 * @vma: The VMA mapping the @pte.
 761 * @addr: The address where the @pte is mapped.
 762 * @pte: The PTE.
 763 *
 764 * Get the "struct folio" associated with a PTE. See __vm_normal_page()
 765 * for details on "normal" and "special" mappings.
 766 *
 767 * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
 768 *	   NULL if this is a "special" mapping.
 769 */
 770struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
 771			    pte_t pte)
 772{
 773	struct page *page = vm_normal_page(vma, addr, pte);
 774
 775	if (page)
 776		return page_folio(page);
 777	return NULL;
 778}
 779
 780#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
 781/**
 782 * vm_normal_page_pmd() - Get the "struct page" associated with a PMD
 783 * @vma: The VMA mapping the @pmd.
 784 * @addr: The address where the @pmd is mapped.
 785 * @pmd: The PMD.
 786 *
 787 * Get the "struct page" associated with a PTE. See __vm_normal_page()
 788 * for details on "normal" and "special" mappings.
 789 *
 790 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 791 *	   NULL if this is a "special" mapping.
 792 */
 793struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 794				pmd_t pmd)
 795{
 796	return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd),
 797				pmd_val(pmd), PGTABLE_LEVEL_PMD);
 798}
 799
 800/**
 801 * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
 802 * @vma: The VMA mapping the @pmd.
 803 * @addr: The address where the @pmd is mapped.
 804 * @pmd: The PMD.
 805 *
 806 * Get the "struct folio" associated with a PTE. See __vm_normal_page()
 807 * for details on "normal" and "special" mappings.
 808 *
 809 * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
 810 *	   NULL if this is a "special" mapping.
 811 */
 812struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
 813				  unsigned long addr, pmd_t pmd)
 814{
 815	struct page *page = vm_normal_page_pmd(vma, addr, pmd);
 816
 817	if (page)
 818		return page_folio(page);
 819	return NULL;
 820}
 821
 822/**
 823 * vm_normal_page_pud() - Get the "struct page" associated with a PUD
 824 * @vma: The VMA mapping the @pud.
 825 * @addr: The address where the @pud is mapped.
 826 * @pud: The PUD.
 827 *
 828 * Get the "struct page" associated with a PUD. See __vm_normal_page()
 829 * for details on "normal" and "special" mappings.
 830 *
 831 * Return: Returns the "struct page" if this is a "normal" mapping. Returns
 832 *	   NULL if this is a "special" mapping.
 833 */
 834struct page *vm_normal_page_pud(struct vm_area_struct *vma,
 835		unsigned long addr, pud_t pud)
 836{
 837	return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
 838				pud_val(pud), PGTABLE_LEVEL_PUD);
 839}
 840#endif
 841
 842/**
 843 * restore_exclusive_pte - Restore a device-exclusive entry
 844 * @vma: VMA covering @address
 845 * @folio: the mapped folio
 846 * @page: the mapped folio page
 847 * @address: the virtual address
 848 * @ptep: pte pointer into the locked page table mapping the folio page
 849 * @orig_pte: pte value at @ptep
 850 *
 851 * Restore a device-exclusive non-swap entry to an ordinary present pte.
 852 *
 853 * The folio and the page table must be locked, and MMU notifiers must have
 854 * been called to invalidate any (exclusive) device mappings.
 855 *
 856 * Locking the folio makes sure that anybody who just converted the pte to
 857 * a device-exclusive entry can map it into the device to make forward
 858 * progress without others converting it back until the folio was unlocked.
 859 *
 860 * If the folio lock ever becomes an issue, we can stop relying on the folio
 861 * lock; it might make some scenarios with heavy thrashing less likely to
 862 * make forward progress, but these scenarios might not be valid use cases.
 863 *
 864 * Note that the folio lock does not protect against all cases of concurrent
 865 * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
 866 * must use MMU notifiers to sync against any concurrent changes.
 867 */
 868static void restore_exclusive_pte(struct vm_area_struct *vma,
 869		struct folio *folio, struct page *page, unsigned long address,
 870		pte_t *ptep, pte_t orig_pte)
 871{
 872	pte_t pte;
 873
 874	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
 875
 876	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
 877	if (pte_swp_soft_dirty(orig_pte))
 878		pte = pte_mksoft_dirty(pte);
 879
 880	if (pte_swp_uffd_wp(orig_pte))
 881		pte = pte_mkuffd_wp(pte);
 882
 883	if ((vma->vm_flags & VM_WRITE) &&
 884	    can_change_pte_writable(vma, address, pte)) {
 885		if (folio_test_dirty(folio))
 886			pte = pte_mkdirty(pte);
 887		pte = pte_mkwrite(pte, vma);
 888	}
 889	set_pte_at(vma->vm_mm, address, ptep, pte);
 890
 891	/*
 892	 * No need to invalidate - it was non-present before. However
 893	 * secondary CPUs may have mappings that need invalidating.
 894	 */
 895	update_mmu_cache(vma, address, ptep);
 896}
 897
 898/*
 899 * Tries to restore an exclusive pte if the page lock can be acquired without
 900 * sleeping.
 901 */
 902static int try_restore_exclusive_pte(struct vm_area_struct *vma,
 903		unsigned long addr, pte_t *ptep, pte_t orig_pte)
 904{
 905	const softleaf_t entry = softleaf_from_pte(orig_pte);
 906	struct page *page = softleaf_to_page(entry);
 907	struct folio *folio = page_folio(page);
 908
 909	if (folio_trylock(folio)) {
 910		restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
 911		folio_unlock(folio);
 912		return 0;
 913	}
 914
 915	return -EBUSY;
 916}
 917
 918/*
 919 * copy one vm_area from one task to the other. Assumes the page tables
 920 * already present in the new task to be cleared in the whole range
 921 * covered by this vma.
 922 */
 923
 924static unsigned long
 925copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 926		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
 927		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 928{
 929	vm_flags_t vm_flags = dst_vma->vm_flags;
 930	pte_t orig_pte = ptep_get(src_pte);
 931	softleaf_t entry = softleaf_from_pte(orig_pte);
 932	pte_t pte = orig_pte;
 933	struct folio *folio;
 934	struct page *page;
 935
 936	if (likely(softleaf_is_swap(entry))) {
 937		if (swap_duplicate(entry) < 0)
 938			return -EIO;
 939
 940		/* make sure dst_mm is on swapoff's mmlist. */
 941		if (unlikely(list_empty(&dst_mm->mmlist))) {
 942			spin_lock(&mmlist_lock);
 943			if (list_empty(&dst_mm->mmlist))
 944				list_add(&dst_mm->mmlist,
 945						&src_mm->mmlist);
 946			spin_unlock(&mmlist_lock);
 947		}
 948		/* Mark the swap entry as shared. */
 949		if (pte_swp_exclusive(orig_pte)) {
 950			pte = pte_swp_clear_exclusive(orig_pte);
 951			set_pte_at(src_mm, addr, src_pte, pte);
 952		}
 953		rss[MM_SWAPENTS]++;
 954	} else if (softleaf_is_migration(entry)) {
 955		folio = softleaf_to_folio(entry);
 956
 957		rss[mm_counter(folio)]++;
 958
 959		if (!softleaf_is_migration_read(entry) &&
 960				is_cow_mapping(vm_flags)) {
 961			/*
 962			 * COW mappings require pages in both parent and child
 963			 * to be set to read. A previously exclusive entry is
 964			 * now shared.
 965			 */
 966			entry = make_readable_migration_entry(
 967							swp_offset(entry));
 968			pte = softleaf_to_pte(entry);
 969			if (pte_swp_soft_dirty(orig_pte))
 970				pte = pte_swp_mksoft_dirty(pte);
 971			if (pte_swp_uffd_wp(orig_pte))
 972				pte = pte_swp_mkuffd_wp(pte);
 973			set_pte_at(src_mm, addr, src_pte, pte);
 974		}
 975	} else if (softleaf_is_device_private(entry)) {
 976		page = softleaf_to_page(entry);
 977		folio = page_folio(page);
 978
 979		/*
 980		 * Update rss count even for unaddressable pages, as
 981		 * they should treated just like normal pages in this
 982		 * respect.
 983		 *
 984		 * We will likely want to have some new rss counters
 985		 * for unaddressable pages, at some point. But for now
 986		 * keep things as they are.
 987		 */
 988		folio_get(folio);
 989		rss[mm_counter(folio)]++;
 990		/* Cannot fail as these pages cannot get pinned. */
 991		folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);
 992
 993		/*
 994		 * We do not preserve soft-dirty information, because so
 995		 * far, checkpoint/restore is the only feature that
 996		 * requires that. And checkpoint/restore does not work
 997		 * when a device driver is involved (you cannot easily
 998		 * save and restore device driver state).
 999		 */
1000		if (softleaf_is_device_private_write(entry) &&
1001		    is_cow_mapping(vm_flags)) {
1002			entry = make_readable_device_private_entry(
1003							swp_offset(entry));
1004			pte = swp_entry_to_pte(entry);
1005			if (pte_swp_uffd_wp(orig_pte))
1006				pte = pte_swp_mkuffd_wp(pte);
1007			set_pte_at(src_mm, addr, src_pte, pte);
1008		}
1009	} else if (softleaf_is_device_exclusive(entry)) {
1010		/*
1011		 * Make device exclusive entries present by restoring the
1012		 * original entry then copying as for a present pte. Device
1013		 * exclusive entries currently only support private writable
1014		 * (ie. COW) mappings.
1015		 */
1016		VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
1017		if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
1018			return -EBUSY;
1019		return -ENOENT;
1020	} else if (softleaf_is_marker(entry)) {
1021		pte_marker marker = copy_pte_marker(entry, dst_vma);
1022
1023		if (marker)
1024			set_pte_at(dst_mm, addr, dst_pte,
1025				   make_pte_marker(marker));
1026		return 0;
1027	}
1028	if (!userfaultfd_wp(dst_vma))
1029		pte = pte_swp_clear_uffd_wp(pte);
1030	set_pte_at(dst_mm, addr, dst_pte, pte);
1031	return 0;
1032}
1033
1034/*
1035 * Copy a present and normal page.
1036 *
1037 * NOTE! The usual case is that this isn't required;
1038 * instead, the caller can just increase the page refcount
1039 * and re-use the pte the traditional way.
1040 *
1041 * And if we need a pre-allocated page but don't yet have
1042 * one, return a negative error to let the preallocation
1043 * code know so that it can do so outside the page table
1044 * lock.
1045 */
1046static inline int
1047copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1048		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
1049		  struct folio **prealloc, struct page *page)
1050{
1051	struct folio *new_folio;
1052	pte_t pte;
1053
1054	new_folio = *prealloc;
1055	if (!new_folio)
1056		return -EAGAIN;
1057
1058	/*
1059	 * We have a prealloc page, all good!  Take it
1060	 * over and copy the page & arm it.
1061	 */
1062
1063	if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma))
1064		return -EHWPOISON;
1065
1066	*prealloc = NULL;
1067	__folio_mark_uptodate(new_folio);
1068	folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE);
1069	folio_add_lru_vma(new_folio, dst_vma);
1070	rss[MM_ANONPAGES]++;
1071
1072	/* All done, just insert the new page copy in the child */
1073	pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
1074	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
1075	if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
1076		/* Uffd-wp needs to be delivered to dest pte as well */
1077		pte = pte_mkuffd_wp(pte);
1078	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
1079	return 0;
1080}
1081
1082static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
1083		struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
1084		pte_t pte, unsigned long addr, int nr)
1085{
1086	struct mm_struct *src_mm = src_vma->vm_mm;
1087
1088	/* If it's a COW mapping, write protect it both processes. */
1089	if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
1090		wrprotect_ptes(src_mm, addr, src_pte, nr);
1091		pte = pte_wrprotect(pte);
1092	}
1093
1094	/* If it's a shared mapping, mark it clean in the child. */
1095	if (src_vma->vm_flags & VM_SHARED)
1096		pte = pte_mkclean(pte);
1097	pte = pte_mkold(pte);
1098
1099	if (!userfaultfd_wp(dst_vma))
1100		pte = pte_clear_uffd_wp(pte);
1101
1102	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
1103}
1104
/*
 * Copy one present PTE, trying to batch-process subsequent PTEs that map
 * consecutive pages of the same folio by copying them as well.
 *
 * Returns -EAGAIN if one preallocated page is required to copy the next PTE,
 * or -EHWPOISON if copy_present_page() failed to copy the source page.
 * Otherwise, returns the number of copied PTEs (at least 1).
 */
static inline int
copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
		 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
		 int max_nr, int *rss, struct folio **prealloc)
{
	fpb_t flags = FPB_MERGE_WRITE;
	struct page *page;
	struct folio *folio;
	int err, nr;

	page = vm_normal_page(src_vma, addr, pte);
	/* No normal page: copy the pte itself, without refcount or rmap work. */
	if (unlikely(!page))
		goto copy_pte;

	folio = page_folio(page);

	/*
	 * If we likely have to copy, just don't bother with batching. Make
	 * sure that the common "small folio" case is as fast as possible
	 * by keeping the batching logic separate.
	 */
	if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
		if (!(src_vma->vm_flags & VM_SHARED))
			flags |= FPB_RESPECT_DIRTY;
		if (vma_soft_dirty_enabled(src_vma))
			flags |= FPB_RESPECT_SOFT_DIRTY;

		nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags);
		/* One folio reference per pte in the batch. */
		folio_ref_add(folio, nr);
		if (folio_test_anon(folio)) {
			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
								  nr, dst_vma, src_vma))) {
				/* Drop the references taken above and retry. */
				folio_ref_sub(folio, nr);
				return -EAGAIN;
			}
			rss[MM_ANONPAGES] += nr;
			VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
		} else {
			folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
			rss[mm_counter_file(folio)] += nr;
		}
		__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
				    addr, nr);
		return nr;
	}

	folio_get(folio);
	if (folio_test_anon(folio)) {
		/*
		 * If this page may have been pinned by the parent process,
		 * copy the page immediately for the child so that we'll always
		 * guarantee the pinned page won't be randomly replaced in the
		 * future.
		 */
		if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
			/* Page may be pinned, we have to copy. */
			folio_put(folio);
			err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
						addr, rss, prealloc, page);
			/* A successful copy counts as one processed pte. */
			return err ? err : 1;
		}
		rss[MM_ANONPAGES]++;
		VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
	} else {
		folio_dup_file_rmap_pte(folio, page, dst_vma);
		rss[mm_counter_file(folio)]++;
	}

copy_pte:
	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
	return 1;
}
1184
1185static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
1186		struct vm_area_struct *vma, unsigned long addr, bool need_zero)
1187{
1188	struct folio *new_folio;
1189
1190	if (need_zero)
1191		new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
1192	else
1193		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
1194
1195	if (!new_folio)
1196		return NULL;
1197
1198	if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
1199		folio_put(new_folio);
1200		return NULL;
1201	}
1202	folio_throttle_swaprate(new_folio, GFP_KERNEL);
1203
1204	return new_folio;
1205}
1206
/*
 * Copy the ptes for [addr, end) below src_pmd into the page table below
 * dst_pmd.
 *
 * Both pte locks are held while copying.  The loop breaks out either to
 * reduce lock latency or because an entry needs work that cannot be done
 * under the locks (a swap count continuation, a preallocated folio); that
 * work is done with the locks dropped and the copy resumes from "again".
 *
 * Returns 0 on success (also when the source pte table cannot be mapped),
 * -ENOMEM if an allocation fails, or -EBUSY / -EHWPOISON as reported by the
 * per-pte copy helpers.
 */
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
	       unsigned long end)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	pmd_t dummy_pmdval;
	pte_t ptent;
	spinlock_t *src_ptl, *dst_ptl;
	int progress, max_nr, ret = 0;
	int rss[NR_MM_COUNTERS];
	softleaf_t entry = softleaf_mk_none();
	struct folio *prealloc = NULL;
	int nr;

again:
	progress = 0;
	init_rss_vec(rss);

	/*
	 * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
	 * error handling here, assume that exclusive mmap_lock on dst and src
	 * protects anon from unexpected THP transitions; with shmem and file
	 * protected by mmap_lock-less collapse skipping areas with anon_vma
	 * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
	 * can remove such assumptions later, but this is good enough for now.
	 */
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * We already hold the exclusive mmap_lock, the copy_pte_range() and
	 * retract_page_tables() are using vma->anon_vma to be exclusive, so
	 * the PTE page is stable, and there is no need to get pmdval and do
	 * pmd_same() check.
	 */
	src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval,
					   &src_ptl);
	if (!src_pte) {
		pte_unmap_unlock(dst_pte, dst_ptl);
		/* ret == 0 */
		goto out;
	}
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	/* Remember the mapped starts: the cursors advance, the unmap must not. */
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
	arch_enter_lazy_mmu_mode();

	do {
		nr = 1;

		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		ptent = ptep_get(src_pte);
		if (pte_none(ptent)) {
			progress++;
			continue;
		}
		if (unlikely(!pte_present(ptent))) {
			ret = copy_nonpresent_pte(dst_mm, src_mm,
						  dst_pte, src_pte,
						  dst_vma, src_vma,
						  addr, rss);
			if (ret == -EIO) {
				/* Keep the entry for the handling after the locks are dropped. */
				entry = softleaf_from_pte(ptep_get(src_pte));
				break;
			} else if (ret == -EBUSY) {
				break;
			} else if (!ret) {
				progress += 8;
				continue;
			}
			ptent = ptep_get(src_pte);
			VM_WARN_ON_ONCE(!pte_present(ptent));

			/*
			 * Device exclusive entry restored, continue by copying
			 * the now present pte.
			 */
			WARN_ON_ONCE(ret != -ENOENT);
		}
		/* copy_present_ptes() will clear `*prealloc' if consumed */
		max_nr = (end - addr) / PAGE_SIZE;
		ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
					ptent, addr, max_nr, rss, &prealloc);
		/*
		 * If we need a pre-allocated page for this pte, drop the
		 * locks, allocate, and try again.
		 * If copy failed due to hwpoison in source page, break out.
		 */
		if (unlikely(ret == -EAGAIN || ret == -EHWPOISON))
			break;
		if (unlikely(prealloc)) {
			/*
			 * pre-alloc page cannot be reused by next time so as
			 * to strictly follow mempolicy (e.g., alloc_page_vma()
			 * will allocate page according to address).  This
			 * could only happen if one pinned pte changed.
			 */
			folio_put(prealloc);
			prealloc = NULL;
		}
		/* A non-negative return is the number of ptes just copied. */
		nr = ret;
		progress += 8 * nr;
	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
		 addr != end);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_src_pte, src_ptl);
	add_mm_rss_vec(dst_mm, rss);
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
	cond_resched();

	if (ret == -EIO) {
		VM_WARN_ON_ONCE(!entry.val);
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
			ret = -ENOMEM;
			goto out;
		}
		entry.val = 0;
	} else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) {
		goto out;
	} else if (ret ==  -EAGAIN) {
		prealloc = folio_prealloc(src_mm, src_vma, addr, false);
		/* prealloc is NULL here, so there is nothing to release. */
		if (!prealloc)
			return -ENOMEM;
	} else if (ret < 0) {
		VM_WARN_ON_ONCE(1);
	}

	/* We've captured and resolved the error. Reset, try again. */
	ret = 0;

	if (addr != end)
		goto again;
out:
	/* Release a preallocated folio that ended up unused. */
	if (unlikely(prealloc))
		folio_put(prealloc);
	return ret;
}
1361
1362static inline int
1363copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1364	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1365	       unsigned long end)
1366{
1367	struct mm_struct *dst_mm = dst_vma->vm_mm;
1368	struct mm_struct *src_mm = src_vma->vm_mm;
1369	pmd_t *src_pmd, *dst_pmd;
1370	unsigned long next;
1371
1372	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
1373	if (!dst_pmd)
1374		return -ENOMEM;
1375	src_pmd = pmd_offset(src_pud, addr);
1376	do {
1377		next = pmd_addr_end(addr, end);
1378		if (pmd_is_huge(*src_pmd)) {
1379			int err;
1380
1381			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1382			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
1383					    addr, dst_vma, src_vma);
1384			if (err == -ENOMEM)
1385				return -ENOMEM;
1386			if (!err)
1387				continue;
1388			/* fall through */
1389		}
1390		if (pmd_none_or_clear_bad(src_pmd))
1391			continue;
1392		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1393				   addr, next))
1394			return -ENOMEM;
1395	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
1396	return 0;
1397}
1398
1399static inline int
1400copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1401	       p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1402	       unsigned long end)
1403{
1404	struct mm_struct *dst_mm = dst_vma->vm_mm;
1405	struct mm_struct *src_mm = src_vma->vm_mm;
1406	pud_t *src_pud, *dst_pud;
1407	unsigned long next;
1408
1409	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
1410	if (!dst_pud)
1411		return -ENOMEM;
1412	src_pud = pud_offset(src_p4d, addr);
1413	do {
1414		next = pud_addr_end(addr, end);
1415		if (pud_trans_huge(*src_pud)) {
1416			int err;
1417
1418			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1419			err = copy_huge_pud(dst_mm, src_mm,
1420					    dst_pud, src_pud, addr, src_vma);
1421			if (err == -ENOMEM)
1422				return -ENOMEM;
1423			if (!err)
1424				continue;
1425			/* fall through */
1426		}
1427		if (pud_none_or_clear_bad(src_pud))
1428			continue;
1429		if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1430				   addr, next))
1431			return -ENOMEM;
1432	} while (dst_pud++, src_pud++, addr = next, addr != end);
1433	return 0;
1434}
1435
1436static inline int
1437copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1438	       pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1439	       unsigned long end)
1440{
1441	struct mm_struct *dst_mm = dst_vma->vm_mm;
1442	p4d_t *src_p4d, *dst_p4d;
1443	unsigned long next;
1444
1445	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
1446	if (!dst_p4d)
1447		return -ENOMEM;
1448	src_p4d = p4d_offset(src_pgd, addr);
1449	do {
1450		next = p4d_addr_end(addr, end);
1451		if (p4d_none_or_clear_bad(src_p4d))
1452			continue;
1453		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1454				   addr, next))
1455			return -ENOMEM;
1456	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
1457	return 0;
1458}
1459
1460/*
1461 * Return true if the vma needs to copy the pgtable during this fork().  Return
1462 * false when we can speed up fork() by allowing lazy page faults later until
1463 * when the child accesses the memory range.
1464 */
1465static bool
1466vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1467{
1468	/*
1469	 * We check against dst_vma as while sane VMA flags will have been
1470	 * copied, VM_UFFD_WP may be set only on dst_vma.
1471	 */
1472	if (dst_vma->vm_flags & VM_COPY_ON_FORK)
1473		return true;
1474	/*
1475	 * The presence of an anon_vma indicates an anonymous VMA has page
1476	 * tables which naturally cannot be reconstituted on page fault.
1477	 */
1478	if (src_vma->anon_vma)
1479		return true;
1480
1481	/*
1482	 * Don't copy ptes where a page fault will fill them correctly.  Fork
1483	 * becomes much lighter when there are big shared or private readonly
1484	 * mappings. The tradeoff is that copy_page_range is more efficient
1485	 * than faulting.
1486	 */
1487	return false;
1488}
1489
/*
 * Copy the page tables covering src_vma into the mm of dst_vma.
 *
 * Returns 0 when vma_needs_copy() says nothing has to be copied or when the
 * copy succeeded, and -ENOMEM when copying any part of the range failed.
 * hugetlb vmas are handed to copy_hugetlb_page_range() and its result is
 * returned unchanged.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long addr = src_vma->vm_start;
	unsigned long end = src_vma->vm_end;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	struct mm_struct *src_mm = src_vma->vm_mm;
	struct mmu_notifier_range range;
	unsigned long next;
	bool is_cow;
	int ret;

	if (!vma_needs_copy(dst_vma, src_vma))
		return 0;

	if (is_vm_hugetlb_page(src_vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	is_cow = is_cow_mapping(src_vma->vm_flags);

	if (is_cow) {
		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
					0, src_mm, addr, end);
		mmu_notifier_invalidate_range_start(&range);
		/*
		 * Disabling preemption is not needed for the write side, as
		 * the read side doesn't spin, but goes to the mmap_lock.
		 *
		 * Use the raw variant of the seqcount_t write API to avoid
		 * lockdep complaining about preemptibility.
		 */
		vma_assert_write_locked(src_vma);
		raw_write_seqcount_begin(&src_mm->write_protect_seq);
	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		/* Any failure from the lower levels is reported as -ENOMEM. */
		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
					    addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	/* Close the seqcount and notifier sections in reverse order of opening. */
	if (is_cow) {
		raw_write_seqcount_end(&src_mm->write_protect_seq);
		mmu_notifier_invalidate_range_end(&range);
	}
	return ret;
}
1552
1553/* Whether we should zap all COWed (private) pages too */
1554static inline bool should_zap_cows(struct zap_details *details)
1555{
1556	/* By default, zap all pages */
1557	if (!details || details->reclaim_pt)
1558		return true;
1559
1560	/* Or, we zap COWed pages only if the caller wants to */
1561	return details->even_cows;
1562}
1563
1564/* Decides whether we should zap this folio with the folio pointer specified */
1565static inline bool should_zap_folio(struct zap_details *details,
1566				    struct folio *folio)
1567{
1568	/* If we can make a decision without *folio.. */
1569	if (should_zap_cows(details))
1570		return true;
1571
1572	/* Otherwise we should only zap non-anon folios */
1573	return !folio_test_anon(folio);
1574}
1575
1576static inline bool zap_drop_markers(struct zap_details *details)
1577{
1578	if (!details)
1579		return false;
1580
1581	return details->zap_flags & ZAP_FLAG_DROP_MARKER;
1582}
1583
1584/*
1585 * This function makes sure that we'll replace the none pte with an uffd-wp
1586 * swap special pte marker when necessary. Must be with the pgtable lock held.
1587 *
1588 * Returns true if uffd-wp ptes was installed, false otherwise.
1589 */
1590static inline bool
1591zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
1592			      unsigned long addr, pte_t *pte, int nr,
1593			      struct zap_details *details, pte_t pteval)
1594{
1595	bool was_installed = false;
1596
1597	if (!uffd_supports_wp_marker())
1598		return false;
1599
1600	/* Zap on anonymous always means dropping everything */
1601	if (vma_is_anonymous(vma))
1602		return false;
1603
1604	if (zap_drop_markers(details))
1605		return false;
1606
1607	for (;;) {
1608		/* the PFN in the PTE is irrelevant. */
1609		if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
1610			was_installed = true;
1611		if (--nr == 0)
1612			break;
1613		pte++;
1614		addr += PAGE_SIZE;
1615	}
1616
1617	return was_installed;
1618}
1619
/*
 * Zap nr present ptes starting at pte, all of which map pages of folio
 * starting at page.
 *
 * The ptes are cleared, the rss counters adjusted and the pages queued on
 * the mmu_gather.  *force_flush is set when the rmap removal is delayed for
 * a dirty non-anon folio or when __tlb_remove_folio_pages() returns true;
 * in the latter case *force_break is set as well.  *any_skipped is only
 * written when the pte carries uffd-wp state, and then receives the result
 * of zap_install_uffd_wp_if_needed().
 */
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
		struct vm_area_struct *vma, struct folio *folio,
		struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
		unsigned long addr, struct zap_details *details, int *rss,
		bool *force_flush, bool *force_break, bool *any_skipped)
{
	struct mm_struct *mm = tlb->mm;
	bool delay_rmap = false;

	if (!folio_test_anon(folio)) {
		/* The dirty/young bits of the cleared ptes are needed below. */
		ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
		if (pte_dirty(ptent)) {
			folio_mark_dirty(folio);
			if (tlb_delay_rmap(tlb)) {
				delay_rmap = true;
				*force_flush = true;
			}
		}
		if (pte_young(ptent) && likely(vma_has_recency(vma)))
			folio_mark_accessed(folio);
		rss[mm_counter(folio)] -= nr;
	} else {
		/* We don't need up-to-date accessed/dirty bits. */
		clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
		rss[MM_ANONPAGES] -= nr;
	}
	/* Checking a single PTE in a batch is sufficient. */
	arch_check_zapped_pte(vma, ptent);
	tlb_remove_tlb_entries(tlb, pte, nr, addr);
	if (unlikely(userfaultfd_pte_wp(vma, ptent)))
		*any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
							     nr, details, ptent);

	/* Unless delayed above, the rmap is removed right away. */
	if (!delay_rmap) {
		folio_remove_rmap_ptes(folio, page, nr, vma);

		if (unlikely(folio_mapcount(folio) < 0))
			print_bad_pte(vma, addr, ptent, page);
	}
	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
		*force_flush = true;
		*force_break = true;
	}
}
1664
/*
 * Zap or skip at least one present PTE, trying to batch-process subsequent
 * PTEs that map consecutive pages of the same folio.
 *
 * *any_skipped is set to true when the folio must not be zapped according
 * to should_zap_folio(); the pte is then left in place.
 *
 * Returns the number of processed (skipped or zapped) PTEs (at least 1).
 */
static inline int zap_present_ptes(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
		unsigned int max_nr, unsigned long addr,
		struct zap_details *details, int *rss, bool *force_flush,
		bool *force_break, bool *any_skipped)
{
	struct mm_struct *mm = tlb->mm;
	struct folio *folio;
	struct page *page;
	int nr;

	page = vm_normal_page(vma, addr, ptent);
	/* No normal page behind the pte: clear it, there is no folio to handle. */
	if (!page) {
		/* We don't need up-to-date accessed/dirty bits. */
		ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
		arch_check_zapped_pte(vma, ptent);
		tlb_remove_tlb_entry(tlb, pte, addr);
		if (userfaultfd_pte_wp(vma, ptent))
			*any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
						pte, 1, details, ptent);
		ksm_might_unmap_zero_page(mm, ptent);
		return 1;
	}

	folio = page_folio(page);
	if (unlikely(!should_zap_folio(details, folio))) {
		*any_skipped = true;
		return 1;
	}

	/*
	 * Make sure that the common "small folio" case is as fast as possible
	 * by keeping the batching logic separate.
	 */
	if (unlikely(folio_test_large(folio) && max_nr != 1)) {
		nr = folio_pte_batch(folio, pte, ptent, max_nr);
		zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
				       addr, details, rss, force_flush,
				       force_break, any_skipped);
		return nr;
	}
	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
			       details, rss, force_flush, force_break, any_skipped);
	return 1;
}
1716
/*
 * Zap or skip at least one non-present PTE.
 *
 * *any_skipped is set to true up front and stays true on every early return
 * that leaves the pte in place.  When the pte is cleared, *any_skipped
 * receives the result of zap_install_uffd_wp_if_needed() instead.
 *
 * Returns the number of processed PTEs: 1, or the size of the batch for
 * swap entries.
 */
static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
		unsigned int max_nr, unsigned long addr,
		struct zap_details *details, int *rss, bool *any_skipped)
{
	softleaf_t entry;
	int nr = 1;

	*any_skipped = true;
	entry = softleaf_from_pte(ptent);
	if (softleaf_is_device_private(entry) ||
	    softleaf_is_device_exclusive(entry)) {
		struct page *page = softleaf_to_page(entry);
		struct folio *folio = page_folio(page);

		if (unlikely(!should_zap_folio(details, folio)))
			return 1;
		/*
		 * Both device private/exclusive mappings should only
		 * work with anonymous pages so far, so we don't need to
		 * consider the uffd-wp bit when zapping. For more
		 * information, see zap_install_uffd_wp_if_needed().
		 */
		WARN_ON_ONCE(!vma_is_anonymous(vma));
		rss[mm_counter(folio)]--;
		folio_remove_rmap_pte(folio, page, vma);
		folio_put(folio);
	} else if (softleaf_is_swap(entry)) {
		/* Genuine swap entries, hence private anon pages */
		if (!should_zap_cows(details))
			return 1;

		nr = swap_pte_batch(pte, max_nr, ptent);
		rss[MM_SWAPENTS] -= nr;
		free_swap_and_cache_nr(entry, nr);
	} else if (softleaf_is_migration(entry)) {
		struct folio *folio = softleaf_to_folio(entry);

		if (!should_zap_folio(details, folio))
			return 1;
		rss[mm_counter(folio)]--;
	} else if (softleaf_is_uffd_wp_marker(entry)) {
		/*
		 * For anon: always drop the marker; for file: only
		 * drop the marker if explicitly requested.
		 */
		if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
			return 1;
	} else if (softleaf_is_guard_marker(entry)) {
		/*
		 * Ordinary zapping should not remove guard PTE
		 * markers. Only do so if we should remove PTE markers
		 * in general.
		 */
		if (!zap_drop_markers(details))
			return 1;
	} else if (softleaf_is_hwpoison(entry) ||
		   softleaf_is_poison_marker(entry)) {
		if (!should_zap_cows(details))
			return 1;
	} else {
		/* We should have covered all the swap entry types */
		pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
		WARN_ON_ONCE(1);
	}
	/* Every path that reaches this point clears the nr ptes. */
	clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
	*any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);

	return nr;
}
1787
1788static inline int do_zap_pte_range(struct mmu_gather *tlb,
1789				   struct vm_area_struct *vma, pte_t *pte,
1790				   unsigned long addr, unsigned long end,
1791				   struct zap_details *details, int *rss,
1792				   bool *force_flush, bool *force_break,
1793				   bool *any_skipped)
1794{
1795	pte_t ptent = ptep_get(pte);
1796	int max_nr = (end - addr) / PAGE_SIZE;
1797	int nr = 0;
1798
1799	/* Skip all consecutive none ptes */
1800	if (pte_none(ptent)) {
1801		for (nr = 1; nr < max_nr; nr++) {
1802			ptent = ptep_get(pte + nr);
1803			if (!pte_none(ptent))
1804				break;
1805		}
1806		max_nr -= nr;
1807		if (!max_nr)
1808			return nr;
1809		pte += nr;
1810		addr += nr * PAGE_SIZE;
1811	}
1812
1813	if (pte_present(ptent))
1814		nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
1815				       details, rss, force_flush, force_break,
1816				       any_skipped);
1817	else
1818		nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
1819					  details, rss, any_skipped);
1820
1821	return nr;
1822}
1823
/*
 * Zap the ptes for [addr, end) below pmd.
 *
 * The pte lock is dropped and the loop restarted from "retry" whenever the
 * walk stops early (need_resched() or a forced break).  If page table
 * reclaim is enabled for the range and nothing was skipped, the pte page is
 * freed at the end.
 *
 * Returns the address up to which the range was processed: end, or the
 * current addr if the pte table could not be mapped.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	bool force_flush = false, force_break = false;
	struct mm_struct *mm = tlb->mm;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	pmd_t pmdval;
	unsigned long start = addr;
	bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
	bool direct_reclaim = true;
	int nr;

retry:
	tlb_change_page_size(tlb, PAGE_SIZE);
	init_rss_vec(rss);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return addr;

	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	do {
		bool any_skipped = false;

		if (need_resched()) {
			direct_reclaim = false;
			break;
		}

		nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
				      &force_flush, &force_break, &any_skipped);
		/* A pte left behind means the pte page cannot be reclaimed. */
		if (any_skipped)
			can_reclaim_pt = false;
		if (unlikely(force_break)) {
			/* The loop increment is bypassed by the break: advance here. */
			addr += nr * PAGE_SIZE;
			direct_reclaim = false;
			break;
		}
	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

	/*
	 * Fast path: try to hold the pmd lock and unmap the PTE page.
	 *
	 * If the pte lock was released midway (retry case), or if the attempt
	 * to hold the pmd lock failed, then we need to recheck all pte entries
	 * to ensure they are still none, thereby preventing the pte entries
	 * from being repopulated by another thread.
	 */
	if (can_reclaim_pt && direct_reclaim && addr == end)
		direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();

	/* Do the actual TLB flush before dropping ptl */
	if (force_flush) {
		tlb_flush_mmu_tlbonly(tlb);
		tlb_flush_rmaps(tlb, vma);
	}
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * If we forced a TLB flush (either due to running out of
	 * batch buffers or because we needed to flush dirty TLB
	 * entries before releasing the ptl), free the batched
	 * memory too. Come back again if we didn't do everything.
	 */
	if (force_flush)
		tlb_flush_mmu(tlb);

	if (addr != end) {
		cond_resched();
		force_flush = false;
		force_break = false;
		goto retry;
	}

	if (can_reclaim_pt) {
		/* pmdval was filled in by try_get_and_clear_pmd() on the fast path. */
		if (direct_reclaim)
			free_pte(mm, start, tlb, pmdval);
		else
			try_to_free_pte(mm, pmd, start, tlb);
	}

	return addr;
}
1915
/*
 * Zap the pmd entries for [addr, end) below pud.
 *
 * Huge pmds are either zapped whole or split first when the range covers
 * only part of them; everything else is handed to zap_pte_range().
 *
 * Returns the address up to which the range was processed.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_is_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				__split_huge_pmd(vma, pmd, addr, false);
			else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
				addr = next;
				continue;
			}
			/* fall through */
		} else if (details && details->single_folio &&
			   folio_test_pmd_mappable(details->single_folio) &&
			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
			/*
			 * Take and drop THP pmd lock so that we cannot return
			 * prematurely, while zap_huge_pmd() has cleared *pmd,
			 * but not yet decremented compound_mapcount().
			 */
			spin_unlock(ptl);
		}
		if (pmd_none(*pmd)) {
			addr = next;
			continue;
		}
		addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
		/*
		 * zap_pte_range() stopped short of next: cancel out the
		 * pmd++ of the loop so that the same pmd is visited again.
		 */
		if (addr != next)
			pmd--;
	} while (pmd++, cond_resched(), addr != end);

	return addr;
}
1957
/*
 * Zap the portion of [addr, end) that is covered by the PUD entries below
 * @p4d.  Huge PUD entries are split when only partly covered, or zapped
 * whole through zap_huge_pud(); other populated entries are descended into
 * with zap_pmd_range().
 *
 * Returns the address the walk ended at, which equals @end when the loop
 * terminates.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		/* End of this PUD entry's span, clamped to @end. */
		next = pud_addr_end(addr, end);
		if (pud_trans_huge(*pud)) {
			if (next - addr != HPAGE_PUD_SIZE)
				split_huge_pud(vma, pud, addr);
			else if (zap_huge_pud(tlb, vma, pud, addr))
				goto next;
			/* fall through */
		}
		/*
		 * 'continue' in a do/while jumps straight to the loop
		 * condition, so this path skips the cond_resched() below.
		 */
		if (pud_none_or_clear_bad(pud))
			continue;
		/* The lower level reports how far it got; resume from there. */
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
		cond_resched();
	} while (pud++, addr = next, addr != end);

	return addr;
}
1985
1986static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1987				struct vm_area_struct *vma, pgd_t *pgd,
1988				unsigned long addr, unsigned long end,
1989				struct zap_details *details)
1990{
1991	p4d_t *p4d;
1992	unsigned long next;
1993
1994	p4d = p4d_offset(pgd, addr);
1995	do {
1996		next = p4d_addr_end(addr, end);
1997		if (p4d_none_or_clear_bad(p4d))
1998			continue;
1999		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
2000	} while (p4d++, addr = next, addr != end);
2001
2002	return addr;
2003}
2004
2005void unmap_page_range(struct mmu_gather *tlb,
2006			     struct vm_area_struct *vma,
2007			     unsigned long addr, unsigned long end,
2008			     struct zap_details *details)
2009{
2010	pgd_t *pgd;
2011	unsigned long next;
2012
2013	BUG_ON(addr >= end);
2014	tlb_start_vma(tlb, vma);
2015	pgd = pgd_offset(vma->vm_mm, addr);
2016	do {
2017		next = pgd_addr_end(addr, end);
2018		if (pgd_none_or_clear_bad(pgd))
2019			continue;
2020		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
2021	} while (pgd++, addr = next, addr != end);
2022	tlb_end_vma(tlb, vma);
2023}
2024
2025
2026static void unmap_single_vma(struct mmu_gather *tlb,
2027		struct vm_area_struct *vma, unsigned long start_addr,
2028		unsigned long end_addr, struct zap_details *details)
2029{
2030	unsigned long start = max(vma->vm_start, start_addr);
2031	unsigned long end;
2032
2033	if (start >= vma->vm_end)
2034		return;
2035	end = min(vma->vm_end, end_addr);
2036	if (end <= vma->vm_start)
2037		return;
2038
2039	if (vma->vm_file)
2040		uprobe_munmap(vma, start, end);
2041
2042	if (start != end) {
2043		if (unlikely(is_vm_hugetlb_page(vma))) {
2044			/*
2045			 * It is undesirable to test vma->vm_file as it
2046			 * should be non-null for valid hugetlb area.
2047			 * However, vm_file will be NULL in the error
2048			 * cleanup path of mmap_region. When
2049			 * hugetlbfs ->mmap method fails,
2050			 * mmap_region() nullifies vma->vm_file
2051			 * before calling this function to clean up.
2052			 * Since no pte has actually been setup, it is
2053			 * safe to do nothing in this case.
2054			 */
2055			if (vma->vm_file) {
2056				zap_flags_t zap_flags = details ?
2057				    details->zap_flags : 0;
2058				__unmap_hugepage_range(tlb, vma, start, end,
2059							     NULL, zap_flags);
2060			}
2061		} else
2062			unmap_page_range(tlb, vma, start, end, details);
2063	}
2064}
2065
2066/**
2067 * unmap_vmas - unmap a range of memory covered by a list of vma's
2068 * @tlb: address of the caller's struct mmu_gather
2069 * @mas: the maple state
2070 * @vma: the starting vma
2071 * @start_addr: virtual address at which to start unmapping
2072 * @end_addr: virtual address at which to end unmapping
2073 * @tree_end: The maximum index to check
2074 *
2075 * Unmap all pages in the vma list.
2076 *
2077 * Only addresses between `start' and `end' will be unmapped.
2078 *
2079 * The VMA list must be sorted in ascending virtual address order.
2080 *
2081 * unmap_vmas() assumes that the caller will flush the whole unmapped address
2082 * range after unmap_vmas() returns.  So the only responsibility here is to
2083 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
2084 * drops the lock and schedules.
2085 */
2086void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
2087		struct vm_area_struct *vma, unsigned long start_addr,
2088		unsigned long end_addr, unsigned long tree_end)
2089{
2090	struct mmu_notifier_range range;
2091	struct zap_details details = {
2092		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
2093		/* Careful - we need to zap private pages too! */
2094		.even_cows = true,
2095	};
2096
2097	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
2098				start_addr, end_addr);
2099	mmu_notifier_invalidate_range_start(&range);
2100	do {
2101		unsigned long start = start_addr;
2102		unsigned long end = end_addr;
2103		hugetlb_zap_begin(vma, &start, &end);
2104		unmap_single_vma(tlb, vma, start, end, &details);
2105		hugetlb_zap_end(vma, &details);
2106		vma = mas_find(mas, tree_end - 1);
2107	} while (vma && likely(!xa_is_zero(vma)));
2108	mmu_notifier_invalidate_range_end(&range);
2109}
2110
2111/**
2112 * zap_page_range_single_batched - remove user pages in a given range
2113 * @tlb: pointer to the caller's struct mmu_gather
2114 * @vma: vm_area_struct holding the applicable pages
2115 * @address: starting address of pages to remove
2116 * @size: number of bytes to remove
2117 * @details: details of shared cache invalidation
2118 *
2119 * @tlb shouldn't be NULL.  The range must fit into one VMA.  If @vma is for
2120 * hugetlb, @tlb is flushed and re-initialized by this function.
2121 */
2122void zap_page_range_single_batched(struct mmu_gather *tlb,
2123		struct vm_area_struct *vma, unsigned long address,
2124		unsigned long size, struct zap_details *details)
2125{
2126	const unsigned long end = address + size;
2127	struct mmu_notifier_range range;
2128
2129	VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);
2130
2131	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2132				address, end);
2133	hugetlb_zap_begin(vma, &range.start, &range.end);
2134	update_hiwater_rss(vma->vm_mm);
2135	mmu_notifier_invalidate_range_start(&range);
2136	/*
2137	 * unmap 'address-end' not 'range.start-range.end' as range
2138	 * could have been expanded for hugetlb pmd sharing.
2139	 */
2140	unmap_single_vma(tlb, vma, address, end, details);
2141	mmu_notifier_invalidate_range_end(&range);
2142	if (is_vm_hugetlb_page(vma)) {
2143		/*
2144		 * flush tlb and free resources before hugetlb_zap_end(), to
2145		 * avoid concurrent page faults' allocation failure.
2146		 */
2147		tlb_finish_mmu(tlb);
2148		hugetlb_zap_end(vma, details);
2149		tlb_gather_mmu(tlb, vma->vm_mm);
2150	}
2151}
2152
2153/**
2154 * zap_page_range_single - remove user pages in a given range
2155 * @vma: vm_area_struct holding the applicable pages
2156 * @address: starting address of pages to zap
2157 * @size: number of bytes to zap
2158 * @details: details of shared cache invalidation
2159 *
2160 * The range must fit into one VMA.
2161 */
2162void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
2163		unsigned long size, struct zap_details *details)
2164{
2165	struct mmu_gather tlb;
2166
2167	tlb_gather_mmu(&tlb, vma->vm_mm);
2168	zap_page_range_single_batched(&tlb, vma, address, size, details);
2169	tlb_finish_mmu(&tlb);
2170}
2171
2172/**
2173 * zap_vma_ptes - remove ptes mapping the vma
2174 * @vma: vm_area_struct holding ptes to be zapped
2175 * @address: starting address of pages to zap
2176 * @size: number of bytes to zap
2177 *
2178 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
2179 *
2180 * The entire address range must be fully contained within the vma.
2181 *
2182 */
2183void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
2184		unsigned long size)
2185{
2186	if (!range_in_vma(vma, address, address + size) ||
2187	    		!(vma->vm_flags & VM_PFNMAP))
2188		return;
2189
2190	zap_page_range_single(vma, address, size, NULL);
2191}
2192EXPORT_SYMBOL_GPL(zap_vma_ptes);
2193
2194static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
2195{
2196	pgd_t *pgd;
2197	p4d_t *p4d;
2198	pud_t *pud;
2199	pmd_t *pmd;
2200
2201	pgd = pgd_offset(mm, addr);
2202	p4d = p4d_alloc(mm, pgd, addr);
2203	if (!p4d)
2204		return NULL;
2205	pud = pud_alloc(mm, p4d, addr);
2206	if (!pud)
2207		return NULL;
2208	pmd = pmd_alloc(mm, pud, addr);
2209	if (!pmd)
2210		return NULL;
2211
2212	VM_BUG_ON(pmd_trans_huge(*pmd));
2213	return pmd;
2214}
2215
2216pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
2217			spinlock_t **ptl)
2218{
2219	pmd_t *pmd = walk_to_pmd(mm, addr);
2220
2221	if (!pmd)
2222		return NULL;
2223	return pte_alloc_map_lock(mm, pmd, addr, ptl);
2224}
2225
2226static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
2227{
2228	VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);
2229	/*
2230	 * Whoever wants to forbid the zeropage after some zeropages
2231	 * might already have been mapped has to scan the page tables and
2232	 * bail out on any zeropages. Zeropages in COW mappings can
2233	 * be unshared using FAULT_FLAG_UNSHARE faults.
2234	 */
2235	if (mm_forbids_zeropage(vma->vm_mm))
2236		return false;
2237	/* zeropages in COW mappings are common and unproblematic. */
2238	if (is_cow_mapping(vma->vm_flags))
2239		return true;
2240	/* Mappings that do not allow for writable PTEs are unproblematic. */
2241	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
2242		return true;
2243	/*
2244	 * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could
2245	 * find the shared zeropage and longterm-pin it, which would
2246	 * be problematic as soon as the zeropage gets replaced by a different
2247	 * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would
2248	 * now differ to what GUP looked up. FSDAX is incompatible to
2249	 * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see
2250	 * check_vma_flags).
2251	 */
2252	return vma->vm_ops && vma->vm_ops->pfn_mkwrite &&
2253	       (vma_is_fsdax(vma) || vma->vm_flags & VM_IO);
2254}
2255
2256static int validate_page_before_insert(struct vm_area_struct *vma,
2257				       struct page *page)
2258{
2259	struct folio *folio = page_folio(page);
2260
2261	if (!folio_ref_count(folio))
2262		return -EINVAL;
2263	if (unlikely(is_zero_folio(folio))) {
2264		if (!vm_mixed_zeropage_allowed(vma))
2265			return -EINVAL;
2266		return 0;
2267	}
2268	if (folio_test_anon(folio) || page_has_type(page))
2269		return -EINVAL;
2270	flush_dcache_folio(folio);
2271	return 0;
2272}
2273
2274static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
2275				unsigned long addr, struct page *page,
2276				pgprot_t prot, bool mkwrite)
2277{
2278	struct folio *folio = page_folio(page);
2279	pte_t pteval = ptep_get(pte);
2280
2281	if (!pte_none(pteval)) {
2282		if (!mkwrite)
2283			return -EBUSY;
2284
2285		/* see insert_pfn(). */
2286		if (pte_pfn(pteval) != page_to_pfn(page)) {
2287			WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
2288			return -EFAULT;
2289		}
2290		pteval = maybe_mkwrite(pteval, vma);
2291		pteval = pte_mkyoung(pteval);
2292		if (ptep_set_access_flags(vma, addr, pte, pteval, 1))
2293			update_mmu_cache(vma, addr, pte);
2294		return 0;
2295	}
2296
2297	/* Ok, finally just insert the thing.. */
2298	pteval = mk_pte(page, prot);
2299	if (unlikely(is_zero_folio(folio))) {
2300		pteval = pte_mkspecial(pteval);
2301	} else {
2302		folio_get(folio);
2303		pteval = mk_pte(page, prot);
2304		if (mkwrite) {
2305			pteval = pte_mkyoung(pteval);
2306			pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
2307		}
2308		inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
2309		folio_add_file_rmap_pte(folio, page, vma);
2310	}
2311	set_pte_at(vma->vm_mm, addr, pte, pteval);
2312	return 0;
2313}
2314
2315static int insert_page(struct vm_area_struct *vma, unsigned long addr,
2316			struct page *page, pgprot_t prot, bool mkwrite)
2317{
2318	int retval;
2319	pte_t *pte;
2320	spinlock_t *ptl;
2321
2322	retval = validate_page_before_insert(vma, page);
2323	if (retval)
2324		goto out;
2325	retval = -ENOMEM;
2326	pte = get_locked_pte(vma->vm_mm, addr, &ptl);
2327	if (!pte)
2328		goto out;
2329	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
2330					mkwrite);
2331	pte_unmap_unlock(pte, ptl);
2332out:
2333	return retval;
2334}
2335
2336static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
2337			unsigned long addr, struct page *page, pgprot_t prot)
2338{
2339	int err;
2340
2341	err = validate_page_before_insert(vma, page);
2342	if (err)
2343		return err;
2344	return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
2345}
2346
/*
 * insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop.
 *
 * Maps the *@num pages in @pages at consecutive page-sized steps starting at
 * @addr.  On return *@num holds the number of pages that were not mapped.
 *
 * Returns 0 when everything was mapped, -EFAULT when walk_to_pmd() or
 * pte_offset_map_lock() fails, -ENOMEM when pte_alloc() fails, or the error
 * reported for the first page that could not be inserted.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num, pgprot_t prot)
{
	pmd_t *pmd = NULL;
	pte_t *start_pte, *pte;
	spinlock_t *pte_lock;
	struct mm_struct *const mm = vma->vm_mm;
	unsigned long curr_page_idx = 0;
	unsigned long remaining_pages_total = *num;
	unsigned long pages_to_write_in_pmd;
	int ret;
more:
	/* One pass of this outer loop handles the pages under a single pmd. */
	ret = -EFAULT;
	pmd = walk_to_pmd(mm, addr);
	if (!pmd)
		goto out;

	/* Limit this pass to the PTE slots from addr's index to the table end. */
	pages_to_write_in_pmd = min_t(unsigned long,
		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

	/* Allocate the PTE if necessary; takes PMD lock once only. */
	ret = -ENOMEM;
	if (pte_alloc(mm, pmd))
		goto out;

	while (pages_to_write_in_pmd) {
		int pte_idx = 0;
		/* At most 8 pages are inserted per hold of the pte lock. */
		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
		if (!start_pte) {
			ret = -EFAULT;
			goto out;
		}
		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
			int err = insert_page_in_batch_locked(vma, pte,
				addr, pages[curr_page_idx], prot);
			if (unlikely(err)) {
				pte_unmap_unlock(start_pte, pte_lock);
				ret = err;
				/*
				 * Only the first pte_idx pages of this batch
				 * went in; account for exactly those.
				 */
				remaining_pages_total -= pte_idx;
				goto out;
			}
			addr += PAGE_SIZE;
			++curr_page_idx;
		}
		pte_unmap_unlock(start_pte, pte_lock);
		pages_to_write_in_pmd -= batch_size;
		remaining_pages_total -= batch_size;
	}
	if (remaining_pages_total)
		goto more;
	ret = 0;
out:
	/* Report the number of unmapped pages on success and failure alike. */
	*num = remaining_pages_total;
	return ret;
}
2407
2408/**
2409 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
2410 * @vma: user vma to map to
2411 * @addr: target start user address of these pages
2412 * @pages: source kernel pages
2413 * @num: in: number of pages to map. out: number of pages that were *not*
2414 * mapped. (0 means all pages were successfully mapped).
2415 *
2416 * Preferred over vm_insert_page() when inserting multiple pages.
2417 *
2418 * In case of error, we may have mapped a subset of the provided
2419 * pages. It is the caller's responsibility to account for this case.
2420 *
2421 * The same restrictions apply as in vm_insert_page().
2422 */
2423int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
2424			struct page **pages, unsigned long *num)
2425{
2426	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
2427
2428	if (addr < vma->vm_start || end_addr >= vma->vm_end)
2429		return -EFAULT;
2430	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2431		BUG_ON(mmap_read_trylock(vma->vm_mm));
2432		BUG_ON(vma->vm_flags & VM_PFNMAP);
2433		vm_flags_set(vma, VM_MIXEDMAP);
2434	}
2435	/* Defer page refcount checking till we're about to map that page. */
2436	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
2437}
2438EXPORT_SYMBOL(vm_insert_pages);
2439
2440/**
2441 * vm_insert_page - insert single page into user vma
2442 * @vma: user vma to map to
2443 * @addr: target user address of this page
2444 * @page: source kernel page
2445 *
2446 * This allows drivers to insert individual pages they've allocated
2447 * into a user vma. The zeropage is supported in some VMAs,
2448 * see vm_mixed_zeropage_allowed().
2449 *
2450 * The page has to be a nice clean _individual_ kernel allocation.
2451 * If you allocate a compound page, you need to have marked it as
2452 * such (__GFP_COMP), or manually just split the page up yourself
2453 * (see split_page()).
2454 *
2455 * NOTE! Traditionally this was done with "remap_pfn_range()" which
2456 * took an arbitrary page protection parameter. This doesn't allow
2457 * that. Your vma protection will have to be set up correctly, which
2458 * means that if you want a shared writable mapping, you'd better
2459 * ask for a shared writable mapping!
2460 *
2461 * The page does not need to be reserved.
2462 *
2463 * Usually this function is called from f_op->mmap() handler
2464 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
2465 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2466 * function from other places, for example from page-fault handler.
2467 *
2468 * Return: %0 on success, negative error code otherwise.
2469 */
2470int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2471			struct page *page)
2472{
2473	if (addr < vma->vm_start || addr >= vma->vm_end)
2474		return -EFAULT;
2475	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2476		BUG_ON(mmap_read_trylock(vma->vm_mm));
2477		BUG_ON(vma->vm_flags & VM_PFNMAP);
2478		vm_flags_set(vma, VM_MIXEDMAP);
2479	}
2480	return insert_page(vma, addr, page, vma->vm_page_prot, false);
2481}
2482EXPORT_SYMBOL(vm_insert_page);
2483
2484/*
2485 * __vm_map_pages - maps range of kernel pages into user vma
2486 * @vma: user vma to map to
2487 * @pages: pointer to array of source kernel pages
2488 * @num: number of pages in page array
2489 * @offset: user's requested vm_pgoff
2490 *
2491 * This allows drivers to map range of kernel pages into a user vma.
2492 * The zeropage is supported in some VMAs, see
2493 * vm_mixed_zeropage_allowed().
2494 *
2495 * Return: 0 on success and error code otherwise.
2496 */
2497static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
2498				unsigned long num, unsigned long offset)
2499{
2500	unsigned long count = vma_pages(vma);
2501	unsigned long uaddr = vma->vm_start;
2502	int ret, i;
2503
2504	/* Fail if the user requested offset is beyond the end of the object */
2505	if (offset >= num)
2506		return -ENXIO;
2507
2508	/* Fail if the user requested size exceeds available object size */
2509	if (count > num - offset)
2510		return -ENXIO;
2511
2512	for (i = 0; i < count; i++) {
2513		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
2514		if (ret < 0)
2515			return ret;
2516		uaddr += PAGE_SIZE;
2517	}
2518
2519	return 0;
2520}
2521
2522/**
2523 * vm_map_pages - maps range of kernel pages starts with non zero offset
2524 * @vma: user vma to map to
2525 * @pages: pointer to array of source kernel pages
2526 * @num: number of pages in page array
2527 *
2528 * Maps an object consisting of @num pages, catering for the user's
2529 * requested vm_pgoff
2530 *
2531 * If we fail to insert any page into the vma, the function will return
2532 * immediately leaving any previously inserted pages present.  Callers
2533 * from the mmap handler may immediately return the error as their caller
2534 * will destroy the vma, removing any successfully inserted pages. Other
2535 * callers should make their own arrangements for calling unmap_region().
2536 *
2537 * Context: Process context. Called by mmap handlers.
2538 * Return: 0 on success and error code otherwise.
2539 */
2540int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
2541				unsigned long num)
2542{
2543	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
2544}
2545EXPORT_SYMBOL(vm_map_pages);
2546
/**
 * vm_map_pages_zero - map range of kernel pages starts with zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0 instead of using @vma->vm_pgoff. This function is intended for the
 * drivers that did not consider vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	const unsigned long first_page = 0;

	return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages_zero);
2566
2567static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2568			unsigned long pfn, pgprot_t prot, bool mkwrite)
2569{
2570	struct mm_struct *mm = vma->vm_mm;
2571	pte_t *pte, entry;
2572	spinlock_t *ptl;
2573
2574	pte = get_locked_pte(mm, addr, &ptl);
2575	if (!pte)
2576		return VM_FAULT_OOM;
2577	entry = ptep_get(pte);
2578	if (!pte_none(entry)) {
2579		if (mkwrite) {
2580			/*
2581			 * For read faults on private mappings the PFN passed
2582			 * in may not match the PFN we have mapped if the
2583			 * mapped PFN is a writeable COW page.  In the mkwrite
2584			 * case we are creating a writable PTE for a shared
2585			 * mapping and we expect the PFNs to match. If they
2586			 * don't match, we are likely racing with block
2587			 * allocation and mapping invalidation so just skip the
2588			 * update.
2589			 */
2590			if (pte_pfn(entry) != pfn) {
2591				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
2592				goto out_unlock;
2593			}
2594			entry = pte_mkyoung(entry);
2595			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2596			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
2597				update_mmu_cache(vma, addr, pte);
2598		}
2599		goto out_unlock;
2600	}
2601
2602	/* Ok, finally just insert the thing.. */
2603	entry = pte_mkspecial(pfn_pte(pfn, prot));
2604
2605	if (mkwrite) {
2606		entry = pte_mkyoung(entry);
2607		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2608	}
2609
2610	set_pte_at(mm, addr, pte, entry);
2611	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
2612
2613out_unlock:
2614	pte_unmap_unlock(pte, ptl);
2615	return VM_FAULT_NOPAGE;
2616}
2617
2618/**
2619 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
2620 * @vma: user vma to map to
2621 * @addr: target user address of this page
2622 * @pfn: source kernel pfn
2623 * @pgprot: pgprot flags for the inserted page
2624 *
2625 * This is exactly like vmf_insert_pfn(), except that it allows drivers
2626 * to override pgprot on a per-page basis.
2627 *
2628 * This only makes sense for IO mappings, and it makes no sense for
2629 * COW mappings.  In general, using multiple vmas is preferable;
2630 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
2631 * impractical.
2632 *
2633 * pgprot typically only differs from @vma->vm_page_prot when drivers set
2634 * caching- and encryption bits different than those of @vma->vm_page_prot,
2635 * because the caching- or encryption mode may not be known at mmap() time.
2636 *
2637 * This is ok as long as @vma->vm_page_prot is not used by the core vm
2638 * to set caching and encryption bits for those vmas (except for COW pages).
2639 * This is ensured by core vm only modifying these page table entries using
2640 * functions that don't touch caching- or encryption bits, using pte_modify()
2641 * if needed. (See for example mprotect()).
2642 *
2643 * Also when new page-table entries are created, this is only done using the
2644 * fault() callback, and never using the value of vma->vm_page_prot,
2645 * except for page-table entries that point to anonymous pages as the result
2646 * of COW.
2647 *
2648 * Context: Process context.  May allocate using %GFP_KERNEL.
2649 * Return: vm_fault_t value.
2650 */
2651vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
2652			unsigned long pfn, pgprot_t pgprot)
2653{
2654	/*
2655	 * Technically, architectures with pte_special can avoid all these
2656	 * restrictions (same for remap_pfn_range).  However we would like
2657	 * consistency in testing and feature parity among all, so we should
2658	 * try to keep these invariants in place for everybody.
2659	 */
2660	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2661	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
2662						(VM_PFNMAP|VM_MIXEDMAP));
2663	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2664	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2665
2666	if (addr < vma->vm_start || addr >= vma->vm_end)
2667		return VM_FAULT_SIGBUS;
2668
2669	if (!pfn_modify_allowed(pfn, pgprot))
2670		return VM_FAULT_SIGBUS;
2671
2672	pfnmap_setup_cachemode_pfn(pfn, &pgprot);
2673
2674	return insert_pfn(vma, addr, pfn, pgprot, false);
2675}
2676EXPORT_SYMBOL(vmf_insert_pfn_prot);
2677
2678/**
2679 * vmf_insert_pfn - insert single pfn into user vma
2680 * @vma: user vma to map to
2681 * @addr: target user address of this page
2682 * @pfn: source kernel pfn
2683 *
2684 * Similar to vm_insert_page, this allows drivers to insert individual pages
2685 * they've allocated into a user vma. Same comments apply.
2686 *
2687 * This function should only be called from a vm_ops->fault handler, and
2688 * in that case the handler should return the result of this function.
2689 *
2690 * vma cannot be a COW mapping.
2691 *
2692 * As this is called only for pages that do not currently exist, we
2693 * do not need to flush old virtual caches or the TLB.
2694 *
2695 * Context: Process context.  May allocate using %GFP_KERNEL.
2696 * Return: vm_fault_t value.
2697 */
2698vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2699			unsigned long pfn)
2700{
2701	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2702}
2703EXPORT_SYMBOL(vmf_insert_pfn);
2704
2705static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn,
2706			bool mkwrite)
2707{
2708	if (unlikely(is_zero_pfn(pfn)) &&
2709	    (mkwrite || !vm_mixed_zeropage_allowed(vma)))
2710		return false;
2711	/* these checks mirror the abort conditions in vm_normal_page */
2712	if (vma->vm_flags & VM_MIXEDMAP)
2713		return true;
2714	if (is_zero_pfn(pfn))
2715		return true;
2716	return false;
2717}
2718
2719static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2720		unsigned long addr, unsigned long pfn, bool mkwrite)
2721{
2722	pgprot_t pgprot = vma->vm_page_prot;
2723	int err;
2724
2725	if (!vm_mixed_ok(vma, pfn, mkwrite))
2726		return VM_FAULT_SIGBUS;
2727
2728	if (addr < vma->vm_start || addr >= vma->vm_end)
2729		return VM_FAULT_SIGBUS;
2730
2731	pfnmap_setup_cachemode_pfn(pfn, &pgprot);
2732
2733	if (!pfn_modify_allowed(pfn, pgprot))
2734		return VM_FAULT_SIGBUS;
2735
2736	/*
2737	 * If we don't have pte special, then we have to use the pfn_valid()
2738	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
2739	 * refcount the page if pfn_valid is true (hence insert_page rather
2740	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
2741	 * without pte special, it would there be refcounted as a normal page.
2742	 */
2743	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
2744		struct page *page;
2745
2746		/*
2747		 * At this point we are committed to insert_page()
2748		 * regardless of whether the caller specified flags that
2749		 * result in pfn_t_has_page() == false.
2750		 */
2751		page = pfn_to_page(pfn);
2752		err = insert_page(vma, addr, page, pgprot, mkwrite);
2753	} else {
2754		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2755	}
2756
2757	if (err == -ENOMEM)
2758		return VM_FAULT_OOM;
2759	if (err < 0 && err != -EBUSY)
2760		return VM_FAULT_SIGBUS;
2761
2762	return VM_FAULT_NOPAGE;
2763}
2764
2765vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
2766			bool write)
2767{
2768	pgprot_t pgprot = vmf->vma->vm_page_prot;
2769	unsigned long addr = vmf->address;
2770	int err;
2771
2772	if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end)
2773		return VM_FAULT_SIGBUS;
2774
2775	err = insert_page(vmf->vma, addr, page, pgprot, write);
2776	if (err == -ENOMEM)
2777		return VM_FAULT_OOM;
2778	if (err < 0 && err != -EBUSY)
2779		return VM_FAULT_SIGBUS;
2780
2781	return VM_FAULT_NOPAGE;
2782}
2783EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
2784
2785vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2786		unsigned long pfn)
2787{
2788	return __vm_insert_mixed(vma, addr, pfn, false);
2789}
2790EXPORT_SYMBOL(vmf_insert_mixed);
2791
2792/*
2793 *  If the insertion of PTE failed because someone else already added a
2794 *  different entry in the mean time, we treat that as success as we assume
2795 *  the same entry was actually inserted.
2796 */
2797vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2798		unsigned long addr, unsigned long pfn)
2799{
2800	return __vm_insert_mixed(vma, addr, pfn, true);
2801}
2802
2803/*
2804 * maps a range of physical memory into the requested pages. the old
2805 * mappings are removed. any references to nonexistent pages results
2806 * in null mappings (currently treated as "copy-on-access")
2807 */
2808static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
2809			unsigned long addr, unsigned long end,
2810			unsigned long pfn, pgprot_t prot)
2811{
2812	pte_t *pte, *mapped_pte;
2813	spinlock_t *ptl;
2814	int err = 0;
2815
2816	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2817	if (!pte)
2818		return -ENOMEM;
2819	arch_enter_lazy_mmu_mode();
2820	do {
2821		BUG_ON(!pte_none(ptep_get(pte)));
2822		if (!pfn_modify_allowed(pfn, prot)) {
2823			err = -EACCES;
2824			break;
2825		}
2826		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2827		pfn++;
2828	} while (pte++, addr += PAGE_SIZE, addr != end);
2829	arch_leave_lazy_mmu_mode();
2830	pte_unmap_unlock(mapped_pte, ptl);
2831	return err;
2832}
2833
2834static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
2835			unsigned long addr, unsigned long end,
2836			unsigned long pfn, pgprot_t prot)
2837{
2838	pmd_t *pmd;
2839	unsigned long next;
2840	int err;
2841
2842	pfn -= addr >> PAGE_SHIFT;
2843	pmd = pmd_alloc(mm, pud, addr);
2844	if (!pmd)
2845		return -ENOMEM;
2846	VM_BUG_ON(pmd_trans_huge(*pmd));
2847	do {
2848		next = pmd_addr_end(addr, end);
2849		err = remap_pte_range(mm, pmd, addr, next,
2850				pfn + (addr >> PAGE_SHIFT), prot);
2851		if (err)
2852			return err;
2853	} while (pmd++, addr = next, addr != end);
2854	return 0;
2855}
2856
2857static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
2858			unsigned long addr, unsigned long end,
2859			unsigned long pfn, pgprot_t prot)
2860{
2861	pud_t *pud;
2862	unsigned long next;
2863	int err;
2864
2865	pfn -= addr >> PAGE_SHIFT;
2866	pud = pud_alloc(mm, p4d, addr);
2867	if (!pud)
2868		return -ENOMEM;
2869	do {
2870		next = pud_addr_end(addr, end);
2871		err = remap_pmd_range(mm, pud, addr, next,
2872				pfn + (addr >> PAGE_SHIFT), prot);
2873		if (err)
2874			return err;
2875	} while (pud++, addr = next, addr != end);
2876	return 0;
2877}
2878
2879static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
2880			unsigned long addr, unsigned long end,
2881			unsigned long pfn, pgprot_t prot)
2882{
2883	p4d_t *p4d;
2884	unsigned long next;
2885	int err;
2886
2887	pfn -= addr >> PAGE_SHIFT;
2888	p4d = p4d_alloc(mm, pgd, addr);
2889	if (!p4d)
2890		return -ENOMEM;
2891	do {
2892		next = p4d_addr_end(addr, end);
2893		err = remap_pud_range(mm, p4d, addr, next,
2894				pfn + (addr >> PAGE_SHIFT), prot);
2895		if (err)
2896			return err;
2897	} while (p4d++, addr = next, addr != end);
2898	return 0;
2899}
2900
2901static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
2902		unsigned long end, unsigned long vm_start, unsigned long vm_end,
2903		unsigned long pfn, pgoff_t *vm_pgoff_p)
2904{
2905	/*
2906	 * There's a horrible special case to handle copy-on-write
2907	 * behaviour that some programs depend on. We mark the "original"
2908	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2909	 * See vm_normal_page() for details.
2910	 */
2911	if (is_cow_mapping(vm_flags)) {
2912		if (addr != vm_start || end != vm_end)
2913			return -EINVAL;
2914		*vm_pgoff_p = pfn;
2915	}
2916
2917	return 0;
2918}
2919
2920static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
2921		unsigned long pfn, unsigned long size, pgprot_t prot)
2922{
2923	pgd_t *pgd;
2924	unsigned long next;
2925	unsigned long end = addr + PAGE_ALIGN(size);
2926	struct mm_struct *mm = vma->vm_mm;
2927	int err;
2928
2929	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2930		return -EINVAL;
2931
2932	VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
2933
2934	BUG_ON(addr >= end);
2935	pfn -= addr >> PAGE_SHIFT;
2936	pgd = pgd_offset(mm, addr);
2937	flush_cache_range(vma, addr, end);
2938	do {
2939		next = pgd_addr_end(addr, end);
2940		err = remap_p4d_range(mm, pgd, addr, next,
2941				pfn + (addr >> PAGE_SHIFT), prot);
2942		if (err)
2943			return err;
2944	} while (pgd++, addr = next, addr != end);
2945
2946	return 0;
2947}
2948
2949/*
2950 * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
2951 * must have pre-validated the caching bits of the pgprot_t.
2952 */
2953static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
2954		unsigned long pfn, unsigned long size, pgprot_t prot)
2955{
2956	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
2957
2958	if (!error)
2959		return 0;
2960
2961	/*
2962	 * A partial pfn range mapping is dangerous: it does not
2963	 * maintain page reference counts, and callers may free
2964	 * pages due to the error. So zap it early.
2965	 */
2966	zap_page_range_single(vma, addr, size, NULL);
2967	return error;
2968}
2969
2970#ifdef __HAVE_PFNMAP_TRACKING
2971static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
2972		unsigned long size, pgprot_t *prot)
2973{
2974	struct pfnmap_track_ctx *ctx;
2975
2976	if (pfnmap_track(pfn, size, prot))
2977		return ERR_PTR(-EINVAL);
2978
2979	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
2980	if (unlikely(!ctx)) {
2981		pfnmap_untrack(pfn, size);
2982		return ERR_PTR(-ENOMEM);
2983	}
2984
2985	ctx->pfn = pfn;
2986	ctx->size = size;
2987	kref_init(&ctx->kref);
2988	return ctx;
2989}
2990
2991void pfnmap_track_ctx_release(struct kref *ref)
2992{
2993	struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
2994
2995	pfnmap_untrack(ctx->pfn, ctx->size);
2996	kfree(ctx);
2997}
2998
2999static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr,
3000		unsigned long pfn, unsigned long size, pgprot_t prot)
3001{
3002	struct pfnmap_track_ctx *ctx = NULL;
3003	int err;
3004
3005	size = PAGE_ALIGN(size);
3006
3007	/*
3008	 * If we cover the full VMA, we'll perform actual tracking, and
3009	 * remember to untrack when the last reference to our tracking
3010	 * context from a VMA goes away. We'll keep tracking the whole pfn
3011	 * range even during VMA splits and partial unmapping.
3012	 *
3013	 * If we only cover parts of the VMA, we'll only setup the cachemode
3014	 * in the pgprot for the pfn range.
3015	 */
3016	if (addr == vma->vm_start && addr + size == vma->vm_end) {
3017		if (vma->pfnmap_track_ctx)
3018			return -EINVAL;
3019		ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
3020		if (IS_ERR(ctx))
3021			return PTR_ERR(ctx);
3022	} else if (pfnmap_setup_cachemode(pfn, size, &prot)) {
3023		return -EINVAL;
3024	}
3025
3026	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
3027	if (ctx) {
3028		if (err)
3029			kref_put(&ctx->kref, pfnmap_track_ctx_release);
3030		else
3031			vma->pfnmap_track_ctx = ctx;
3032	}
3033	return err;
3034}
3035
3036static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
3037		unsigned long pfn, unsigned long size, pgprot_t prot)
3038{
3039	return remap_pfn_range_track(vma, addr, pfn, size, prot);
3040}
3041#else
3042static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
3043		unsigned long pfn, unsigned long size, pgprot_t prot)
3044{
3045	return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
3046}
3047#endif
3048
3049void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
3050{
3051	/*
3052	 * We set addr=VMA start, end=VMA end here, so this won't fail, but we
3053	 * check it again on complete and will fail there if specified addr is
3054	 * invalid.
3055	 */
3056	get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
3057			desc->start, desc->end, pfn, &desc->pgoff);
3058	desc->vm_flags |= VM_REMAP_FLAGS;
3059}
3060
3061static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
3062		unsigned long pfn, unsigned long size)
3063{
3064	unsigned long end = addr + PAGE_ALIGN(size);
3065	int err;
3066
3067	err = get_remap_pgoff(vma->vm_flags, addr, end,
3068			      vma->vm_start, vma->vm_end,
3069			      pfn, &vma->vm_pgoff);
3070	if (err)
3071		return err;
3072
3073	vm_flags_set(vma, VM_REMAP_FLAGS);
3074	return 0;
3075}
3076
3077/**
3078 * remap_pfn_range - remap kernel memory to userspace
3079 * @vma: user vma to map to
3080 * @addr: target page aligned user address to start at
3081 * @pfn: page frame number of kernel physical memory address
3082 * @size: size of mapping area
3083 * @prot: page protection flags for this mapping
3084 *
3085 * Note: this is only safe if the mm semaphore is held when called.
3086 *
3087 * Return: %0 on success, negative error code otherwise.
3088 */
3089int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
3090		    unsigned long pfn, unsigned long size, pgprot_t prot)
3091{
3092	int err;
3093
3094	err = remap_pfn_range_prepare_vma(vma, addr, pfn, size);
3095	if (err)
3096		return err;
3097
3098	return do_remap_pfn_range(vma, addr, pfn, size, prot);
3099}
3100EXPORT_SYMBOL(remap_pfn_range);
3101
3102int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
3103		unsigned long pfn, unsigned long size, pgprot_t prot)
3104{
3105	return do_remap_pfn_range(vma, addr, pfn, size, prot);
3106}
3107
3108/**
3109 * vm_iomap_memory - remap memory to userspace
3110 * @vma: user vma to map to
3111 * @start: start of the physical memory to be mapped
3112 * @len: size of area
3113 *
3114 * This is a simplified io_remap_pfn_range() for common driver use. The
3115 * driver just needs to give us the physical memory range to be mapped,
3116 * we'll figure out the rest from the vma information.
3117 *
3118 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
3119 * whatever write-combining details or similar.
3120 *
3121 * Return: %0 on success, negative error code otherwise.
3122 */
3123int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
3124{
3125	unsigned long vm_len, pfn, pages;
3126
3127	/* Check that the physical memory area passed in looks valid */
3128	if (start + len < start)
3129		return -EINVAL;
3130	/*
3131	 * You *really* shouldn't map things that aren't page-aligned,
3132	 * but we've historically allowed it because IO memory might
3133	 * just have smaller alignment.
3134	 */
3135	len += start & ~PAGE_MASK;
3136	pfn = start >> PAGE_SHIFT;
3137	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
3138	if (pfn + pages < pfn)
3139		return -EINVAL;
3140
3141	/* We start the mapping 'vm_pgoff' pages into the area */
3142	if (vma->vm_pgoff > pages)
3143		return -EINVAL;
3144	pfn += vma->vm_pgoff;
3145	pages -= vma->vm_pgoff;
3146
3147	/* Can we fit all of the mapping? */
3148	vm_len = vma->vm_end - vma->vm_start;
3149	if (vm_len >> PAGE_SHIFT > pages)
3150		return -EINVAL;
3151
3152	/* Ok, let it rip */
3153	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
3154}
3155EXPORT_SYMBOL(vm_iomap_memory);
3156
3157static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
3158				     unsigned long addr, unsigned long end,
3159				     pte_fn_t fn, void *data, bool create,
3160				     pgtbl_mod_mask *mask)
3161{
3162	pte_t *pte, *mapped_pte;
3163	int err = 0;
3164	spinlock_t *ptl;
3165
3166	if (create) {
3167		mapped_pte = pte = (mm == &init_mm) ?
3168			pte_alloc_kernel_track(pmd, addr, mask) :
3169			pte_alloc_map_lock(mm, pmd, addr, &ptl);
3170		if (!pte)
3171			return -ENOMEM;
3172	} else {
3173		mapped_pte = pte = (mm == &init_mm) ?
3174			pte_offset_kernel(pmd, addr) :
3175			pte_offset_map_lock(mm, pmd, addr, &ptl);
3176		if (!pte)
3177			return -EINVAL;
3178	}
3179
3180	arch_enter_lazy_mmu_mode();
3181
3182	if (fn) {
3183		do {
3184			if (create || !pte_none(ptep_get(pte))) {
3185				err = fn(pte, addr, data);
3186				if (err)
3187					break;
3188			}
3189		} while (pte++, addr += PAGE_SIZE, addr != end);
3190	}
3191	*mask |= PGTBL_PTE_MODIFIED;
3192
3193	arch_leave_lazy_mmu_mode();
3194
3195	if (mm != &init_mm)
3196		pte_unmap_unlock(mapped_pte, ptl);
3197	return err;
3198}
3199
3200static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
3201				     unsigned long addr, unsigned long end,
3202				     pte_fn_t fn, void *data, bool create,
3203				     pgtbl_mod_mask *mask)
3204{
3205	pmd_t *pmd;
3206	unsigned long next;
3207	int err = 0;
3208
3209	BUG_ON(pud_leaf(*pud));
3210
3211	if (create) {
3212		pmd = pmd_alloc_track(mm, pud, addr, mask);
3213		if (!pmd)
3214			return -ENOMEM;
3215	} else {
3216		pmd = pmd_offset(pud, addr);
3217	}
3218	do {
3219		next = pmd_addr_end(addr, end);
3220		if (pmd_none(*pmd) && !create)
3221			continue;
3222		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
3223			return -EINVAL;
3224		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
3225			if (!create)
3226				continue;
3227			pmd_clear_bad(pmd);
3228		}
3229		err = apply_to_pte_range(mm, pmd, addr, next,
3230					 fn, data, create, mask);
3231		if (err)
3232			break;
3233	} while (pmd++, addr = next, addr != end);
3234
3235	return err;
3236}
3237
3238static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
3239				     unsigned long addr, unsigned long end,
3240				     pte_fn_t fn, void *data, bool create,
3241				     pgtbl_mod_mask *mask)
3242{
3243	pud_t *pud;
3244	unsigned long next;
3245	int err = 0;
3246
3247	if (create) {
3248		pud = pud_alloc_track(mm, p4d, addr, mask);
3249		if (!pud)
3250			return -ENOMEM;
3251	} else {
3252		pud = pud_offset(p4d, addr);
3253	}
3254	do {
3255		next = pud_addr_end(addr, end);
3256		if (pud_none(*pud) && !create)
3257			continue;
3258		if (WARN_ON_ONCE(pud_leaf(*pud)))
3259			return -EINVAL;
3260		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
3261			if (!create)
3262				continue;
3263			pud_clear_bad(pud);
3264		}
3265		err = apply_to_pmd_range(mm, pud, addr, next,
3266					 fn, data, create, mask);
3267		if (err)
3268			break;
3269	} while (pud++, addr = next, addr != end);
3270
3271	return err;
3272}
3273
3274static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
3275				     unsigned long addr, unsigned long end,
3276				     pte_fn_t fn, void *data, bool create,
3277				     pgtbl_mod_mask *mask)
3278{
3279	p4d_t *p4d;
3280	unsigned long next;
3281	int err = 0;
3282
3283	if (create) {
3284		p4d = p4d_alloc_track(mm, pgd, addr, mask);
3285		if (!p4d)
3286			return -ENOMEM;
3287	} else {
3288		p4d = p4d_offset(pgd, addr);
3289	}
3290	do {
3291		next = p4d_addr_end(addr, end);
3292		if (p4d_none(*p4d) && !create)
3293			continue;
3294		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
3295			return -EINVAL;
3296		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
3297			if (!create)
3298				continue;
3299			p4d_clear_bad(p4d);
3300		}
3301		err = apply_to_pud_range(mm, p4d, addr, next,
3302					 fn, data, create, mask);
3303		if (err)
3304			break;
3305	} while (p4d++, addr = next, addr != end);
3306
3307	return err;
3308}
3309
3310static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
3311				 unsigned long size, pte_fn_t fn,
3312				 void *data, bool create)
3313{
3314	pgd_t *pgd;
3315	unsigned long start = addr, next;
3316	unsigned long end = addr + size;
3317	pgtbl_mod_mask mask = 0;
3318	int err = 0;
3319
3320	if (WARN_ON(addr >= end))
3321		return -EINVAL;
3322
3323	pgd = pgd_offset(mm, addr);
3324	do {
3325		next = pgd_addr_end(addr, end);
3326		if (pgd_none(*pgd) && !create)
3327			continue;
3328		if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
3329			err = -EINVAL;
3330			break;
3331		}
3332		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
3333			if (!create)
3334				continue;
3335			pgd_clear_bad(pgd);
3336		}
3337		err = apply_to_p4d_range(mm, pgd, addr, next,
3338					 fn, data, create, &mask);
3339		if (err)
3340			break;
3341	} while (pgd++, addr = next, addr != end);
3342
3343	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
3344		arch_sync_kernel_mappings(start, start + size);
3345
3346	return err;
3347}
3348
3349/*
3350 * Scan a region of virtual memory, filling in page tables as necessary
3351 * and calling a provided function on each leaf page table.
3352 */
3353int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
3354			unsigned long size, pte_fn_t fn, void *data)
3355{
3356	return __apply_to_page_range(mm, addr, size, fn, data, true);
3357}
3358EXPORT_SYMBOL_GPL(apply_to_page_range);
3359
3360/*
3361 * Scan a region of virtual memory, calling a provided function on
3362 * each leaf page table where it exists.
3363 *
3364 * Unlike apply_to_page_range, this does _not_ fill in page tables
3365 * where they are absent.
3366 */
3367int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
3368				 unsigned long size, pte_fn_t fn, void *data)
3369{
3370	return __apply_to_page_range(mm, addr, size, fn, data, false);
3371}
3372
3373/*
3374 * handle_pte_fault chooses page fault handler according to an entry which was
3375 * read non-atomically.  Before making any commitment, on those architectures
3376 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
3377 * parts, do_swap_page must check under lock before unmapping the pte and
3378 * proceeding (but do_wp_page is only called after already making such a check;
3379 * and do_anonymous_page can safely check later on).
3380 */
3381static inline int pte_unmap_same(struct vm_fault *vmf)
3382{
3383	int same = 1;
3384#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
3385	if (sizeof(pte_t) > sizeof(unsigned long)) {
3386		spin_lock(vmf->ptl);
3387		same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
3388		spin_unlock(vmf->ptl);
3389	}
3390#endif
3391	pte_unmap(vmf->pte);
3392	vmf->pte = NULL;
3393	return same;
3394}
3395
3396/*
3397 * Return:
3398 *	0:		copied succeeded
3399 *	-EHWPOISON:	copy failed due to hwpoison in source page
3400 *	-EAGAIN:	copied failed (some other reason)
3401 */
3402static inline int __wp_page_copy_user(struct page *dst, struct page *src,
3403				      struct vm_fault *vmf)
3404{
3405	int ret;
3406	void *kaddr;
3407	void __user *uaddr;
3408	struct vm_area_struct *vma = vmf->vma;
3409	struct mm_struct *mm = vma->vm_mm;
3410	unsigned long addr = vmf->address;
3411
3412	if (likely(src)) {
3413		if (copy_mc_user_highpage(dst, src, addr, vma))
3414			return -EHWPOISON;
3415		return 0;
3416	}
3417
3418	/*
3419	 * If the source page was a PFN mapping, we don't have
3420	 * a "struct page" for it. We do a best-effort copy by
3421	 * just copying from the original user address. If that
3422	 * fails, we just zero-fill it. Live with it.
3423	 */
3424	kaddr = kmap_local_page(dst);
3425	pagefault_disable();
3426	uaddr = (void __user *)(addr & PAGE_MASK);
3427
3428	/*
3429	 * On architectures with software "accessed" bits, we would
3430	 * take a double page fault, so mark it accessed here.
3431	 */
3432	vmf->pte = NULL;
3433	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
3434		pte_t entry;
3435
3436		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
3437		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3438			/*
3439			 * Other thread has already handled the fault
3440			 * and update local tlb only
3441			 */
3442			if (vmf->pte)
3443				update_mmu_tlb(vma, addr, vmf->pte);
3444			ret = -EAGAIN;
3445			goto pte_unlock;
3446		}
3447
3448		entry = pte_mkyoung(vmf->orig_pte);
3449		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
3450			update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
3451	}
3452
3453	/*
3454	 * This really shouldn't fail, because the page is there
3455	 * in the page tables. But it might just be unreadable,
3456	 * in which case we just give up and fill the result with
3457	 * zeroes.
3458	 */
3459	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
3460		if (vmf->pte)
3461			goto warn;
3462
3463		/* Re-validate under PTL if the page is still mapped */
3464		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
3465		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3466			/* The PTE changed under us, update local tlb */
3467			if (vmf->pte)
3468				update_mmu_tlb(vma, addr, vmf->pte);
3469			ret = -EAGAIN;
3470			goto pte_unlock;
3471		}
3472
3473		/*
3474		 * The same page can be mapped back since last copy attempt.
3475		 * Try to copy again under PTL.
3476		 */
3477		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
3478			/*
3479			 * Give a warn in case there can be some obscure
3480			 * use-case
3481			 */
3482warn:
3483			WARN_ON_ONCE(1);
3484			clear_page(kaddr);
3485		}
3486	}
3487
3488	ret = 0;
3489
3490pte_unlock:
3491	if (vmf->pte)
3492		pte_unmap_unlock(vmf->pte, vmf->ptl);
3493	pagefault_enable();
3494	kunmap_local(kaddr);
3495	flush_dcache_page(dst);
3496
3497	return ret;
3498}
3499
3500static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
3501{
3502	struct file *vm_file = vma->vm_file;
3503
3504	if (vm_file)
3505		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
3506
3507	/*
3508	 * Special mappings (e.g. VDSO) do not have any file so fake
3509	 * a default GFP_KERNEL for them.
3510	 */
3511	return GFP_KERNEL;
3512}
3513
3514/*
3515 * Notify the address space that the page is about to become writable so that
3516 * it can prohibit this or wait for the page to get into an appropriate state.
3517 *
3518 * We do this without the lock held, so that it can sleep if it needs to.
3519 */
3520static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
3521{
3522	vm_fault_t ret;
3523	unsigned int old_flags = vmf->flags;
3524
3525	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3526
3527	if (vmf->vma->vm_file &&
3528	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
3529		return VM_FAULT_SIGBUS;
3530
3531	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
3532	/* Restore original flags so that caller is not surprised */
3533	vmf->flags = old_flags;
3534	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
3535		return ret;
3536	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
3537		folio_lock(folio);
3538		if (!folio->mapping) {
3539			folio_unlock(folio);
3540			return 0; /* retry */
3541		}
3542		ret |= VM_FAULT_LOCKED;
3543	} else
3544		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3545	return ret;
3546}
3547
3548/*
3549 * Handle dirtying of a page in shared file mapping on a write fault.
3550 *
3551 * The function expects the page to be locked and unlocks it.
3552 */
3553static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
3554{
3555	struct vm_area_struct *vma = vmf->vma;
3556	struct address_space *mapping;
3557	struct folio *folio = page_folio(vmf->page);
3558	bool dirtied;
3559	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
3560
3561	dirtied = folio_mark_dirty(folio);
3562	VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
3563	/*
3564	 * Take a local copy of the address_space - folio.mapping may be zeroed
3565	 * by truncate after folio_unlock().   The address_space itself remains
3566	 * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
3567	 * release semantics to prevent the compiler from undoing this copying.
3568	 */
3569	mapping = folio_raw_mapping(folio);
3570	folio_unlock(folio);
3571
3572	if (!page_mkwrite)
3573		file_update_time(vma->vm_file);
3574
3575	/*
3576	 * Throttle page dirtying rate down to writeback speed.
3577	 *
3578	 * mapping may be NULL here because some device drivers do not
3579	 * set page.mapping but still dirty their pages
3580	 *
3581	 * Drop the mmap_lock before waiting on IO, if we can. The file
3582	 * is pinning the mapping, as per above.
3583	 */
3584	if ((dirtied || page_mkwrite) && mapping) {
3585		struct file *fpin;
3586
3587		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
3588		balance_dirty_pages_ratelimited(mapping);
3589		if (fpin) {
3590			fput(fpin);
3591			return VM_FAULT_COMPLETED;
3592		}
3593	}
3594
3595	return 0;
3596}
3597
3598/*
3599 * Handle write page faults for pages that can be reused in the current vma
3600 *
3601 * This can happen either due to the mapping being with the VM_SHARED flag,
3602 * or due to us being the last reference standing to the page. In either
3603 * case, all we need to do here is to mark the page as writable and update
3604 * any related book-keeping.
3605 */
3606static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
3607	__releases(vmf->ptl)
3608{
3609	struct vm_area_struct *vma = vmf->vma;
3610	pte_t entry;
3611
3612	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
3613	VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));
3614
3615	if (folio) {
3616		VM_BUG_ON(folio_test_anon(folio) &&
3617			  !PageAnonExclusive(vmf->page));
3618		/*
3619		 * Clear the folio's cpupid information as the existing
3620		 * information potentially belongs to a now completely
3621		 * unrelated process.
3622		 */
3623		folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
3624	}
3625
3626	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3627	entry = pte_mkyoung(vmf->orig_pte);
3628	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3629	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
3630		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
3631	pte_unmap_unlock(vmf->pte, vmf->ptl);
3632	count_vm_event(PGREUSE);
3633}
3634
3635/*
3636 * We could add a bitflag somewhere, but for now, we know that all
3637 * vm_ops that have a ->map_pages have been audited and don't need
3638 * the mmap_lock to be held.
3639 */
3640static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
3641{
3642	struct vm_area_struct *vma = vmf->vma;
3643
3644	if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
3645		return 0;
3646	vma_end_read(vma);
3647	return VM_FAULT_RETRY;
3648}
3649
3650/**
3651 * __vmf_anon_prepare - Prepare to handle an anonymous fault.
3652 * @vmf: The vm_fault descriptor passed from the fault handler.
3653 *
3654 * When preparing to insert an anonymous page into a VMA from a
3655 * fault handler, call this function rather than anon_vma_prepare().
3656 * If this vma does not already have an associated anon_vma and we are
3657 * only protected by the per-VMA lock, the caller must retry with the
3658 * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
3659 * determine if this VMA can share its anon_vma, and that's not safe to
3660 * do with only the per-VMA lock held for this VMA.
3661 *
3662 * Return: 0 if fault handling can proceed.  Any other value should be
3663 * returned to the caller.
3664 */
3665vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
3666{
3667	struct vm_area_struct *vma = vmf->vma;
3668	vm_fault_t ret = 0;
3669
3670	if (likely(vma->anon_vma))
3671		return 0;
3672	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
3673		if (!mmap_read_trylock(vma->vm_mm))
3674			return VM_FAULT_RETRY;
3675	}
3676	if (__anon_vma_prepare(vma))
3677		ret = VM_FAULT_OOM;
3678	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
3679		mmap_read_unlock(vma->vm_mm);
3680	return ret;
3681}
3682
3683/*
3684 * Handle the case of a page which we actually need to copy to a new page,
3685 * either due to COW or unsharing.
3686 *
3687 * Called with mmap_lock locked and the old page referenced, but
3688 * without the ptl held.
3689 *
3690 * High level logic flow:
3691 *
3692 * - Allocate a page, copy the content of the old page to the new one.
3693 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
3694 * - Take the PTL. If the pte changed, bail out and release the allocated page
3695 * - If the pte is still the way we remember it, update the page table and all
3696 *   relevant references. This includes dropping the reference the page-table
3697 *   held to the old page, as well as updating the rmap.
3698 * - In any case, unlock the PTL and drop the reference we took to the old page.
3699 */
3700static vm_fault_t wp_page_copy(struct vm_fault *vmf)
3701{
3702	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
3703	struct vm_area_struct *vma = vmf->vma;
3704	struct mm_struct *mm = vma->vm_mm;
3705	struct folio *old_folio = NULL;
3706	struct folio *new_folio = NULL;
3707	pte_t entry;
3708	int page_copied = 0;
3709	struct mmu_notifier_range range;
3710	vm_fault_t ret;
3711	bool pfn_is_zero;
3712
3713	delayacct_wpcopy_start();
3714
3715	if (vmf->page)
3716		old_folio = page_folio(vmf->page);
3717	ret = vmf_anon_prepare(vmf);
3718	if (unlikely(ret))
3719		goto out;
3720
3721	pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
3722	new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
3723	if (!new_folio)
3724		goto oom;
3725
3726	if (!pfn_is_zero) {
3727		int err;
3728
3729		err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
3730		if (err) {
3731			/*
3732			 * COW failed, if the fault was solved by other,
3733			 * it's fine. If not, userspace would re-fault on
3734			 * the same address and we will handle the fault
3735			 * from the second attempt.
3736			 * The -EHWPOISON case will not be retried.
3737			 */
3738			folio_put(new_folio);
3739			if (old_folio)
3740				folio_put(old_folio);
3741
3742			delayacct_wpcopy_end();
3743			return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
3744		}
3745		kmsan_copy_page_meta(&new_folio->page, vmf->page);
3746	}
3747
3748	__folio_mark_uptodate(new_folio);
3749
3750	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
3751				vmf->address & PAGE_MASK,
3752				(vmf->address & PAGE_MASK) + PAGE_SIZE);
3753	mmu_notifier_invalidate_range_start(&range);
3754
3755	/*
3756	 * Re-check the pte - we dropped the lock
3757	 */
3758	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
3759	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3760		if (old_folio) {
3761			if (!folio_test_anon(old_folio)) {
3762				dec_mm_counter(mm, mm_counter_file(old_folio));
3763				inc_mm_counter(mm, MM_ANONPAGES);
3764			}
3765		} else {
3766			ksm_might_unmap_zero_page(mm, vmf->orig_pte);
3767			inc_mm_counter(mm, MM_ANONPAGES);
3768		}
3769		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
3770		entry = folio_mk_pte(new_folio, vma->vm_page_prot);
3771		entry = pte_sw_mkyoung(entry);
3772		if (unlikely(unshare)) {
3773			if (pte_soft_dirty(vmf->orig_pte))
3774				entry = pte_mksoft_dirty(entry);
3775			if (pte_uffd_wp(vmf->orig_pte))
3776				entry = pte_mkuffd_wp(entry);
3777		} else {
3778			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3779		}
3780
3781		/*
3782		 * Clear the pte entry and flush it first, before updating the
3783		 * pte with the new entry, to keep TLBs on different CPUs in
3784		 * sync. This code used to set the new PTE then flush TLBs, but
3785		 * that left a window where the new PTE could be loaded into
3786		 * some TLBs while the old PTE remains in others.
3787		 */
3788		ptep_clear_flush(vma, vmf->address, vmf->pte);
3789		folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE);
3790		folio_add_lru_vma(new_folio, vma);
3791		BUG_ON(unshare && pte_write(entry));
3792		set_pte_at(mm, vmf->address, vmf->pte, entry);
3793		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
3794		if (old_folio) {
3795			/*
3796			 * Only after switching the pte to the new page may
3797			 * we remove the mapcount here. Otherwise another
3798			 * process may come and find the rmap count decremented
3799			 * before the pte is switched to the new page, and
3800			 * "reuse" the old page writing into it while our pte
3801			 * here still points into it and can be read by other
3802			 * threads.
3803			 *
3804			 * The critical issue is to order this
3805			 * folio_remove_rmap_pte() with the ptp_clear_flush
3806			 * above. Those stores are ordered by (if nothing else,)
3807			 * the barrier present in the atomic_add_negative
3808			 * in folio_remove_rmap_pte();
3809			 *
3810			 * Then the TLB flush in ptep_clear_flush ensures that
3811			 * no process can access the old page before the
3812			 * decremented mapcount is visible. And the old page
3813			 * cannot be reused until after the decremented
3814			 * mapcount is visible. So transitively, TLBs to
3815			 * old page will be flushed before it can be reused.
3816			 */
3817			folio_remove_rmap_pte(old_folio, vmf->page, vma);
3818		}
3819
3820		/* Free the old page.. */
3821		new_folio = old_folio;
3822		page_copied = 1;
3823		pte_unmap_unlock(vmf->pte, vmf->ptl);
3824	} else if (vmf->pte) {
3825		update_mmu_tlb(vma, vmf->address, vmf->pte);
3826		pte_unmap_unlock(vmf->pte, vmf->ptl);
3827	}
3828
3829	mmu_notifier_invalidate_range_end(&range);
3830
3831	if (new_folio)
3832		folio_put(new_folio);
3833	if (old_folio) {
3834		if (page_copied)
3835			free_swap_cache(old_folio);
3836		folio_put(old_folio);
3837	}
3838
3839	delayacct_wpcopy_end();
3840	return 0;
3841oom:
3842	ret = VM_FAULT_OOM;
3843out:
3844	if (old_folio)
3845		folio_put(old_folio);
3846
3847	delayacct_wpcopy_end();
3848	return ret;
3849}
3850
3851/**
3852 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3853 *			  writeable once the page is prepared
3854 *
3855 * @vmf: structure describing the fault
3856 * @folio: the folio of vmf->page
3857 *
3858 * This function handles all that is needed to finish a write page fault in a
3859 * shared mapping due to PTE being read-only once the mapped page is prepared.
3860 * It handles locking of PTE and modifying it.
3861 *
3862 * The function expects the page to be locked or other protection against
3863 * concurrent faults / writeback (such as DAX radix tree locks).
3864 *
3865 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
3866 * we acquired PTE lock.
3867 */
3868static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
3869{
3870	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3871	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
3872				       &vmf->ptl);
3873	if (!vmf->pte)
3874		return VM_FAULT_NOPAGE;
3875	/*
3876	 * We might have raced with another page fault while we released the
3877	 * pte_offset_map_lock.
3878	 */
3879	if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
3880		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
3881		pte_unmap_unlock(vmf->pte, vmf->ptl);
3882		return VM_FAULT_NOPAGE;
3883	}
3884	wp_page_reuse(vmf, folio);
3885	return 0;
3886}
3887
3888/*
3889 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
3890 * mapping
3891 */
3892static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3893{
3894	struct vm_area_struct *vma = vmf->vma;
3895
3896	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3897		vm_fault_t ret;
3898
3899		pte_unmap_unlock(vmf->pte, vmf->ptl);
3900		ret = vmf_can_call_fault(vmf);
3901		if (ret)
3902			return ret;
3903
3904		vmf->flags |= FAULT_FLAG_MKWRITE;
3905		ret = vma->vm_ops->pfn_mkwrite(vmf);
3906		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
3907			return ret;
3908		return finish_mkwrite_fault(vmf, NULL);
3909	}
3910	wp_page_reuse(vmf, NULL);
3911	return 0;
3912}
3913
3914static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
3915	__releases(vmf->ptl)
3916{
3917	struct vm_area_struct *vma = vmf->vma;
3918	vm_fault_t ret = 0;
3919
3920	folio_get(folio);
3921
3922	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3923		vm_fault_t tmp;
3924
3925		pte_unmap_unlock(vmf->pte, vmf->ptl);
3926		tmp = vmf_can_call_fault(vmf);
3927		if (tmp) {
3928			folio_put(folio);
3929			return tmp;
3930		}
3931
3932		tmp = do_page_mkwrite(vmf, folio);
3933		if (unlikely(!tmp || (tmp &
3934				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3935			folio_put(folio);
3936			return tmp;
3937		}
3938		tmp = finish_mkwrite_fault(vmf, folio);
3939		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3940			folio_unlock(folio);
3941			folio_put(folio);
3942			return tmp;
3943		}
3944	} else {
3945		wp_page_reuse(vmf, folio);
3946		folio_lock(folio);
3947	}
3948	ret |= fault_dirty_shared_page(vmf);
3949	folio_put(folio);
3950
3951	return ret;
3952}
3953
3954#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Decide whether a large anonymous folio may be reused on a write fault
 * instead of being copied.
 *
 * Returns true only if, with the large mapcount lock held, the folio is not
 * flagged as shared between MMs and its mapcount equals its refcount, i.e.
 * every reference stems from a mapping. The same conditions are tested once
 * without the lock first as a cheap early exit. Returns false in all other
 * cases, including when at most one page of the folio is mapped.
 */
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
		struct vm_area_struct *vma)
{
	bool exclusive = false;

	/* Let's just free up a large folio if only a single page is mapped. */
	if (folio_large_mapcount(folio) <= 1)
		return false;

	/*
	 * The assumption for anonymous folios is that each page can only get
	 * mapped once into each MM. The only exception are KSM folios, which
	 * are always small.
	 *
	 * Each taken mapcount must be paired with exactly one taken reference,
	 * whereby the refcount must be incremented before the mapcount when
	 * mapping a page, and the refcount must be decremented after the
	 * mapcount when unmapping a page.
	 *
	 * If all folio references are from mappings, and all mappings are in
	 * the page tables of this MM, then this folio is exclusive to this MM.
	 */
	if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
		return false;

	VM_WARN_ON_ONCE(folio_test_ksm(folio));

	if (unlikely(folio_test_swapcache(folio))) {
		/*
		 * Note: freeing up the swapcache will fail if some PTEs are
		 * still swap entries.
		 */
		if (!folio_trylock(folio))
			return false;
		folio_free_swap(folio);
		folio_unlock(folio);
	}

	/* Unlocked pre-check; repeated below under the mapcount lock. */
	if (folio_large_mapcount(folio) != folio_ref_count(folio))
		return false;

	/* Stabilize the mapcount vs. refcount and recheck. */
	folio_lock_large_mapcount(folio);
	VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);

	if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
		goto unlock;
	if (folio_large_mapcount(folio) != folio_ref_count(folio))
		goto unlock;

	/* Sanity checks only: these do not influence the result. */
	VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
			folio_mm_id(folio, 1) != vma->vm_mm->mm_id);

	/*
	 * Do we need the folio lock? Likely not. If there would have been
	 * references from page migration/swapout, we would have detected
	 * an additional folio reference and never ended up here.
	 */
	exclusive = true;
unlock:
	folio_unlock_large_mapcount(folio);
	return exclusive;
}
4020#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub. wp_can_reuse_anon_folio() only calls
 * this behind an IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) check, so the call
 * is presumably compiled out in this configuration and BUILD_BUG() guards
 * against it surviving.
 */
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
		struct vm_area_struct *vma)
{
	BUILD_BUG();
}
4026#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4027
/*
 * Check whether an anonymous folio can be reused for a write fault in @vma
 * rather than copied.
 *
 * Large folios (with CONFIG_TRANSPARENT_HUGEPAGE) are delegated to
 * __wp_can_reuse_large_anon_folio(). For small folios, true is returned only
 * if, with the folio locked and any swapcache entry freed, the folio is not a
 * KSM folio and has a refcount of exactly one; in that case
 * folio_move_anon_rmap() is called before the folio is unlocked. The folio is
 * never left locked on return.
 */
static bool wp_can_reuse_anon_folio(struct folio *folio,
				    struct vm_area_struct *vma)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
		return __wp_can_reuse_large_anon_folio(folio, vma);

	/*
	 * We have to verify under folio lock: these early checks are
	 * just an optimization to avoid locking the folio and freeing
	 * the swapcache if there is little hope that we can reuse.
	 *
	 * KSM doesn't necessarily raise the folio refcount.
	 */
	if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
		return false;
	if (!folio_test_lru(folio))
		/*
		 * We cannot easily detect+handle references from
		 * remote LRU caches or references to LRU folios.
		 */
		lru_add_drain();
	/* Allow for one extra reference if the folio is in the swapcache. */
	if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
		return false;
	if (!folio_trylock(folio))
		return false;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	/* Authoritative check, now that the folio is locked. */
	if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
		folio_unlock(folio);
		return false;
	}
	/*
	 * Ok, we've got the only folio reference from our mapping
	 * and the folio is locked, it's dark out, and we're wearing
	 * sunglasses. Hit it.
	 */
	folio_move_anon_rmap(folio, vma);
	folio_unlock(folio);
	return true;
}
4068
/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 *
 * Overview of the paths taken below:
 * * uffd-wp protected PTE: hand over to handle_userfault(), or, in async
 *   mode, clear the uffd-wp bit and carry on;
 * * shared mapping (VM_SHARED | VM_MAYSHARE): wp_pfn_shared() when there is
 *   no normal page or it is an FS DAX page, wp_page_shared() otherwise;
 * * private mapping, anonymous folio that is exclusive or can be reused:
 *   mark the page exclusive and reuse it (for FAULT_FLAG_UNSHARE only unlock);
 * * everything else: copy via wp_page_copy().
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = NULL;
	pte_t pte;

	if (likely(!unshare)) {
		if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
			if (!userfaultfd_wp_async(vma)) {
				pte_unmap_unlock(vmf->pte, vmf->ptl);
				return handle_userfault(vmf, VM_UFFD_WP);
			}

			/*
			 * Nothing needed (cache flush, TLB invalidations,
			 * etc.) because we're only removing the uffd-wp bit,
			 * which is completely invisible to the user.
			 */
			pte = pte_clear_uffd_wp(ptep_get(vmf->pte));

			set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
			/*
			 * Update this to be prepared for following up CoW
			 * handling
			 */
			vmf->orig_pte = pte;
		}

		/*
		 * Userfaultfd write-protect can defer flushes. Ensure the TLB
		 * is flushed in this case before copying.
		 */
		if (unlikely(userfaultfd_wp(vmf->vma) &&
			     mm_tlb_flush_pending(vmf->vma->vm_mm)))
			flush_tlb_page(vmf->vma, vmf->address);
	}

	/* May be NULL, in which case folio stays NULL as well. */
	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

	if (vmf->page)
		folio = page_folio(vmf->page);

	/*
	 * Shared mapping: we are guaranteed to have VM_WRITE and
	 * FAULT_FLAG_WRITE set at this point.
	 */
	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if (!vmf->page || is_fsdax_page(vmf->page)) {
			vmf->page = NULL;
			return wp_pfn_shared(vmf);
		}
		return wp_page_shared(vmf, folio);
	}

	/*
	 * Private mapping: create an exclusive anonymous page copy if reuse
	 * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
	 *
	 * If we encounter a page that is marked exclusive, we must reuse
	 * the page without further checks.
	 */
	if (folio && folio_test_anon(folio) &&
	    (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
		if (!PageAnonExclusive(vmf->page))
			SetPageAnonExclusive(vmf->page);
		/* Unsharing is done once the page is exclusive: just unlock. */
		if (unlikely(unshare)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return 0;
		}
		wp_page_reuse(vmf, folio);
		return 0;
	}
	/*
	 * Ok, we need to copy. Oh, well..
	 */
	/* Take the reference while the PTE lock is still held. */
	if (folio)
		folio_get(folio);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
	if (folio && folio_test_ksm(folio))
		count_vm_event(COW_KSM);
#endif
	return wp_page_copy(vmf);
}
4185
/*
 * Zap the user address range [start_addr, end_addr) of @vma, converting the
 * end address into the length that zap_page_range_single() takes.
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long size = end_addr - start_addr;

	zap_page_range_single(vma, start_addr, size, details);
}
4192
4193static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
4194					    pgoff_t first_index,
4195					    pgoff_t last_index,
4196					    struct zap_details *details)
4197{
4198	struct vm_area_struct *vma;
4199	pgoff_t vba, vea, zba, zea;
4200
4201	vma_interval_tree_foreach(vma, root, first_index, last_index) {
4202		vba = vma->vm_pgoff;
4203		vea = vba + vma_pages(vma) - 1;
4204		zba = max(first_index, vba);
4205		zea = min(last_index, vea);
4206
4207		unmap_mapping_range_vma(vma,
4208			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
4209			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
4210				details);
4211	}
4212}
4213
4214/**
4215 * unmap_mapping_folio() - Unmap single folio from processes.
4216 * @folio: The locked folio to be unmapped.
4217 *
4218 * Unmap this folio from any userspace process which still has it mmaped.
4219 * Typically, for efficiency, the range of nearby pages has already been
4220 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
4221 * truncation or invalidation holds the lock on a folio, it may find that
4222 * the page has been remapped again: and then uses unmap_mapping_folio()
4223 * to unmap it finally.
4224 */
4225void unmap_mapping_folio(struct folio *folio)
4226{
4227	struct address_space *mapping = folio->mapping;
4228	struct zap_details details = { };
4229	pgoff_t	first_index;
4230	pgoff_t	last_index;
4231
4232	VM_BUG_ON(!folio_test_locked(folio));
4233
4234	first_index = folio->index;
4235	last_index = folio_next_index(folio) - 1;
4236
4237	details.even_cows = false;
4238	details.single_folio = folio;
4239	details.zap_flags = ZAP_FLAG_DROP_MARKER;
4240
4241	i_mmap_lock_read(mapping);
4242	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
4243		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
4244					 last_index, &details);
4245	i_mmap_unlock_read(mapping);
4246}
4247
4248/**
4249 * unmap_mapping_pages() - Unmap pages from processes.
4250 * @mapping: The address space containing pages to be unmapped.
4251 * @start: Index of first page to be unmapped.
4252 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
4253 * @even_cows: Whether to unmap even private COWed pages.
4254 *
4255 * Unmap the pages in this address space from any userspace process which
4256 * has them mmaped.  Generally, you want to remove COWed pages as well when
4257 * a file is being truncated, but not when invalidating pages from the page
4258 * cache.
4259 */
4260void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
4261		pgoff_t nr, bool even_cows)
4262{
4263	struct zap_details details = { };
4264	pgoff_t	first_index = start;
4265	pgoff_t	last_index = start + nr - 1;
4266
4267	details.even_cows = even_cows;
4268	if (last_index < first_index)
4269		last_index = ULONG_MAX;
4270
4271	i_mmap_lock_read(mapping);
4272	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
4273		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
4274					 last_index, &details);
4275	i_mmap_unlock_read(mapping);
4276}
4277EXPORT_SYMBOL_GPL(unmap_mapping_pages);
4278
4279/**
4280 * unmap_mapping_range - unmap the portion of all mmaps in the specified
4281 * address_space corresponding to the specified byte range in the underlying
4282 * file.
4283 *
4284 * @mapping: the address space containing mmaps to be unmapped.
4285 * @holebegin: byte in first page to unmap, relative to the start of
4286 * the underlying file.  This will be rounded down to a PAGE_SIZE
4287 * boundary.  Note that this is different from truncate_pagecache(), which
4288 * must keep the partial page.  In contrast, we must get rid of
4289 * partial pages.
4290 * @holelen: size of prospective hole in bytes.  This will be rounded
4291 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
4292 * end of the file.
4293 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
4294 * but 0 when invalidating pagecache, don't throw away private data.
4295 */
4296void unmap_mapping_range(struct address_space *mapping,
4297		loff_t const holebegin, loff_t const holelen, int even_cows)
4298{
4299	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
4300	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
4301
4302	/* Check for overflow. */
4303	if (sizeof(holelen) > sizeof(hlen)) {
4304		long long holeend =
4305			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
4306		if (holeend & ~(long long)ULONG_MAX)
4307			hlen = ULONG_MAX - hba + 1;
4308	}
4309
4310	unmap_mapping_pages(mapping, hba, hlen, even_cows);
4311}
4312EXPORT_SYMBOL(unmap_mapping_range);
4313
/*
 * Restore a potential device exclusive pte to a working pte entry
 *
 * Expects vmf->page to have been set by the caller. Returns 0 if the folio
 * reference could not be taken or once the restore attempt is finished, and
 * the result of folio_lock_or_retry() if the folio lock was not obtained.
 * The PTE is only restored if it still equals vmf->orig_pte under the PTE
 * lock.
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct vm_area_struct *vma = vmf->vma;
	struct mmu_notifier_range range;
	vm_fault_t ret;

	/*
	 * We need a reference to lock the folio because we don't hold
	 * the PTL so a racing thread can remove the device-exclusive
	 * entry and unmap it. If the folio is free the entry must
	 * have been removed already. If it happens to have already
	 * been re-allocated after being freed all we do is lock and
	 * unlock it.
	 */
	if (!folio_try_get(folio))
		return 0;

	ret = folio_lock_or_retry(folio, vmf);
	if (ret) {
		folio_put(folio);
		return ret;
	}
	/* The notifier range covers just the faulting page. */
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
				vma->vm_mm, vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
	mmu_notifier_invalidate_range_start(&range);

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				&vmf->ptl);
	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
		restore_exclusive_pte(vma, folio, vmf->page, vmf->address,
				      vmf->pte, vmf->orig_pte);

	/* pte_offset_map_lock() may have failed, leaving nothing to unlock. */
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	folio_unlock(folio);
	folio_put(folio);

	mmu_notifier_invalidate_range_end(&range);
	return 0;
}
4359
4360static inline bool should_try_to_free_swap(struct folio *folio,
4361					   struct vm_area_struct *vma,
4362					   unsigned int fault_flags)
4363{
4364	if (!folio_test_swapcache(folio))
4365		return false;
4366	if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
4367	    folio_test_mlocked(folio))
4368		return true;
4369	/*
4370	 * If we want to map a page that's in the swapcache writable, we
4371	 * have to detect via the refcount if we're really the exclusive
4372	 * user. Try freeing the swapcache to get rid of the swapcache
4373	 * reference only in case it's likely that we'll be the exclusive user.
4374	 */
4375	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
4376		folio_ref_count(folio) == (1 + folio_nr_pages(folio));
4377}
4378
4379static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
4380{
4381	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
4382				       vmf->address, &vmf->ptl);
4383	if (!vmf->pte)
4384		return 0;
4385	/*
4386	 * Be careful so that we will only recover a special uffd-wp pte into a
4387	 * none pte.  Otherwise it means the pte could have changed, so retry.
4388	 *
4389	 * This should also cover the case where e.g. the pte changed
4390	 * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
4391	 * So pte_is_marker() check is not enough to safely drop the pte.
4392	 */
4393	if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
4394		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
4395	pte_unmap_unlock(vmf->pte, vmf->ptl);
4396	return 0;
4397}
4398
4399static vm_fault_t do_pte_missing(struct vm_fault *vmf)
4400{
4401	if (vma_is_anonymous(vmf->vma))
4402		return do_anonymous_page(vmf);
4403	else
4404		return do_fault(vmf);
4405}
4406
4407/*
4408 * This is actually a page-missing access, but with uffd-wp special pte
4409 * installed.  It means this pte was wr-protected before being unmapped.
4410 */
4411static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
4412{
4413	/*
4414	 * Just in case there're leftover special ptes even after the region
4415	 * got unregistered - we can simply clear them.
4416	 */
4417	if (unlikely(!userfaultfd_wp(vmf->vma)))
4418		return pte_marker_clear(vmf);
4419
4420	return do_pte_missing(vmf);
4421}
4422
4423static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
4424{
4425	const softleaf_t entry = softleaf_from_pte(vmf->orig_pte);
4426	const pte_marker marker = softleaf_to_marker(entry);
4427
4428	/*
4429	 * PTE markers should never be empty.  If anything weird happened,
4430	 * the best thing to do is to kill the process along with its mm.
4431	 */
4432	if (WARN_ON_ONCE(!marker))
4433		return VM_FAULT_SIGBUS;
4434
4435	/* Higher priority than uffd-wp when data corrupted */
4436	if (marker & PTE_MARKER_POISONED)
4437		return VM_FAULT_HWPOISON;
4438
4439	/* Hitting a guard page is always a fatal condition. */
4440	if (marker & PTE_MARKER_GUARD)
4441		return VM_FAULT_SIGSEGV;
4442
4443	if (softleaf_is_uffd_wp_marker(entry))
4444		return pte_marker_handle_uffd_wp(vmf);
4445
4446	/* This is an unknown pte marker */
4447	return VM_FAULT_SIGBUS;
4448}
4449
4450static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
4451{
4452	struct vm_area_struct *vma = vmf->vma;
4453	struct folio *folio;
4454	softleaf_t entry;
4455
4456	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
4457	if (!folio)
4458		return NULL;
4459
4460	entry = softleaf_from_pte(vmf->orig_pte);
4461	if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
4462					   GFP_KERNEL, entry)) {
4463		folio_put(folio);
4464		return NULL;
4465	}
4466
4467	return folio;
4468}
4469
4470#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4471/*
4472 * Check if the PTEs within a range are contiguous swap entries
4473 * and have consistent swapcache, zeromap.
4474 */
4475static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
4476{
4477	unsigned long addr;
4478	softleaf_t entry;
4479	int idx;
4480	pte_t pte;
4481
4482	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
4483	idx = (vmf->address - addr) / PAGE_SIZE;
4484	pte = ptep_get(ptep);
4485
4486	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
4487		return false;
4488	entry = softleaf_from_pte(pte);
4489	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
4490		return false;
4491
4492	/*
4493	 * swap_read_folio() can't handle the case a large folio is hybridly
4494	 * from different backends. And they are likely corner cases. Similar
4495	 * things might be added once zswap support large folios.
4496	 */
4497	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
4498		return false;
4499	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
4500		return false;
4501
4502	return true;
4503}
4504
4505static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
4506						     unsigned long addr,
4507						     unsigned long orders)
4508{
4509	int order, nr;
4510
4511	order = highest_order(orders);
4512
4513	/*
4514	 * To swap in a THP with nr pages, we require that its first swap_offset
4515	 * is aligned with that number, as it was when the THP was swapped out.
4516	 * This helps filter out most invalid entries.
4517	 */
4518	while (orders) {
4519		nr = 1 << order;
4520		if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
4521			break;
4522		order = next_order(&orders, order);
4523	}
4524
4525	return orders;
4526}
4527
/*
 * Allocate and charge a folio for swapping in the entry in vmf->orig_pte,
 * preferring a large folio where possible.
 *
 * Falls back to the order-0 __alloc_swap_folio() when userfaultfd is armed
 * on the VMA, when zswap has ever been enabled, when no order is suitable,
 * when the PTE table cannot be mapped, or when allocating/charging every
 * remaining order fails. Returns the charged folio, or whatever
 * __alloc_swap_folio() returns on fallback (which may be NULL).
 */
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long orders;
	struct folio *folio;
	unsigned long addr;
	softleaf_t entry;
	spinlock_t *ptl;
	pte_t *pte;
	gfp_t gfp;
	int order;

	/*
	 * If uffd is active for the vma we need per-page fault fidelity to
	 * maintain the uffd semantics.
	 */
	if (unlikely(userfaultfd_armed(vma)))
		goto fallback;

	/*
	 * A large swapped out folio could be partially or fully in zswap. We
	 * lack handling for such cases, so fallback to swapping in order-0
	 * folio.
	 */
	if (!zswap_never_enabled())
		goto fallback;

	entry = softleaf_from_pte(vmf->orig_pte);
	/*
	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
	 * and suitable for swapping THP.
	 */
	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
					  BIT(PMD_ORDER) - 1);
	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
	orders = thp_swap_suitable_orders(swp_offset(entry),
					  vmf->address, orders);

	if (!orders)
		goto fallback;

	/* Map from the start of the PMD range so any aligned PTE is reachable. */
	pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
				  vmf->address & PMD_MASK, &ptl);
	if (unlikely(!pte))
		goto fallback;

	/*
	 * For do_swap_page, find the highest order where the aligned range is
	 * completely swap entries with contiguous swap offsets.
	 */
	order = highest_order(orders);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
			break;
		order = next_order(&orders, order);
	}

	pte_unmap_unlock(pte, ptl);

	/*
	 * If the loop above ran out of orders, the loop below is skipped and
	 * we fall through to the order-0 fallback.
	 */
	/* Try allocating the highest of the remaining orders. */
	gfp = vma_thp_gfp_mask(vma);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr);
		if (folio) {
			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
							    gfp, entry))
				return folio;
			/* Charge failed: account it and release the folio. */
			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
			folio_put(folio);
		}
		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
		order = next_order(&orders, order);
	}

fallback:
	return __alloc_swap_folio(vmf);
}
4607#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE only order-0 folios are allocated for
 * swapin, so simply defer to __alloc_swap_folio().
 */
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
	return __alloc_swap_folio(vmf);
}
4612#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4613
/*
 * Wait queue that do_swap_page() briefly sleeps on (with a one-jiffy timeout)
 * when swapcache_prepare() fails for a swapin that bypasses the swapcache.
 */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
4615
4616/*
4617 * We enter with non-exclusive mmap_lock (to exclude vma changes,
4618 * but allow concurrent faults), and pte mapped but not yet locked.
4619 * We return with pte unmapped and unlocked.
4620 *
4621 * We return with the mmap_lock locked or unlocked in the same cases
4622 * as does filemap_fault().
4623 */
4624vm_fault_t do_swap_page(struct vm_fault *vmf)
4625{
4626	struct vm_area_struct *vma = vmf->vma;
4627	struct folio *swapcache, *folio = NULL;
4628	DECLARE_WAITQUEUE(wait, current);
4629	struct page *page;
4630	struct swap_info_struct *si = NULL;
4631	rmap_t rmap_flags = RMAP_NONE;
4632	bool need_clear_cache = false;
4633	bool exclusive = false;
4634	softleaf_t entry;
4635	pte_t pte;
4636	vm_fault_t ret = 0;
4637	void *shadow = NULL;
4638	int nr_pages;
4639	unsigned long page_idx;
4640	unsigned long address;
4641	pte_t *ptep;
4642
4643	if (!pte_unmap_same(vmf))
4644		goto out;
4645
4646	entry = softleaf_from_pte(vmf->orig_pte);
4647	if (unlikely(!softleaf_is_swap(entry))) {
4648		if (softleaf_is_migration(entry)) {
4649			migration_entry_wait(vma->vm_mm, vmf->pmd,
4650					     vmf->address);
4651		} else if (softleaf_is_device_exclusive(entry)) {
4652			vmf->page = softleaf_to_page(entry);
4653			ret = remove_device_exclusive_entry(vmf);
4654		} else if (softleaf_is_device_private(entry)) {
4655			if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
4656				/*
4657				 * migrate_to_ram is not yet ready to operate
4658				 * under VMA lock.
4659				 */
4660				vma_end_read(vma);
4661				ret = VM_FAULT_RETRY;
4662				goto out;
4663			}
4664
4665			vmf->page = softleaf_to_page(entry);
4666			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4667					vmf->address, &vmf->ptl);
4668			if (unlikely(!vmf->pte ||
4669				     !pte_same(ptep_get(vmf->pte),
4670							vmf->orig_pte)))
4671				goto unlock;
4672
4673			/*
4674			 * Get a page reference while we know the page can't be
4675			 * freed.
4676			 */
4677			if (trylock_page(vmf->page)) {
4678				struct dev_pagemap *pgmap;
4679
4680				get_page(vmf->page);
4681				pte_unmap_unlock(vmf->pte, vmf->ptl);
4682				pgmap = page_pgmap(vmf->page);
4683				ret = pgmap->ops->migrate_to_ram(vmf);
4684				unlock_page(vmf->page);
4685				put_page(vmf->page);
4686			} else {
4687				pte_unmap_unlock(vmf->pte, vmf->ptl);
4688			}
4689		} else if (softleaf_is_hwpoison(entry)) {
4690			ret = VM_FAULT_HWPOISON;
4691		} else if (softleaf_is_marker(entry)) {
4692			ret = handle_pte_marker(vmf);
4693		} else {
4694			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
4695			ret = VM_FAULT_SIGBUS;
4696		}
4697		goto out;
4698	}
4699
4700	/* Prevent swapoff from happening to us. */
4701	si = get_swap_device(entry);
4702	if (unlikely(!si))
4703		goto out;
4704
4705	folio = swap_cache_get_folio(entry);
4706	if (folio)
4707		swap_update_readahead(folio, vma, vmf->address);
4708	swapcache = folio;
4709
4710	if (!folio) {
4711		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
4712		    __swap_count(entry) == 1) {
4713			/* skip swapcache */
4714			folio = alloc_swap_folio(vmf);
4715			if (folio) {
4716				__folio_set_locked(folio);
4717				__folio_set_swapbacked(folio);
4718
4719				nr_pages = folio_nr_pages(folio);
4720				if (folio_test_large(folio))
4721					entry.val = ALIGN_DOWN(entry.val, nr_pages);
4722				/*
4723				 * Prevent parallel swapin from proceeding with
4724				 * the cache flag. Otherwise, another thread
4725				 * may finish swapin first, free the entry, and
4726				 * swapout reusing the same entry. It's
4727				 * undetectable as pte_same() returns true due
4728				 * to entry reuse.
4729				 */
4730				if (swapcache_prepare(entry, nr_pages)) {
4731					/*
4732					 * Relax a bit to prevent rapid
4733					 * repeated page faults.
4734					 */
4735					add_wait_queue(&swapcache_wq, &wait);
4736					schedule_timeout_uninterruptible(1);
4737					remove_wait_queue(&swapcache_wq, &wait);
4738					goto out_page;
4739				}
4740				need_clear_cache = true;
4741
4742				memcg1_swapin(entry, nr_pages);
4743
4744				shadow = swap_cache_get_shadow(entry);
4745				if (shadow)
4746					workingset_refault(folio, shadow);
4747
4748				folio_add_lru(folio);
4749
4750				/* To provide entry to swap_read_folio() */
4751				folio->swap = entry;
4752				swap_read_folio(folio, NULL);
4753				folio->private = NULL;
4754			}
4755		} else {
4756			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
4757						vmf);
4758			swapcache = folio;
4759		}
4760
4761		if (!folio) {
4762			/*
4763			 * Back out if somebody else faulted in this pte
4764			 * while we released the pte lock.
4765			 */
4766			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
4767					vmf->address, &vmf->ptl);
4768			if (likely(vmf->pte &&
4769				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4770				ret = VM_FAULT_OOM;
4771			goto unlock;
4772		}
4773
4774		/* Had to read the page from swap area: Major fault */
4775		ret = VM_FAULT_MAJOR;
4776		count_vm_event(PGMAJFAULT);
4777		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
4778	}
4779
4780	ret |= folio_lock_or_retry(folio, vmf);
4781	if (ret & VM_FAULT_RETRY)
4782		goto out_release;
4783
4784	page = folio_file_page(folio, swp_offset(entry));
4785	if (swapcache) {
4786		/*
4787		 * Make sure folio_free_swap() or swapoff did not release the
4788		 * swapcache from under us.  The page pin, and pte_same test
4789		 * below, are not enough to exclude that.  Even if it is still
4790		 * swapcache, we need to check that the page's swap has not
4791		 * changed.
4792		 */
4793		if (unlikely(!folio_matches_swap_entry(folio, entry)))
4794			goto out_page;
4795
4796		if (unlikely(PageHWPoison(page))) {
4797			/*
4798			 * hwpoisoned dirty swapcache pages are kept for killing
4799			 * owner processes (which may be unknown at hwpoison time)
4800			 */
4801			ret = VM_FAULT_HWPOISON;
4802			goto out_page;
4803		}
4804
4805		/*
4806		 * KSM sometimes has to copy on read faults, for example, if
4807		 * folio->index of non-ksm folios would be nonlinear inside the
4808		 * anon VMA -- the ksm flag is lost on actual swapout.
4809		 */
4810		folio = ksm_might_need_to_copy(folio, vma, vmf->address);
4811		if (unlikely(!folio)) {
4812			ret = VM_FAULT_OOM;
4813			folio = swapcache;
4814			goto out_page;
4815		} else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
4816			ret = VM_FAULT_HWPOISON;
4817			folio = swapcache;
4818			goto out_page;
4819		}
4820		if (folio != swapcache)
4821			page = folio_page(folio, 0);
4822
4823		/*
4824		 * If we want to map a page that's in the swapcache writable, we
4825		 * have to detect via the refcount if we're really the exclusive
4826		 * owner. Try removing the extra reference from the local LRU
4827		 * caches if required.
4828		 */
4829		if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
4830		    !folio_test_ksm(folio) && !folio_test_lru(folio))
4831			lru_add_drain();
4832	}
4833
4834	folio_throttle_swaprate(folio, GFP_KERNEL);
4835
4836	/*
4837	 * Back out if somebody else already faulted in this pte.
4838	 */
4839	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
4840			&vmf->ptl);
4841	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4842		goto out_nomap;
4843
4844	if (unlikely(!folio_test_uptodate(folio))) {
4845		ret = VM_FAULT_SIGBUS;
4846		goto out_nomap;
4847	}
4848
4849	/* allocated large folios for SWP_SYNCHRONOUS_IO */
4850	if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
4851		unsigned long nr = folio_nr_pages(folio);
4852		unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
4853		unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
4854		pte_t *folio_ptep = vmf->pte - idx;
4855		pte_t folio_pte = ptep_get(folio_ptep);
4856
4857		if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
4858		    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
4859			goto out_nomap;
4860
4861		page_idx = idx;
4862		address = folio_start;
4863		ptep = folio_ptep;
4864		goto check_folio;
4865	}
4866
4867	nr_pages = 1;
4868	page_idx = 0;
4869	address = vmf->address;
4870	ptep = vmf->pte;
4871	if (folio_test_large(folio) && folio_test_swapcache(folio)) {
4872		int nr = folio_nr_pages(folio);
4873		unsigned long idx = folio_page_idx(folio, page);
4874		unsigned long folio_start = address - idx * PAGE_SIZE;
4875		unsigned long folio_end = folio_start + nr * PAGE_SIZE;
4876		pte_t *folio_ptep;
4877		pte_t folio_pte;
4878
4879		if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
4880			goto check_folio;
4881		if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
4882			goto check_folio;
4883
4884		folio_ptep = vmf->pte - idx;
4885		folio_pte = ptep_get(folio_ptep);
4886		if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
4887		    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
4888			goto check_folio;
4889
4890		page_idx = idx;
4891		address = folio_start;
4892		ptep = folio_ptep;
4893		nr_pages = nr;
4894		entry = folio->swap;
4895		page = &folio->page;
4896	}
4897
4898check_folio:
4899	/*
4900	 * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
4901	 * must never point at an anonymous page in the swapcache that is
4902	 * PG_anon_exclusive. Sanity check that this holds and especially, that
4903	 * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
4904	 * check after taking the PT lock and making sure that nobody
4905	 * concurrently faulted in this page and set PG_anon_exclusive.
4906	 */
4907	BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
4908	BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
4909
4910	/*
4911	 * Check under PT lock (to protect against concurrent fork() sharing
4912	 * the swap entry concurrently) for certainly exclusive pages.
4913	 */
4914	if (!folio_test_ksm(folio)) {
4915		exclusive = pte_swp_exclusive(vmf->orig_pte);
4916		if (folio != swapcache) {
4917			/*
4918			 * We have a fresh page that is not exposed to the
4919			 * swapcache -> certainly exclusive.
4920			 */
4921			exclusive = true;
4922		} else if (exclusive && folio_test_writeback(folio) &&
4923			  data_race(si->flags & SWP_STABLE_WRITES)) {
4924			/*
4925			 * This is tricky: not all swap backends support
4926			 * concurrent page modifications while under writeback.
4927			 *
4928			 * So if we stumble over such a page in the swapcache
4929			 * we must not set the page exclusive, otherwise we can
4930			 * map it writable without further checks and modify it
4931			 * while still under writeback.
4932			 *
4933			 * For these problematic swap backends, simply drop the
4934			 * exclusive marker: this is perfectly fine as we start
4935			 * writeback only if we fully unmapped the page and
4936			 * there are no unexpected references on the page after
4937			 * unmapping succeeded. After fully unmapped, no
4938			 * further GUP references (FOLL_GET and FOLL_PIN) can
4939			 * appear, so dropping the exclusive marker and mapping
4940			 * it only R/O is fine.
4941			 */
4942			exclusive = false;
4943		}
4944	}
4945
4946	/*
4947	 * Some architectures may have to restore extra metadata to the page
4948	 * when reading from swap. This metadata may be indexed by swap entry
4949	 * so this must be called before swap_free().
4950	 */
4951	arch_swap_restore(folio_swap(entry, folio), folio);
4952
4953	/*
4954	 * Remove the swap entry and conditionally try to free up the swapcache.
4955	 * We're already holding a reference on the page but haven't mapped it
4956	 * yet.
4957	 */
4958	swap_free_nr(entry, nr_pages);
4959	if (should_try_to_free_swap(folio, vma, vmf->flags))
4960		folio_free_swap(folio);
4961
4962	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
4963	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
4964	pte = mk_pte(page, vma->vm_page_prot);
4965	if (pte_swp_soft_dirty(vmf->orig_pte))
4966		pte = pte_mksoft_dirty(pte);
4967	if (pte_swp_uffd_wp(vmf->orig_pte))
4968		pte = pte_mkuffd_wp(pte);
4969
4970	/*
4971	 * Same logic as in do_wp_page(); however, optimize for pages that are
4972	 * certainly not shared either because we just allocated them without
4973	 * exposing them to the swapcache or because the swap entry indicates
4974	 * exclusivity.
4975	 */
4976	if (!folio_test_ksm(folio) &&
4977	    (exclusive || folio_ref_count(folio) == 1)) {
4978		if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) &&
4979		    !pte_needs_soft_dirty_wp(vma, pte)) {
4980			pte = pte_mkwrite(pte, vma);
4981			if (vmf->flags & FAULT_FLAG_WRITE) {
4982				pte = pte_mkdirty(pte);
4983				vmf->flags &= ~FAULT_FLAG_WRITE;
4984			}
4985		}
4986		rmap_flags |= RMAP_EXCLUSIVE;
4987	}
4988	folio_ref_add(folio, nr_pages - 1);
4989	flush_icache_pages(vma, page, nr_pages);
4990	vmf->orig_pte = pte_advance_pfn(pte, page_idx);
4991
4992	/* ksm created a completely new copy */
4993	if (unlikely(folio != swapcache && swapcache)) {
4994		folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
4995		folio_add_lru_vma(folio, vma);
4996	} else if (!folio_test_anon(folio)) {
4997		/*
4998		 * We currently only expect small !anon folios which are either
4999		 * fully exclusive or fully shared, or new allocated large
5000		 * folios which are fully exclusive. If we ever get large
5001		 * folios within swapcache here, we have to be careful.
5002		 */
5003		VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
5004		VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
5005		folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
5006	} else {
5007		folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
5008					rmap_flags);
5009	}
5010
5011	VM_BUG_ON(!folio_test_anon(folio) ||
5012			(pte_write(pte) && !PageAnonExclusive(page)));
5013	set_ptes(vma->vm_mm, address, ptep, pte, nr_pages);
5014	arch_do_swap_page_nr(vma->vm_mm, vma, address,
5015			pte, pte, nr_pages);
5016
5017	folio_unlock(folio);
5018	if (folio != swapcache && swapcache) {
5019		/*
5020		 * Hold the lock to avoid the swap entry to be reused
5021		 * until we take the PT lock for the pte_same() check
5022		 * (to avoid false positives from pte_same). For
5023		 * further safety release the lock after the swap_free
5024		 * so that the swap count won't change under a
5025		 * parallel locked swapcache.
5026		 */
5027		folio_unlock(swapcache);
5028		folio_put(swapcache);
5029	}
5030
5031	if (vmf->flags & FAULT_FLAG_WRITE) {
5032		ret |= do_wp_page(vmf);
5033		if (ret & VM_FAULT_ERROR)
5034			ret &= VM_FAULT_ERROR;
5035		goto out;
5036	}
5037
5038	/* No need to invalidate - it was non-present before */
5039	update_mmu_cache_range(vmf, vma, address, ptep, nr_pages);
5040unlock:
5041	if (vmf->pte)
5042		pte_unmap_unlock(vmf->pte, vmf->ptl);
5043out:
5044	/* Clear the swap cache pin for direct swapin after PTL unlock */
5045	if (need_clear_cache) {
5046		swapcache_clear(si, entry, nr_pages);
5047		if (waitqueue_active(&swapcache_wq))
5048			wake_up(&swapcache_wq);
5049	}
5050	if (si)
5051		put_swap_device(si);
5052	return ret;
5053out_nomap:
5054	if (vmf->pte)
5055		pte_unmap_unlock(vmf->pte, vmf->ptl);
5056out_page:
5057	folio_unlock(folio);
5058out_release:
5059	folio_put(folio);
5060	if (folio != swapcache && swapcache) {
5061		folio_unlock(swapcache);
5062		folio_put(swapcache);
5063	}
5064	if (need_clear_cache) {
5065		swapcache_clear(si, entry, nr_pages);
5066		if (waitqueue_active(&swapcache_wq))
5067			wake_up(&swapcache_wq);
5068	}
5069	if (si)
5070		put_swap_device(si);
5071	return ret;
5072}
5073
5074static bool pte_range_none(pte_t *pte, int nr_pages)
5075{
5076	int i;
5077
5078	for (i = 0; i < nr_pages; i++) {
5079		if (!pte_none(ptep_get_lockless(pte + i)))
5080			return false;
5081	}
5082
5083	return true;
5084}
5085
5086static struct folio *alloc_anon_folio(struct vm_fault *vmf)
5087{
5088	struct vm_area_struct *vma = vmf->vma;
5089#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5090	unsigned long orders;
5091	struct folio *folio;
5092	unsigned long addr;
5093	pte_t *pte;
5094	gfp_t gfp;
5095	int order;
5096
5097	/*
5098	 * If uffd is active for the vma we need per-page fault fidelity to
5099	 * maintain the uffd semantics.
5100	 */
5101	if (unlikely(userfaultfd_armed(vma)))
5102		goto fallback;
5103
5104	/*
5105	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
5106	 * for this vma. Then filter out the orders that can't be allocated over
5107	 * the faulting address and still be fully contained in the vma.
5108	 */
5109	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
5110					  BIT(PMD_ORDER) - 1);
5111	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
5112
5113	if (!orders)
5114		goto fallback;
5115
5116	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
5117	if (!pte)
5118		return ERR_PTR(-EAGAIN);
5119
5120	/*
5121	 * Find the highest order where the aligned range is completely
5122	 * pte_none(). Note that all remaining orders will be completely
5123	 * pte_none().
5124	 */
5125	order = highest_order(orders);
5126	while (orders) {
5127		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
5128		if (pte_range_none(pte + pte_index(addr), 1 << order))
5129			break;
5130		order = next_order(&orders, order);
5131	}
5132
5133	pte_unmap(pte);
5134
5135	if (!orders)
5136		goto fallback;
5137
5138	/* Try allocating the highest of the remaining orders. */
5139	gfp = vma_thp_gfp_mask(vma);
5140	while (orders) {
5141		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
5142		folio = vma_alloc_folio(gfp, order, vma, addr);
5143		if (folio) {
5144			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
5145				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
5146				folio_put(folio);
5147				goto next;
5148			}
5149			folio_throttle_swaprate(folio, gfp);
5150			/*
5151			 * When a folio is not zeroed during allocation
5152			 * (__GFP_ZERO not used) or user folios require special
5153			 * handling, folio_zero_user() is used to make sure
5154			 * that the page corresponding to the faulting address
5155			 * will be hot in the cache after zeroing.
5156			 */
5157			if (user_alloc_needs_zeroing())
5158				folio_zero_user(folio, vmf->address);
5159			return folio;
5160		}
5161next:
5162		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
5163		order = next_order(&orders, order);
5164	}
5165
5166fallback:
5167#endif
5168	return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
5169}
5170
5171/*
5172 * We enter with non-exclusive mmap_lock (to exclude vma changes,
5173 * but allow concurrent faults), and pte mapped but not yet locked.
5174 * We return with mmap_lock still held, but pte unmapped and unlocked.
5175 */
5176static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
5177{
5178	struct vm_area_struct *vma = vmf->vma;
5179	unsigned long addr = vmf->address;
5180	struct folio *folio;
5181	vm_fault_t ret = 0;
5182	int nr_pages = 1;
5183	pte_t entry;
5184
5185	/* File mapping without ->vm_ops ? */
5186	if (vma->vm_flags & VM_SHARED)
5187		return VM_FAULT_SIGBUS;
5188
5189	/*
5190	 * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
5191	 * be distinguished from a transient failure of pte_offset_map().
5192	 */
5193	if (pte_alloc(vma->vm_mm, vmf->pmd))
5194		return VM_FAULT_OOM;
5195
5196	/* Use the zero-page for reads */
5197	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
5198			!mm_forbids_zeropage(vma->vm_mm)) {
5199		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
5200						vma->vm_page_prot));
5201		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
5202				vmf->address, &vmf->ptl);
5203		if (!vmf->pte)
5204			goto unlock;
5205		if (vmf_pte_changed(vmf)) {
5206			update_mmu_tlb(vma, vmf->address, vmf->pte);
5207			goto unlock;
5208		}
5209		ret = check_stable_address_space(vma->vm_mm);
5210		if (ret)
5211			goto unlock;
5212		/* Deliver the page fault to userland, check inside PT lock */
5213		if (userfaultfd_missing(vma)) {
5214			pte_unmap_unlock(vmf->pte, vmf->ptl);
5215			return handle_userfault(vmf, VM_UFFD_MISSING);
5216		}
5217		goto setpte;
5218	}
5219
5220	/* Allocate our own private page. */
5221	ret = vmf_anon_prepare(vmf);
5222	if (ret)
5223		return ret;
5224	/* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
5225	folio = alloc_anon_folio(vmf);
5226	if (IS_ERR(folio))
5227		return 0;
5228	if (!folio)
5229		goto oom;
5230
5231	nr_pages = folio_nr_pages(folio);
5232	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
5233
5234	/*
5235	 * The memory barrier inside __folio_mark_uptodate makes sure that
5236	 * preceding stores to the page contents become visible before
5237	 * the set_pte_at() write.
5238	 */
5239	__folio_mark_uptodate(folio);
5240
5241	entry = folio_mk_pte(folio, vma->vm_page_prot);
5242	entry = pte_sw_mkyoung(entry);
5243	if (vma->vm_flags & VM_WRITE)
5244		entry = pte_mkwrite(pte_mkdirty(entry), vma);
5245
5246	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
5247	if (!vmf->pte)
5248		goto release;
5249	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
5250		update_mmu_tlb(vma, addr, vmf->pte);
5251		goto release;
5252	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
5253		update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
5254		goto release;
5255	}
5256
5257	ret = check_stable_address_space(vma->vm_mm);
5258	if (ret)
5259		goto release;
5260
5261	/* Deliver the page fault to userland, check inside PT lock */
5262	if (userfaultfd_missing(vma)) {
5263		pte_unmap_unlock(vmf->pte, vmf->ptl);
5264		folio_put(folio);
5265		return handle_userfault(vmf, VM_UFFD_MISSING);
5266	}
5267
5268	folio_ref_add(folio, nr_pages - 1);
5269	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
5270	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
5271	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
5272	folio_add_lru_vma(folio, vma);
5273setpte:
5274	if (vmf_orig_pte_uffd_wp(vmf))
5275		entry = pte_mkuffd_wp(entry);
5276	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
5277
5278	/* No need to invalidate - it was non-present before */
5279	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
5280unlock:
5281	if (vmf->pte)
5282		pte_unmap_unlock(vmf->pte, vmf->ptl);
5283	return ret;
5284release:
5285	folio_put(folio);
5286	goto unlock;
5287oom:
5288	return VM_FAULT_OOM;
5289}
5290
5291/*
5292 * The mmap_lock must have been held on entry, and may have been
5293 * released depending on flags and vma->vm_ops->fault() return value.
5294 * See filemap_fault() and __lock_page_retry().
5295 */
5296static vm_fault_t __do_fault(struct vm_fault *vmf)
5297{
5298	struct vm_area_struct *vma = vmf->vma;
5299	struct folio *folio;
5300	vm_fault_t ret;
5301
5302	/*
5303	 * Preallocate pte before we take page_lock because this might lead to
5304	 * deadlocks for memcg reclaim which waits for pages under writeback:
5305	 *				lock_page(A)
5306	 *				SetPageWriteback(A)
5307	 *				unlock_page(A)
5308	 * lock_page(B)
5309	 *				lock_page(B)
5310	 * pte_alloc_one
5311	 *   shrink_folio_list
5312	 *     wait_on_page_writeback(A)
5313	 *				SetPageWriteback(B)
5314	 *				unlock_page(B)
5315	 *				# flush A, B to clear the writeback
5316	 */
5317	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
5318		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
5319		if (!vmf->prealloc_pte)
5320			return VM_FAULT_OOM;
5321	}
5322
5323	ret = vma->vm_ops->fault(vmf);
5324	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
5325			    VM_FAULT_DONE_COW)))
5326		return ret;
5327
5328	folio = page_folio(vmf->page);
5329	if (unlikely(PageHWPoison(vmf->page))) {
5330		vm_fault_t poisonret = VM_FAULT_HWPOISON;
5331		if (ret & VM_FAULT_LOCKED) {
5332			if (page_mapped(vmf->page))
5333				unmap_mapping_folio(folio);
5334			/* Retry if a clean folio was removed from the cache. */
5335			if (mapping_evict_folio(folio->mapping, folio))
5336				poisonret = VM_FAULT_NOPAGE;
5337			folio_unlock(folio);
5338		}
5339		folio_put(folio);
5340		vmf->page = NULL;
5341		return poisonret;
5342	}
5343
5344	if (unlikely(!(ret & VM_FAULT_LOCKED)))
5345		folio_lock(folio);
5346	else
5347		VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);
5348
5349	return ret;
5350}
5351
5352#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5353static void deposit_prealloc_pte(struct vm_fault *vmf)
5354{
5355	struct vm_area_struct *vma = vmf->vma;
5356
5357	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
5358	/*
5359	 * We are going to consume the prealloc table,
5360	 * count that as nr_ptes.
5361	 */
5362	mm_inc_nr_ptes(vma->vm_mm);
5363	vmf->prealloc_pte = NULL;
5364}
5365
5366vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
5367{
5368	struct vm_area_struct *vma = vmf->vma;
5369	bool write = vmf->flags & FAULT_FLAG_WRITE;
5370	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
5371	pmd_t entry;
5372	vm_fault_t ret = VM_FAULT_FALLBACK;
5373
5374	/*
5375	 * It is too late to allocate a small folio, we already have a large
5376	 * folio in the pagecache: especially s390 KVM cannot tolerate any
5377	 * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
5378	 * PMD mappings if THPs are disabled. As we already have a THP,
5379	 * behave as if we are forcing a collapse.
5380	 */
5381	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
5382						     /* forced_collapse=*/ true))
5383		return ret;
5384
5385	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
5386		return ret;
5387
5388	if (folio_order(folio) != HPAGE_PMD_ORDER)
5389		return ret;
5390	page = &folio->page;
5391
5392	/*
5393	 * Just backoff if any subpage of a THP is corrupted otherwise
5394	 * the corrupted page may mapped by PMD silently to escape the
5395	 * check.  This kind of THP just can be PTE mapped.  Access to
5396	 * the corrupted subpage should trigger SIGBUS as expected.
5397	 */
5398	if (unlikely(folio_test_has_hwpoisoned(folio)))
5399		return ret;
5400
5401	/*
5402	 * Archs like ppc64 need additional space to store information
5403	 * related to pte entry. Use the preallocated table for that.
5404	 */
5405	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
5406		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
5407		if (!vmf->prealloc_pte)
5408			return VM_FAULT_OOM;
5409	}
5410
5411	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
5412	if (unlikely(!pmd_none(*vmf->pmd)))
5413		goto out;
5414
5415	flush_icache_pages(vma, page, HPAGE_PMD_NR);
5416
5417	entry = folio_mk_pmd(folio, vma->vm_page_prot);
5418	if (write)
5419		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
5420
5421	add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
5422	folio_add_file_rmap_pmd(folio, page, vma);
5423
5424	/*
5425	 * deposit and withdraw with pmd lock held
5426	 */
5427	if (arch_needs_pgtable_deposit())
5428		deposit_prealloc_pte(vmf);
5429
5430	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
5431
5432	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
5433
5434	/* fault is handled */
5435	ret = 0;
5436	count_vm_event(THP_FILE_MAPPED);
5437out:
5438	spin_unlock(vmf->ptl);
5439	return ret;
5440}
5441#else
5442vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
5443{
5444	return VM_FAULT_FALLBACK;
5445}
5446#endif
5447
5448/**
5449 * set_pte_range - Set a range of PTEs to point to pages in a folio.
5450 * @vmf: Fault description.
5451 * @folio: The folio that contains @page.
5452 * @page: The first page to create a PTE for.
5453 * @nr: The number of PTEs to create.
5454 * @addr: The first address to create a PTE for.
5455 */
5456void set_pte_range(struct vm_fault *vmf, struct folio *folio,
5457		struct page *page, unsigned int nr, unsigned long addr)
5458{
5459	struct vm_area_struct *vma = vmf->vma;
5460	bool write = vmf->flags & FAULT_FLAG_WRITE;
5461	bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
5462	pte_t entry;
5463
5464	flush_icache_pages(vma, page, nr);
5465	entry = mk_pte(page, vma->vm_page_prot);
5466
5467	if (prefault && arch_wants_old_prefaulted_pte())
5468		entry = pte_mkold(entry);
5469	else
5470		entry = pte_sw_mkyoung(entry);
5471
5472	if (write)
5473		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
5474	else if (pte_write(entry) && folio_test_dirty(folio))
5475		entry = pte_mkdirty(entry);
5476	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
5477		entry = pte_mkuffd_wp(entry);
5478	/* copy-on-write page */
5479	if (write && !(vma->vm_flags & VM_SHARED)) {
5480		VM_BUG_ON_FOLIO(nr != 1, folio);
5481		folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
5482		folio_add_lru_vma(folio, vma);
5483	} else {
5484		folio_add_file_rmap_ptes(folio, page, nr, vma);
5485	}
5486	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
5487
5488	/* no need to invalidate: a not-present page won't be cached */
5489	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
5490}
5491
5492static bool vmf_pte_changed(struct vm_fault *vmf)
5493{
5494	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
5495		return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
5496
5497	return !pte_none(ptep_get(vmf->pte));
5498}
5499
5500/**
5501 * finish_fault - finish page fault once we have prepared the page to fault
5502 *
5503 * @vmf: structure describing the fault
5504 *
5505 * This function handles all that is needed to finish a page fault once the
5506 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
5507 * given page, adds reverse page mapping, handles memcg charges and LRU
5508 * addition.
5509 *
5510 * The function expects the page to be locked and on success it consumes a
5511 * reference of a page being mapped (for the PTE which maps it).
5512 *
5513 * Return: %0 on success, %VM_FAULT_ code in case of error.
5514 */
5515vm_fault_t finish_fault(struct vm_fault *vmf)
5516{
5517	struct vm_area_struct *vma = vmf->vma;
5518	struct page *page;
5519	struct folio *folio;
5520	vm_fault_t ret;
5521	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
5522		      !(vma->vm_flags & VM_SHARED);
5523	int type, nr_pages;
5524	unsigned long addr;
5525	bool needs_fallback = false;
5526
5527fallback:
5528	addr = vmf->address;
5529
5530	/* Did we COW the page? */
5531	if (is_cow)
5532		page = vmf->cow_page;
5533	else
5534		page = vmf->page;
5535
5536	folio = page_folio(page);
5537	/*
5538	 * check even for read faults because we might have lost our CoWed
5539	 * page
5540	 */
5541	if (!(vma->vm_flags & VM_SHARED)) {
5542		ret = check_stable_address_space(vma->vm_mm);
5543		if (ret)
5544			return ret;
5545	}
5546
5547	if (!needs_fallback && vma->vm_file) {
5548		struct address_space *mapping = vma->vm_file->f_mapping;
5549		pgoff_t file_end;
5550
5551		file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
5552
5553		/*
5554		 * Do not allow to map with PTEs beyond i_size and with PMD
5555		 * across i_size to preserve SIGBUS semantics.
5556		 *
5557		 * Make an exception for shmem/tmpfs that for long time
5558		 * intentionally mapped with PMDs across i_size.
5559		 */
5560		needs_fallback = !shmem_mapping(mapping) &&
5561			file_end < folio_next_index(folio);
5562	}
5563
5564	if (pmd_none(*vmf->pmd)) {
5565		if (!needs_fallback && folio_test_pmd_mappable(folio)) {
5566			ret = do_set_pmd(vmf, folio, page);
5567			if (ret != VM_FAULT_FALLBACK)
5568				return ret;
5569		}
5570
5571		if (vmf->prealloc_pte)
5572			pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
5573		else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
5574			return VM_FAULT_OOM;
5575	}
5576
5577	nr_pages = folio_nr_pages(folio);
5578
5579	/* Using per-page fault to maintain the uffd semantics */
5580	if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
5581		nr_pages = 1;
5582	} else if (nr_pages > 1) {
5583		pgoff_t idx = folio_page_idx(folio, page);
5584		/* The page offset of vmf->address within the VMA. */
5585		pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5586		/* The index of the entry in the pagetable for fault page. */
5587		pgoff_t pte_off = pte_index(vmf->address);
5588
5589		/*
5590		 * Fallback to per-page fault in case the folio size in page
5591		 * cache beyond the VMA limits and PMD pagetable limits.
5592		 */
5593		if (unlikely(vma_off < idx ||
5594			    vma_off + (nr_pages - idx) > vma_pages(vma) ||
5595			    pte_off < idx ||
5596			    pte_off + (nr_pages - idx)  > PTRS_PER_PTE)) {
5597			nr_pages = 1;
5598		} else {
5599			/* Now we can set mappings for the whole large folio. */
5600			addr = vmf->address - idx * PAGE_SIZE;
5601			page = &folio->page;
5602		}
5603	}
5604
5605	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
5606				       addr, &vmf->ptl);
5607	if (!vmf->pte)
5608		return VM_FAULT_NOPAGE;
5609
5610	/* Re-check under ptl */
5611	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
5612		update_mmu_tlb(vma, addr, vmf->pte);
5613		ret = VM_FAULT_NOPAGE;
5614		goto unlock;
5615	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
5616		needs_fallback = true;
5617		pte_unmap_unlock(vmf->pte, vmf->ptl);
5618		goto fallback;
5619	}
5620
5621	folio_ref_add(folio, nr_pages - 1);
5622	set_pte_range(vmf, folio, page, nr_pages, addr);
5623	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
5624	add_mm_counter(vma->vm_mm, type, nr_pages);
5625	ret = 0;
5626
5627unlock:
5628	pte_unmap_unlock(vmf->pte, vmf->ptl);
5629	return ret;
5630}
5631
5632static unsigned long fault_around_pages __read_mostly =
5633	65536 >> PAGE_SHIFT;
5634
5635#ifdef CONFIG_DEBUG_FS
5636static int fault_around_bytes_get(void *data, u64 *val)
5637{
5638	*val = fault_around_pages << PAGE_SHIFT;
5639	return 0;
5640}
5641
5642/*
5643 * fault_around_bytes must be rounded down to the nearest page order as it's
5644 * what do_fault_around() expects to see.
5645 */
5646static int fault_around_bytes_set(void *data, u64 val)
5647{
5648	if (val / PAGE_SIZE > PTRS_PER_PTE)
5649		return -EINVAL;
5650
5651	/*
5652	 * The minimum value is 1 page, however this results in no fault-around
5653	 * at all. See should_fault_around().
5654	 */
5655	val = max(val, PAGE_SIZE);
5656	fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
5657
5658	return 0;
5659}
5660DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
5661		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
5662
5663static int __init fault_around_debugfs(void)
5664{
5665	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
5666				   &fault_around_bytes_fops);
5667	return 0;
5668}
5669late_initcall(fault_around_debugfs);
5670#endif
5671
5672/*
5673 * do_fault_around() tries to map few pages around the fault address. The hope
5674 * is that the pages will be needed soon and this will lower the number of
5675 * faults to handle.
5676 *
5677 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
5678 * not ready to be mapped: not up-to-date, locked, etc.
5679 *
5680 * This function doesn't cross VMA or page table boundaries, in order to call
5681 * map_pages() and acquire a PTE lock only once.
5682 *
5683 * fault_around_pages defines how many pages we'll try to map.
5684 * do_fault_around() expects it to be set to a power of two less than or equal
5685 * to PTRS_PER_PTE.
5686 *
5687 * The virtual address of the area that we map is naturally aligned to
5688 * fault_around_pages * PAGE_SIZE rounded down to the machine page size
5689 * (and therefore to page order).  This way it's easier to guarantee
5690 * that we don't cross page table boundaries.
5691 */
5692static vm_fault_t do_fault_around(struct vm_fault *vmf)
5693{
5694	pgoff_t nr_pages = READ_ONCE(fault_around_pages);
5695	pgoff_t pte_off = pte_index(vmf->address);
5696	/* The page offset of vmf->address within the VMA. */
5697	pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5698	pgoff_t from_pte, to_pte;
5699	vm_fault_t ret;
5700
5701	/* The PTE offset of the start address, clamped to the VMA. */
5702	from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
5703		       pte_off - min(pte_off, vma_off));
5704
5705	/* The PTE offset of the end address, clamped to the VMA and PTE. */
5706	to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
5707		      pte_off + vma_pages(vmf->vma) - vma_off) - 1;
5708
5709	if (pmd_none(*vmf->pmd)) {
5710		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
5711		if (!vmf->prealloc_pte)
5712			return VM_FAULT_OOM;
5713	}
5714
5715	rcu_read_lock();
5716	ret = vmf->vma->vm_ops->map_pages(vmf,
5717			vmf->pgoff + from_pte - pte_off,
5718			vmf->pgoff + to_pte - pte_off);
5719	rcu_read_unlock();
5720
5721	return ret;
5722}
5723
5724/* Return true if we should do read fault-around, false otherwise */
5725static inline bool should_fault_around(struct vm_fault *vmf)
5726{
5727	/* No ->map_pages?  No way to fault around... */
5728	if (!vmf->vma->vm_ops->map_pages)
5729		return false;
5730
5731	if (uffd_disable_fault_around(vmf->vma))
5732		return false;
5733
5734	/* A single page implies no faulting 'around' at all. */
5735	return fault_around_pages > 1;
5736}
5737
5738static vm_fault_t do_read_fault(struct vm_fault *vmf)
5739{
5740	vm_fault_t ret = 0;
5741	struct folio *folio;
5742
5743	/*
5744	 * Let's call ->map_pages() first and use ->fault() as fallback
5745	 * if page by the offset is not ready to be mapped (cold cache or
5746	 * something).
5747	 */
5748	if (should_fault_around(vmf)) {
5749		ret = do_fault_around(vmf);
5750		if (ret)
5751			return ret;
5752	}
5753
5754	ret = vmf_can_call_fault(vmf);
5755	if (ret)
5756		return ret;
5757
5758	ret = __do_fault(vmf);
5759	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5760		return ret;
5761
5762	ret |= finish_fault(vmf);
5763	folio = page_folio(vmf->page);
5764	folio_unlock(folio);
5765	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5766		folio_put(folio);
5767	return ret;
5768}
5769
5770static vm_fault_t do_cow_fault(struct vm_fault *vmf)
5771{
5772	struct vm_area_struct *vma = vmf->vma;
5773	struct folio *folio;
5774	vm_fault_t ret;
5775
5776	ret = vmf_can_call_fault(vmf);
5777	if (!ret)
5778		ret = vmf_anon_prepare(vmf);
5779	if (ret)
5780		return ret;
5781
5782	folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false);
5783	if (!folio)
5784		return VM_FAULT_OOM;
5785
5786	vmf->cow_page = &folio->page;
5787
5788	ret = __do_fault(vmf);
5789	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5790		goto uncharge_out;
5791	if (ret & VM_FAULT_DONE_COW)
5792		return ret;
5793
5794	if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) {
5795		ret = VM_FAULT_HWPOISON;
5796		goto unlock;
5797	}
5798	__folio_mark_uptodate(folio);
5799
5800	ret |= finish_fault(vmf);
5801unlock:
5802	unlock_page(vmf->page);
5803	put_page(vmf->page);
5804	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
5805		goto uncharge_out;
5806	return ret;
5807uncharge_out:
5808	folio_put(folio);
5809	return ret;
5810}
5811
/*
 * Handle a write fault on a shared mapping; do_fault() dispatches here when
 * FAULT_FLAG_WRITE is set and the VMA has VM_SHARED.
 *
 * The page is obtained through __do_fault(), the backing address space is
 * notified via ->page_mkwrite (when implemented), the mapping is installed by
 * finish_fault() and the result of fault_dirty_shared_page() is folded in.
 *
 * Returns the accumulated VM_FAULT_* bits.
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret, tmp;
	struct folio *folio;

	/* Bail out early if the ->fault handler may not be called right now. */
	ret = vmf_can_call_fault(vmf);
	if (ret)
		return ret;

	ret = __do_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	folio = page_folio(vmf->page);

	/*
	 * Check if the backing address space wants to know that the page is
	 * about to become writable
	 */
	if (vma->vm_ops->page_mkwrite) {
		/* The folio is unlocked before the notification is issued. */
		folio_unlock(folio);
		tmp = do_page_mkwrite(vmf, folio);
		/*
		 * A zero result is treated like an error/NOPAGE result: drop
		 * the folio reference and hand do_page_mkwrite()'s value (not
		 * 'ret') back to the caller.
		 */
		if (unlikely(!tmp ||
				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			folio_put(folio);
			return tmp;
		}
	}

	ret |= finish_fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
					VM_FAULT_RETRY))) {
		/* Mapping was not installed: unlock and release the folio. */
		folio_unlock(folio);
		folio_put(folio);
		return ret;
	}

	ret |= fault_dirty_shared_page(vmf);
	return ret;
}
5853
5854/*
5855 * We enter with non-exclusive mmap_lock (to exclude vma changes,
5856 * but allow concurrent faults).
5857 * The mmap_lock may have been released depending on flags and our
5858 * return value.  See filemap_fault() and __folio_lock_or_retry().
5859 * If mmap_lock is released, vma may become invalid (for example
5860 * by other thread calling munmap()).
5861 */
5862static vm_fault_t do_fault(struct vm_fault *vmf)
5863{
5864	struct vm_area_struct *vma = vmf->vma;
5865	struct mm_struct *vm_mm = vma->vm_mm;
5866	vm_fault_t ret;
5867
5868	/*
5869	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
5870	 */
5871	if (!vma->vm_ops->fault) {
5872		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
5873					       vmf->address, &vmf->ptl);
5874		if (unlikely(!vmf->pte))
5875			ret = VM_FAULT_SIGBUS;
5876		else {
5877			/*
5878			 * Make sure this is not a temporary clearing of pte
5879			 * by holding ptl and checking again. A R/M/W update
5880			 * of pte involves: take ptl, clearing the pte so that
5881			 * we don't have concurrent modification by hardware
5882			 * followed by an update.
5883			 */
5884			if (unlikely(pte_none(ptep_get(vmf->pte))))
5885				ret = VM_FAULT_SIGBUS;
5886			else
5887				ret = VM_FAULT_NOPAGE;
5888
5889			pte_unmap_unlock(vmf->pte, vmf->ptl);
5890		}
5891	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
5892		ret = do_read_fault(vmf);
5893	else if (!(vma->vm_flags & VM_SHARED))
5894		ret = do_cow_fault(vmf);
5895	else
5896		ret = do_shared_fault(vmf);
5897
5898	/* preallocated pagetable is unused: free it */
5899	if (vmf->prealloc_pte) {
5900		pte_free(vm_mm, vmf->prealloc_pte);
5901		vmf->prealloc_pte = NULL;
5902	}
5903	return ret;
5904}
5905
5906int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
5907		      unsigned long addr, int *flags,
5908		      bool writable, int *last_cpupid)
5909{
5910	struct vm_area_struct *vma = vmf->vma;
5911
5912	/*
5913	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
5914	 * much anyway since they can be in shared cache state. This misses
5915	 * the case where a mapping is writable but the process never writes
5916	 * to it but pte_write gets cleared during protection updates and
5917	 * pte_dirty has unpredictable behaviour between PTE scan updates,
5918	 * background writeback, dirty balancing and application behaviour.
5919	 */
5920	if (!writable)
5921		*flags |= TNF_NO_GROUP;
5922
5923	/*
5924	 * Flag if the folio is shared between multiple address spaces. This
5925	 * is later used when determining whether to group tasks together
5926	 */
5927	if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
5928		*flags |= TNF_SHARED;
5929	/*
5930	 * For memory tiering mode, cpupid of slow memory page is used
5931	 * to record page access time.  So use default value.
5932	 */
5933	if (folio_use_access_time(folio))
5934		*last_cpupid = (-1 & LAST_CPUPID_MASK);
5935	else
5936		*last_cpupid = folio_last_cpupid(folio);
5937
5938	/* Record the current PID acceesing VMA */
5939	vma_set_access_pid_bit(vma);
5940
5941	count_vm_numa_event(NUMA_HINT_FAULTS);
5942#ifdef CONFIG_NUMA_BALANCING
5943	count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
5944#endif
5945	if (folio_nid(folio) == numa_node_id()) {
5946		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
5947		*flags |= TNF_FAULT_LOCAL;
5948	}
5949
5950	return mpol_misplaced(folio, vmf, addr);
5951}
5952
5953static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
5954					unsigned long fault_addr, pte_t *fault_pte,
5955					bool writable)
5956{
5957	pte_t pte, old_pte;
5958
5959	old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
5960	pte = pte_modify(old_pte, vma->vm_page_prot);
5961	pte = pte_mkyoung(pte);
5962	if (writable)
5963		pte = pte_mkwrite(pte, vma);
5964	ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
5965	update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
5966}
5967
/*
 * Restore the PTEs that map @folio (a large folio) around the faulting
 * address, limited to the VMA and to the page table containing vmf->pte.
 *
 * @fault_pte:         PTE value for the faulting address; its pfn is used to
 *                     locate the faulting page inside @folio.
 * @ignore_writable:   when true every PTE is rebuilt non-writable.
 * @pte_write_upgrade: when true a non-writable PTE may still be made
 *                     writable if can_change_pte_writable() allows it.
 *
 * Only entries that are present, pte_protnone() and that point into @folio
 * are touched; everything else in the range is skipped.
 */
static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
				       struct folio *folio, pte_t fault_pte,
				       bool ignore_writable, bool pte_write_upgrade)
{
	/* Index of the faulting page within the folio. */
	int nr = pte_pfn(fault_pte) - folio_pfn(folio);
	unsigned long start, end, addr = vmf->address;
	/* Virtual address at which the folio's first page would be mapped. */
	unsigned long addr_start = addr - (nr << PAGE_SHIFT);
	/* Start of the PMD_SIZE-aligned region containing the fault address. */
	unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
	pte_t *start_ptep;

	/* Stay within the VMA and within the page table. */
	start = max3(addr_start, pt_start, vma->vm_start);
	end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE,
		   vma->vm_end);
	/* Step back from the faulting PTE to the PTE that maps 'start'. */
	start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);

	/* Restore all PTEs' mapping of the large folio */
	for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(start_ptep);
		bool writable = false;

		if (!pte_present(ptent) || !pte_protnone(ptent))
			continue;

		/* Entry maps some other folio: leave it alone. */
		if (pfn_folio(pte_pfn(ptent)) != folio)
			continue;

		if (!ignore_writable) {
			/* Same writability decision as for the faulting PTE. */
			ptent = pte_modify(ptent, vma->vm_page_prot);
			writable = pte_write(ptent);
			if (!writable && pte_write_upgrade &&
			    can_change_pte_writable(vma, addr, ptent))
				writable = true;
		}

		numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
	}
}
6006
/*
 * Handle a NUMA hinting fault; handle_pte_fault() calls this for a
 * pte_protnone() entry in an accessible VMA.
 *
 * Under the page table lock the PTE is revalidated against vmf->orig_pte,
 * then numa_migrate_check() decides whether the folio should move.  If
 * migration is attempted the lock is dropped first; on success the function
 * returns without touching the PTE again, on failure the lock is retaken and
 * the PTE revalidated once more.  In all non-migrated cases the mapping is
 * made accessible again ("out_map").
 *
 * Always returns 0.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = NULL;
	int nid = NUMA_NO_NODE;
	bool writable = false, ignore_writable = false;
	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
	int last_cpupid;
	int target_nid;
	pte_t pte, old_pte;
	int flags = 0, nr_pages;

	/*
	 * The pte cannot be used safely until we verify, while holding the page
	 * table lock, that its contents have not changed during fault handling.
	 */
	spin_lock(vmf->ptl);
	/* Read the live PTE from the page tables: */
	old_pte = ptep_get(vmf->pte);

	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}

	/* PTE value as it would look with the VMA's normal protection. */
	pte = pte_modify(old_pte, vma->vm_page_prot);

	/*
	 * Detect now whether the PTE could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pte_write(pte);
	if (!writable && pte_write_upgrade &&
	    can_change_pte_writable(vma, vmf->address, pte))
		writable = true;

	folio = vm_normal_folio(vma, vmf->address, pte);
	if (!folio || folio_is_zone_device(folio))
		goto out_map;

	/*
	 * From here on nid != NUMA_NO_NODE, so the fault gets reported via
	 * task_numa_fault(); nr_pages and (through numa_migrate_check())
	 * last_cpupid are both set before any path that reports.
	 */
	nid = folio_nid(folio);
	nr_pages = folio_nr_pages(folio);

	target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
					writable, &last_cpupid);
	if (target_nid == NUMA_NO_NODE)
		goto out_map;
	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
		flags |= TNF_MIGRATE_FAIL;
		goto out_map;
	}
	/* The folio is isolated and isolation code holds a folio reference. */
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * The writability computed above was only valid under the PT lock,
	 * which has just been dropped: rebuild non-writable if we end up in
	 * out_map.
	 */
	writable = false;
	ignore_writable = true;

	/* Migrate to the requested node */
	if (!migrate_misplaced_folio(folio, target_nid)) {
		nid = target_nid;
		flags |= TNF_MIGRATED;
		task_numa_fault(last_cpupid, nid, nr_pages, flags);
		return 0;
	}

	/* Migration failed: retake the lock and make sure the PTE is unchanged. */
	flags |= TNF_MIGRATE_FAIL;
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	if (unlikely(!vmf->pte))
		return 0;
	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}
out_map:
	/*
	 * Make it present again, depending on how arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
	if (folio && folio_test_large(folio))
		numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
					   pte_write_upgrade);
	else
		numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
					    writable);
	pte_unmap_unlock(vmf->pte, vmf->ptl);

	if (nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, nid, nr_pages, flags);
	return 0;
}
6097
6098static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
6099{
6100	struct vm_area_struct *vma = vmf->vma;
6101	if (vma_is_anonymous(vma))
6102		return do_huge_pmd_anonymous_page(vmf);
6103	if (vma->vm_ops->huge_fault)
6104		return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
6105	return VM_FAULT_FALLBACK;
6106}
6107
6108/* `inline' is required to avoid gcc 4.1.2 build error */
6109static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
6110{
6111	struct vm_area_struct *vma = vmf->vma;
6112	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
6113	vm_fault_t ret;
6114
6115	if (vma_is_anonymous(vma)) {
6116		if (likely(!unshare) &&
6117		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
6118			if (userfaultfd_wp_async(vmf->vma))
6119				goto split;
6120			return handle_userfault(vmf, VM_UFFD_WP);
6121		}
6122		return do_huge_pmd_wp_page(vmf);
6123	}
6124
6125	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
6126		if (vma->vm_ops->huge_fault) {
6127			ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
6128			if (!(ret & VM_FAULT_FALLBACK))
6129				return ret;
6130		}
6131	}
6132
6133split:
6134	/* COW or write-notify handled on pte level: split pmd. */
6135	__split_huge_pmd(vma, vmf->pmd, vmf->address, false);
6136
6137	return VM_FAULT_FALLBACK;
6138}
6139
6140static vm_fault_t create_huge_pud(struct vm_fault *vmf)
6141{
6142#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
6143	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
6144	struct vm_area_struct *vma = vmf->vma;
6145	/* No support for anonymous transparent PUD pages yet */
6146	if (vma_is_anonymous(vma))
6147		return VM_FAULT_FALLBACK;
6148	if (vma->vm_ops->huge_fault)
6149		return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
6150#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
6151	return VM_FAULT_FALLBACK;
6152}
6153
/*
 * Write-protect fault on a huge PUD.
 *
 * With PUD-level THP configured, shared (VM_SHARED / VM_MAYSHARE) mappings
 * that provide ->huge_fault get to handle the fault; if they do not report
 * VM_FAULT_FALLBACK their result is returned.  In every other case the PUD is
 * split and VM_FAULT_FALLBACK is returned.  Without PUD-level THP this is
 * simply "return VM_FAULT_FALLBACK".
 *
 * Note: @orig_pud is not referenced by this function body.
 */
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vma))
		goto split;
	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
		if (vma->vm_ops->huge_fault) {
			ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
			/* Handler dealt with it (successfully or not). */
			if (!(ret & VM_FAULT_FALLBACK))
				return ret;
		}
	}
split:
	/* COW or write-notify not handled on PUD level: split pud.*/
	__split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
	return VM_FAULT_FALLBACK;
}
6177
6178/*
6179 * The page faults may be spurious because of the racy access to the
6180 * page table.  For example, a non-populated virtual page is accessed
6181 * on 2 CPUs simultaneously, thus the page faults are triggered on
6182 * both CPUs.  However, it's possible that one CPU (say CPU A) cannot
6183 * find the reason for the page fault if the other CPU (say CPU B) has
6184 * changed the page table before the PTE is checked on CPU A.  Most of
6185 * the time, the spurious page faults can be ignored safely.  However,
6186 * if the page fault is for the write access, it's possible that a
6187 * stale read-only TLB entry exists in the local CPU and needs to be
6188 * flushed on some architectures.  This is called the spurious page
6189 * fault fixing.
6190 *
6191 * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
6192 * by default and used as such on most architectures, while
6193 * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and
6194 * used as such on most architectures.
6195 */
6196static void fix_spurious_fault(struct vm_fault *vmf,
6197			       enum pgtable_level ptlevel)
6198{
6199	/* Skip spurious TLB flush for retried page fault */
6200	if (vmf->flags & FAULT_FLAG_TRIED)
6201		return;
6202	/*
6203	 * This is needed only for protection faults but the arch code
6204	 * is not yet telling us if this is a protection fault or not.
6205	 * This still avoids useless tlb flushes for .text page faults
6206	 * with threads.
6207	 */
6208	if (vmf->flags & FAULT_FLAG_WRITE) {
6209		if (ptlevel == PGTABLE_LEVEL_PTE)
6210			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
6211						     vmf->pte);
6212		else
6213			flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
6214							 vmf->pmd);
6215	}
6216}
/*
 * Handle a fault at PTE level: dispatch to the missing-PTE, swap, NUMA
 * hinting or write-protect handlers, or simply update the accessed/dirty
 * bits of a present PTE.
 *
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __folio_lock_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
		vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
	} else {
		pmd_t dummy_pmdval;

		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd by anon khugepaged, since that takes mmap_lock in write
		 * mode; but shmem or file collapse to THP could still morph
		 * it into a huge pmd: just retry later if so.
		 *
		 * Use the maywrite version to indicate that vmf->pte may be
		 * modified, but since we will use pte_same() to detect the
		 * change of the !pte_none() entry, there is no need to recheck
		 * the pmdval. Here we choose to pass a dummy variable instead
		 * of NULL, which helps new users think about why this place is
		 * special.
		 */
		vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
						    vmf->address, &dummy_pmdval,
						    &vmf->ptl);
		if (unlikely(!vmf->pte))
			return 0;
		/* Snapshot taken without the ptl; revalidated under it later. */
		vmf->orig_pte = ptep_get_lockless(vmf->pte);
		vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

		/* Empty entry: treat it like the "no page table" case below. */
		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}

	if (!vmf->pte)
		return do_pte_missing(vmf);

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	/* The PTE changed since the lockless snapshot: nothing left to do. */
	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		goto unlock;
	}
	if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
		/* do_wp_page() is entered with the ptl still held. */
		if (!pte_write(entry))
			return do_wp_page(vmf);
		else if (likely(vmf->flags & FAULT_FLAG_WRITE))
			entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	/*
	 * If ptep_set_access_flags() reports that it changed the PTE, update
	 * the MMU cache; otherwise the fault may have been spurious.
	 */
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE))
		update_mmu_cache_range(vmf, vmf->vma, vmf->address,
				vmf->pte, 1);
	else
		fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}
6307
/*
 * Walk (allocating as needed) the page tables for @address in @vma and
 * handle the fault at the first level that can: huge PUD, huge PMD, or -
 * via handle_pte_fault() - the PTE level.
 *
 * On entry, we hold either the VMA lock or the mmap_lock
 * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
 * the result, the mmap_lock is not held on exit.  See filemap_fault()
 * and __folio_lock_or_retry().
 *
 * Returns VM_FAULT_OOM if an intermediate table cannot be allocated,
 * otherwise the result of the handler that dealt with the fault.
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.real_address = address,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	struct mm_struct *mm = vma->vm_mm;
	vm_flags_t vm_flags = vma->vm_flags;
	pgd_t *pgd;
	p4d_t *p4d;
	vm_fault_t ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
retry_pud:
	/* Empty PUD and PUD-order THP allowed here: try a huge PUD mapping. */
	if (pud_none(*vmf.pud) &&
	    thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		/* Compiler barrier after taking the snapshot of the PUD. */
		barrier();
		if (pud_trans_huge(orig_pud)) {

			/*
			 * TODO once we support anonymous PUDs: NUMA case and
			 * FAULT_FLAG_UNSHARE handling.
			 */
			if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
				ret = wp_huge_pud(&vmf, orig_pud);
				if (!(ret & VM_FAULT_FALLBACK))
					return ret;
			} else {
				huge_pud_set_accessed(&vmf, orig_pud);
				return 0;
			}
		}
	}

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;

	/* Huge pud page fault raced with pmd_alloc? */
	if (pud_trans_unstable(vmf.pud))
		goto retry_pud;

	/* Empty PMD and PMD-order THP allowed here: try a huge PMD mapping. */
	if (pmd_none(*vmf.pmd) &&
	    thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
		ret = create_huge_pmd(&vmf);
		if (ret & VM_FAULT_FALLBACK)
			goto fallback;
		else
			return ret;
	}

	/* Lockless snapshot of the PMD; all checks below use this value. */
	vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
	if (pmd_none(vmf.orig_pmd))
		goto fallback;

	if (unlikely(!pmd_present(vmf.orig_pmd))) {
		if (pmd_is_device_private_entry(vmf.orig_pmd))
			return do_huge_pmd_device_private(&vmf);

		/* Wait for a migration entry to go away; the fault is retried. */
		if (pmd_is_migration_entry(vmf.orig_pmd))
			pmd_migration_entry_wait(mm, vmf.pmd);
		return 0;
	}
	if (pmd_trans_huge(vmf.orig_pmd)) {
		if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
			return do_huge_pmd_numa_page(&vmf);

		if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
		    !pmd_write(vmf.orig_pmd)) {
			ret = wp_huge_pmd(&vmf);
			if (!(ret & VM_FAULT_FALLBACK))
				return ret;
		} else {
			/* Only accessed/dirty state needs updating. */
			vmf.ptl = pmd_lock(mm, vmf.pmd);
			if (!huge_pmd_set_accessed(&vmf))
				fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
			spin_unlock(vmf.ptl);
			return 0;
		}
	}

fallback:
	return handle_pte_fault(&vmf);
}
6416
6417/**
6418 * mm_account_fault - Do page fault accounting
6419 * @mm: mm from which memcg should be extracted. It can be NULL.
6420 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
6421 *        of perf event counters, but we'll still do the per-task accounting to
6422 *        the task who triggered this page fault.
6423 * @address: the faulted address.
6424 * @flags: the fault flags.
6425 * @ret: the fault retcode.
6426 *
6427 * This will take care of most of the page fault accounting.  Meanwhile, it
6428 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
6429 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
6430 * still be in per-arch page fault handlers at the entry of page fault.
6431 */
6432static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
6433				    unsigned long address, unsigned int flags,
6434				    vm_fault_t ret)
6435{
6436	bool major;
6437
6438	/* Incomplete faults will be accounted upon completion. */
6439	if (ret & VM_FAULT_RETRY)
6440		return;
6441
6442	/*
6443	 * To preserve the behavior of older kernels, PGFAULT counters record
6444	 * both successful and failed faults, as opposed to perf counters,
6445	 * which ignore failed cases.
6446	 */
6447	count_vm_event(PGFAULT);
6448	count_memcg_event_mm(mm, PGFAULT);
6449
6450	/*
6451	 * Do not account for unsuccessful faults (e.g. when the address wasn't
6452	 * valid).  That includes arch_vma_access_permitted() failing before
6453	 * reaching here. So this is not a "this many hardware page faults"
6454	 * counter.  We should use the hw profiling for that.
6455	 */
6456	if (ret & VM_FAULT_ERROR)
6457		return;
6458
6459	/*
6460	 * We define the fault as a major fault when the final successful fault
6461	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
6462	 * handle it immediately previously).
6463	 */
6464	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
6465
6466	if (major)
6467		current->maj_flt++;
6468	else
6469		current->min_flt++;
6470
6471	/*
6472	 * If the fault is done for GUP, regs will be NULL.  We only do the
6473	 * accounting for the per thread fault counters who triggered the
6474	 * fault, and we skip the perf event updates.
6475	 */
6476	if (!regs)
6477		return;
6478
6479	if (major)
6480		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
6481	else
6482		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
6483}
6484
/*
 * Mark the current task as being inside a page fault for the multi-gen LRU.
 * Compiles to nothing without CONFIG_LRU_GEN.
 */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
#ifdef CONFIG_LRU_GEN
	/* the LRU algorithm only applies to accesses with recency */
	current->in_lru_fault = vma_has_recency(vma);
#endif /* CONFIG_LRU_GEN */
}

/* Counterpart of lru_gen_enter_fault(): clear the per-task fault marker. */
static void lru_gen_exit_fault(void)
{
#ifdef CONFIG_LRU_GEN
	current->in_lru_fault = false;
#endif /* CONFIG_LRU_GEN */
}
6505
/*
 * Validate (and where appropriate adjust) the fault flags against the VMA
 * before the fault is processed.
 *
 * @vma:   the VMA the fault is about to be handled in.
 * @flags: fault flags; FAULT_FLAG_UNSHARE may be cleared in place.
 *
 * Returns 0 if the fault may proceed, or VM_FAULT_SIGSEGV (after a one-time
 * warning) for flag combinations that must never occur.
 */
static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
				       unsigned int *flags)
{
	if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
		/* UNSHARE and WRITE must never be requested together. */
		if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
			return VM_FAULT_SIGSEGV;
		/*
		 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
		 * just treat it like an ordinary read-fault otherwise.
		 */
		if (!is_cow_mapping(vma->vm_flags))
			*flags &= ~FAULT_FLAG_UNSHARE;
	} else if (*flags & FAULT_FLAG_WRITE) {
		/* Write faults on read-only mappings are impossible ... */
		if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
			return VM_FAULT_SIGSEGV;
		/* ... and FOLL_FORCE only applies to COW mappings. */
		if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
				 !is_cow_mapping(vma->vm_flags)))
			return VM_FAULT_SIGSEGV;
	}
#ifdef CONFIG_PER_VMA_LOCK
	/*
	 * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
	 * the assumption that lock is dropped on VM_FAULT_RETRY.
	 */
	if (WARN_ON_ONCE((*flags &
			(FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
			(FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
		return VM_FAULT_SIGSEGV;
#endif

	return 0;
}
6540
/*
 * Top-level entry point for handling a page fault at @address in @vma.
 *
 * @vma:     VMA covering the faulting address.
 * @address: the faulting address.
 * @flags:   FAULT_FLAG_* bits; checked by sanitize_fault_flags() first.
 * @regs:    register state passed on to mm_account_fault().
 *
 * By the time we get here, we already hold either the VMA lock or the
 * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 *
 * Returns the VM_FAULT_* result; fault accounting is done on every path
 * through mm_account_fault().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
			   unsigned int flags, struct pt_regs *regs)
{
	/* If the fault handler drops the mmap_lock, vma may be freed */
	struct mm_struct *mm = vma->vm_mm;
	vm_fault_t ret;
	bool is_droppable;

	__set_current_state(TASK_RUNNING);

	ret = sanitize_fault_flags(vma, &flags);
	if (ret)
		goto out;

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
					    flags & FAULT_FLAG_INSTRUCTION,
					    flags & FAULT_FLAG_REMOTE)) {
		ret = VM_FAULT_SIGSEGV;
		goto out;
	}

	/* Sampled now: vma must not be dereferenced after the fault ran. */
	is_droppable = !!(vma->vm_flags & VM_DROPPABLE);

	/*
	 * Enable the memcg OOM handling for faults triggered in user
	 * space.  Kernel faults are handled more gracefully.
	 */
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_enter_user_fault();

	lru_gen_enter_fault(vma);

	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

	/*
	 * Warning: It is no longer safe to dereference vma-> after this point,
	 * because mmap_lock might have been dropped by __handle_mm_fault(), so
	 * vma might be destroyed from underneath us.
	 */

	lru_gen_exit_fault();

	/* If the mapping is droppable, then errors due to OOM aren't fatal. */
	if (is_droppable)
		ret &= ~VM_FAULT_OOM;

	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_exit_user_fault();
		/*
		 * The task may have entered a memcg OOM situation but
		 * if the allocation error was handled gracefully (no
		 * VM_FAULT_OOM), there is no need to kill anything.
		 * Just clean up the OOM state peacefully.
		 */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}
out:
	/* Uses the mm pointer saved on entry rather than vma->vm_mm. */
	mm_account_fault(mm, regs, address, flags, ret);

	return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
6614
6615#ifndef __PAGETABLE_P4D_FOLDED
6616/*
6617 * Allocate p4d page table.
6618 * We've already handled the fast-path in-line.
6619 */
6620int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
6621{
6622	p4d_t *new = p4d_alloc_one(mm, address);
6623	if (!new)
6624		return -ENOMEM;
6625
6626	spin_lock(&mm->page_table_lock);
6627	if (pgd_present(*pgd)) {	/* Another has populated it */
6628		p4d_free(mm, new);
6629	} else {
6630		smp_wmb(); /* See comment in pmd_install() */
6631		pgd_populate(mm, pgd, new);
6632	}
6633	spin_unlock(&mm->page_table_lock);
6634	return 0;
6635}
6636#endif /* __PAGETABLE_P4D_FOLDED */
6637
6638#ifndef __PAGETABLE_PUD_FOLDED
6639/*
6640 * Allocate page upper directory.
6641 * We've already handled the fast-path in-line.
6642 */
6643int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
6644{
6645	pud_t *new = pud_alloc_one(mm, address);
6646	if (!new)
6647		return -ENOMEM;
6648
6649	spin_lock(&mm->page_table_lock);
6650	if (!p4d_present(*p4d)) {
6651		mm_inc_nr_puds(mm);
6652		smp_wmb(); /* See comment in pmd_install() */
6653		p4d_populate(mm, p4d, new);
6654	} else	/* Another has populated it */
6655		pud_free(mm, new);
6656	spin_unlock(&mm->page_table_lock);
6657	return 0;
6658}
6659#endif /* __PAGETABLE_PUD_FOLDED */
6660
6661#ifndef __PAGETABLE_PMD_FOLDED
6662/*
6663 * Allocate page middle directory.
6664 * We've already handled the fast-path in-line.
6665 */
6666int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
6667{
6668	spinlock_t *ptl;
6669	pmd_t *new = pmd_alloc_one(mm, address);
6670	if (!new)
6671		return -ENOMEM;
6672
6673	ptl = pud_lock(mm, pud);
6674	if (!pud_present(*pud)) {
6675		mm_inc_nr_pmds(mm);
6676		smp_wmb(); /* See comment in pmd_install() */
6677		pud_populate(mm, pud, new);
6678	} else {	/* Another has populated it */
6679		pmd_free(mm, new);
6680	}
6681	spin_unlock(ptl);
6682	return 0;
6683}
6684#endif /* __PAGETABLE_PMD_FOLDED */
6685
6686static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
6687				     spinlock_t *lock, pte_t *ptep,
6688				     pgprot_t pgprot, unsigned long pfn_base,
6689				     unsigned long addr_mask, bool writable,
6690				     bool special)
6691{
6692	args->lock = lock;
6693	args->ptep = ptep;
6694	args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
6695	args->addr_mask = addr_mask;
6696	args->pgprot = pgprot;
6697	args->writable = writable;
6698	args->special = special;
6699}
6700
/*
 * Lockdep-only check of the locking rules for pfnmap lookups: the mmap_lock
 * of the VMA's mm must be held or, for file-backed VMAs, alternatively the
 * mapping's i_mmap_rwsem.  Compiles to nothing without CONFIG_LOCKDEP.
 */
static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
{
#ifdef CONFIG_LOCKDEP
	struct address_space *mapping = NULL;

	if (vma->vm_file)
		mapping = vma->vm_file->f_mapping;

	if (!mapping) {
		lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
		return;
	}

	lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
		       lockdep_is_held(&vma->vm_mm->mmap_lock));
#endif
}
6714
6715/**
6716 * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
6717 * @args: Pointer to struct @follow_pfnmap_args
6718 *
6719 * The caller needs to setup args->vma and args->address to point to the
6720 * virtual address as the target of such lookup.  On a successful return,
6721 * the results will be put into other output fields.
6722 *
6723 * After the caller finished using the fields, the caller must invoke
6724 * another follow_pfnmap_end() to proper releases the locks and resources
6725 * of such look up request.
6726 *
6727 * During the start() and end() calls, the results in @args will be valid
6728 * as proper locks will be held.  After the end() is called, all the fields
6729 * in @follow_pfnmap_args will be invalid to be further accessed.  Further
6730 * use of such information after end() may require proper synchronizations
6731 * by the caller with page table updates, otherwise it can create a
6732 * security bug.
6733 *
6734 * If the PTE maps a refcounted page, callers are responsible to protect
6735 * against invalidation with MMU notifiers; otherwise access to the PFN at
6736 * a later point in time can trigger use-after-free.
6737 *
6738 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
6739 * should be taken for read, and the mmap semaphore cannot be released
6740 * before the end() is invoked.
6741 *
6742 * This function must not be used to modify PTE content.
6743 *
6744 * Return: zero on success, negative otherwise.
6745 */
6746int follow_pfnmap_start(struct follow_pfnmap_args *args)
6747{
6748	struct vm_area_struct *vma = args->vma;
6749	unsigned long address = args->address;
6750	struct mm_struct *mm = vma->vm_mm;
6751	spinlock_t *lock;
6752	pgd_t *pgdp;
6753	p4d_t *p4dp, p4d;
6754	pud_t *pudp, pud;
6755	pmd_t *pmdp, pmd;
6756	pte_t *ptep, pte;
6757
6758	pfnmap_lockdep_assert(vma);
6759
6760	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
6761		goto out;
6762
6763	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
6764		goto out;
6765retry:
6766	pgdp = pgd_offset(mm, address);
6767	if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
6768		goto out;
6769
6770	p4dp = p4d_offset(pgdp, address);
6771	p4d = p4dp_get(p4dp);
6772	if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
6773		goto out;
6774
6775	pudp = pud_offset(p4dp, address);
6776	pud = pudp_get(pudp);
6777	if (pud_none(pud))
6778		goto out;
6779	if (pud_leaf(pud)) {
6780		lock = pud_lock(mm, pudp);
6781		if (!unlikely(pud_leaf(pud))) {
6782			spin_unlock(lock);
6783			goto retry;
6784		}
6785		pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
6786				  pud_pfn(pud), PUD_MASK, pud_write(pud),
6787				  pud_special(pud));
6788		return 0;
6789	}
6790
6791	pmdp = pmd_offset(pudp, address);
6792	pmd = pmdp_get_lockless(pmdp);
6793	if (pmd_leaf(pmd)) {
6794		lock = pmd_lock(mm, pmdp);
6795		if (!unlikely(pmd_leaf(pmd))) {
6796			spin_unlock(lock);
6797			goto retry;
6798		}
6799		pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
6800				  pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
6801				  pmd_special(pmd));
6802		return 0;
6803	}
6804
6805	ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
6806	if (!ptep)
6807		goto out;
6808	pte = ptep_get(ptep);
6809	if (!pte_present(pte))
6810		goto unlock;
6811	pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
6812			  pte_pfn(pte), PAGE_MASK, pte_write(pte),
6813			  pte_special(pte));
6814	return 0;
6815unlock:
6816	pte_unmap_unlock(ptep, lock);
6817out:
6818	return -EINVAL;
6819}
6820EXPORT_SYMBOL_GPL(follow_pfnmap_start);
6821
6822/**
6823 * follow_pfnmap_end(): End a follow_pfnmap_start() process
6824 * @args: Pointer to struct @follow_pfnmap_args
6825 *
6826 * Must be used in pair of follow_pfnmap_start().  See the start() function
6827 * above for more information.
6828 */
6829void follow_pfnmap_end(struct follow_pfnmap_args *args)
6830{
6831	if (args->lock)
6832		spin_unlock(args->lock);
6833	if (args->ptep)
6834		pte_unmap(args->ptep);
6835}
6836EXPORT_SYMBOL_GPL(follow_pfnmap_end);
6837
6838#ifdef CONFIG_HAVE_IOREMAP_PROT
6839/**
6840 * generic_access_phys - generic implementation for iomem mmap access
6841 * @vma: the vma to access
6842 * @addr: userspace address, not relative offset within @vma
6843 * @buf: buffer to read/write
6844 * @len: length of transfer
6845 * @write: set to FOLL_WRITE when writing, otherwise reading
6846 *
6847 * This is a generic implementation for &vm_operations_struct.access for an
6848 * iomem mapping. This callback is used by access_process_vm() when the @vma is
6849 * not page based.
6850 */
6851int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
6852			void *buf, int len, int write)
6853{
6854	resource_size_t phys_addr;
6855	pgprot_t prot = __pgprot(0);
6856	void __iomem *maddr;
6857	int offset = offset_in_page(addr);
6858	int ret = -EINVAL;
6859	bool writable;
6860	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
6861
6862retry:
6863	if (follow_pfnmap_start(&args))
6864		return -EINVAL;
6865	prot = args.pgprot;
6866	phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
6867	writable = args.writable;
6868	follow_pfnmap_end(&args);
6869
6870	if ((write & FOLL_WRITE) && !writable)
6871		return -EINVAL;
6872
6873	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
6874	if (!maddr)
6875		return -ENOMEM;
6876
6877	if (follow_pfnmap_start(&args))
6878		goto out_unmap;
6879
6880	if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
6881	    (phys_addr != (args.pfn << PAGE_SHIFT)) ||
6882	    (writable != args.writable)) {
6883		follow_pfnmap_end(&args);
6884		iounmap(maddr);
6885		goto retry;
6886	}
6887
6888	if (write)
6889		memcpy_toio(maddr + offset, buf, len);
6890	else
6891		memcpy_fromio(buf, maddr + offset, len);
6892	ret = len;
6893	follow_pfnmap_end(&args);
6894out_unmap:
6895	iounmap(maddr);
6896
6897	return ret;
6898}
6899EXPORT_SYMBOL_GPL(generic_access_phys);
6900#endif
6901
6902/*
6903 * Access another process' address space as given in mm.
6904 */
6905static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
6906			      void *buf, int len, unsigned int gup_flags)
6907{
6908	void *old_buf = buf;
6909	int write = gup_flags & FOLL_WRITE;
6910
6911	if (mmap_read_lock_killable(mm))
6912		return 0;
6913
6914	/* Untag the address before looking up the VMA */
6915	addr = untagged_addr_remote(mm, addr);
6916
6917	/* Avoid triggering the temporary warning in __get_user_pages */
6918	if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
6919		return 0;
6920
6921	/* ignore errors, just check how much was successfully transferred */
6922	while (len) {
6923		int bytes, offset;
6924		void *maddr;
6925		struct folio *folio;
6926		struct vm_area_struct *vma = NULL;
6927		struct page *page = get_user_page_vma_remote(mm, addr,
6928							     gup_flags, &vma);
6929
6930		if (IS_ERR(page)) {
6931			/* We might need to expand the stack to access it */
6932			vma = vma_lookup(mm, addr);
6933			if (!vma) {
6934				vma = expand_stack(mm, addr);
6935
6936				/* mmap_lock was dropped on failure */
6937				if (!vma)
6938					return buf - old_buf;
6939
6940				/* Try again if stack expansion worked */
6941				continue;
6942			}
6943
6944			/*
6945			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
6946			 * we can access using slightly different code.
6947			 */
6948			bytes = 0;
6949#ifdef CONFIG_HAVE_IOREMAP_PROT
6950			if (vma->vm_ops && vma->vm_ops->access)
6951				bytes = vma->vm_ops->access(vma, addr, buf,
6952							    len, write);
6953#endif
6954			if (bytes <= 0)
6955				break;
6956		} else {
6957			folio = page_folio(page);
6958			bytes = len;
6959			offset = addr & (PAGE_SIZE-1);
6960			if (bytes > PAGE_SIZE-offset)
6961				bytes = PAGE_SIZE-offset;
6962
6963			maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
6964			if (write) {
6965				copy_to_user_page(vma, page, addr,
6966						  maddr + offset, buf, bytes);
6967				folio_mark_dirty_lock(folio);
6968			} else {
6969				copy_from_user_page(vma, page, addr,
6970						    buf, maddr + offset, bytes);
6971			}
6972			folio_release_kmap(folio, maddr);
6973		}
6974		len -= bytes;
6975		buf += bytes;
6976		addr += bytes;
6977	}
6978	mmap_read_unlock(mm);
6979
6980	return buf - old_buf;
6981}
6982
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * Thin wrapper that forwards all arguments to __access_remote_vm().
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	int copied = __access_remote_vm(mm, addr, buf, len, gup_flags);

	return copied;
}
7000
/*
 * Access the address space of task @tsk.
 * The source/target buffer must be in kernel space.
 * Do not walk the page table directly, use get_user_pages.
 *
 * The task's mm is pinned with get_task_mm() for the duration of the
 * access; if the task has no mm, nothing is transferred and 0 is returned.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(mm, addr, buf, len, gup_flags);
		mmput(mm);
	}

	return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);
7023
7024#ifdef CONFIG_BPF_SYSCALL
7025/*
7026 * Copy a string from another process's address space as given in mm.
7027 * If there is any error return -EFAULT.
7028 */
7029static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
7030				void *buf, int len, unsigned int gup_flags)
7031{
7032	void *old_buf = buf;
7033	int err = 0;
7034
7035	*(char *)buf = '\0';
7036
7037	if (mmap_read_lock_killable(mm))
7038		return -EFAULT;
7039
7040	addr = untagged_addr_remote(mm, addr);
7041
7042	/* Avoid triggering the temporary warning in __get_user_pages */
7043	if (!vma_lookup(mm, addr)) {
7044		err = -EFAULT;
7045		goto out;
7046	}
7047
7048	while (len) {
7049		int bytes, offset, retval;
7050		void *maddr;
7051		struct folio *folio;
7052		struct page *page;
7053		struct vm_area_struct *vma = NULL;
7054
7055		page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
7056		if (IS_ERR(page)) {
7057			/*
7058			 * Treat as a total failure for now until we decide how
7059			 * to handle the CONFIG_HAVE_IOREMAP_PROT case and
7060			 * stack expansion.
7061			 */
7062			*(char *)buf = '\0';
7063			err = -EFAULT;
7064			goto out;
7065		}
7066
7067		folio = page_folio(page);
7068		bytes = len;
7069		offset = addr & (PAGE_SIZE - 1);
7070		if (bytes > PAGE_SIZE - offset)
7071			bytes = PAGE_SIZE - offset;
7072
7073		maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
7074		retval = strscpy(buf, maddr + offset, bytes);
7075		if (retval >= 0) {
7076			/* Found the end of the string */
7077			buf += retval;
7078			folio_release_kmap(folio, maddr);
7079			break;
7080		}
7081
7082		buf += bytes - 1;
7083		/*
7084		 * Because strscpy always NUL terminates we need to
7085		 * copy the last byte in the page if we are going to
7086		 * load more pages
7087		 */
7088		if (bytes != len) {
7089			addr += bytes - 1;
7090			copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
7091			buf += 1;
7092			addr += 1;
7093		}
7094		len -= bytes;
7095
7096		folio_release_kmap(folio, maddr);
7097	}
7098
7099out:
7100	mmap_read_unlock(mm);
7101	if (err)
7102		return err;
7103	return buf - old_buf;
7104}
7105
7106/**
7107 * copy_remote_vm_str - copy a string from another process's address space.
7108 * @tsk:	the task of the target address space
7109 * @addr:	start address to read from
7110 * @buf:	destination buffer
7111 * @len:	number of bytes to copy
7112 * @gup_flags:	flags modifying lookup behaviour
7113 *
7114 * The caller must hold a reference on @mm.
7115 *
7116 * Return: number of bytes copied from @addr (source) to @buf (destination);
7117 * not including the trailing NUL. Always guaranteed to leave NUL-terminated
7118 * buffer. On any error, return -EFAULT.
7119 */
7120int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
7121		       void *buf, int len, unsigned int gup_flags)
7122{
7123	struct mm_struct *mm;
7124	int ret;
7125
7126	if (unlikely(len == 0))
7127		return 0;
7128
7129	mm = get_task_mm(tsk);
7130	if (!mm) {
7131		*(char *)buf = '\0';
7132		return -EFAULT;
7133	}
7134
7135	ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
7136
7137	mmput(mm);
7138
7139	return ret;
7140}
7141EXPORT_SYMBOL_GPL(copy_remote_vm_str);
7142#endif /* CONFIG_BPF_SYSCALL */
7143
7144/*
7145 * Print the name of a VMA.
7146 */
7147void print_vma_addr(char *prefix, unsigned long ip)
7148{
7149	struct mm_struct *mm = current->mm;
7150	struct vm_area_struct *vma;
7151
7152	/*
7153	 * we might be running from an atomic context so we cannot sleep
7154	 */
7155	if (!mmap_read_trylock(mm))
7156		return;
7157
7158	vma = vma_lookup(mm, ip);
7159	if (vma && vma->vm_file) {
7160		struct file *f = vma->vm_file;
7161		ip -= vma->vm_start;
7162		ip += vma->vm_pgoff << PAGE_SHIFT;
7163		printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
7164				vma->vm_start,
7165				vma->vm_end - vma->vm_start);
7166	}
7167	mmap_read_unlock(mm);
7168}
7169
7170#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
7171void __might_fault(const char *file, int line)
7172{
7173	if (pagefault_disabled())
7174		return;
7175	__might_sleep(file, line);
7176	if (current->mm)
7177		might_lock_read(&current->mm->mmap_lock);
7178}
7179EXPORT_SYMBOL(__might_fault);
7180#endif
7181
7182#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
7183/*
7184 * Process all subpages of the specified huge page with the specified
7185 * operation.  The target subpage will be processed last to keep its
7186 * cache lines hot.
7187 */
7188static inline int process_huge_page(
7189	unsigned long addr_hint, unsigned int nr_pages,
7190	int (*process_subpage)(unsigned long addr, int idx, void *arg),
7191	void *arg)
7192{
7193	int i, n, base, l, ret;
7194	unsigned long addr = addr_hint &
7195		~(((unsigned long)nr_pages << PAGE_SHIFT) - 1);
7196
7197	/* Process target subpage last to keep its cache lines hot */
7198	might_sleep();
7199	n = (addr_hint - addr) / PAGE_SIZE;
7200	if (2 * n <= nr_pages) {
7201		/* If target subpage in first half of huge page */
7202		base = 0;
7203		l = n;
7204		/* Process subpages at the end of huge page */
7205		for (i = nr_pages - 1; i >= 2 * n; i--) {
7206			cond_resched();
7207			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
7208			if (ret)
7209				return ret;
7210		}
7211	} else {
7212		/* If target subpage in second half of huge page */
7213		base = nr_pages - 2 * (nr_pages - n);
7214		l = nr_pages - n;
7215		/* Process subpages at the begin of huge page */
7216		for (i = 0; i < base; i++) {
7217			cond_resched();
7218			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
7219			if (ret)
7220				return ret;
7221		}
7222	}
7223	/*
7224	 * Process remaining subpages in left-right-left-right pattern
7225	 * towards the target subpage
7226	 */
7227	for (i = 0; i < l; i++) {
7228		int left_idx = base + i;
7229		int right_idx = base + 2 * l - 1 - i;
7230
7231		cond_resched();
7232		ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
7233		if (ret)
7234			return ret;
7235		cond_resched();
7236		ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
7237		if (ret)
7238			return ret;
7239	}
7240	return 0;
7241}
7242
7243static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
7244				unsigned int nr_pages)
7245{
7246	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
7247	int i;
7248
7249	might_sleep();
7250	for (i = 0; i < nr_pages; i++) {
7251		cond_resched();
7252		clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE);
7253	}
7254}
7255
/*
 * process_huge_page() callback: clear subpage @idx of the folio passed in
 * @arg, which will be mapped at user address @addr.  Always returns 0.
 */
static int clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct folio *target = arg;
	struct page *subpage = folio_page(target, idx);

	clear_user_highpage(subpage, addr);
	return 0;
}
7263
7264/**
7265 * folio_zero_user - Zero a folio which will be mapped to userspace.
7266 * @folio: The folio to zero.
7267 * @addr_hint: The address will be accessed or the base address if uncelar.
7268 */
7269void folio_zero_user(struct folio *folio, unsigned long addr_hint)
7270{
7271	unsigned int nr_pages = folio_nr_pages(folio);
7272
7273	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
7274		clear_gigantic_page(folio, addr_hint, nr_pages);
7275	else
7276		process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
7277}
7278
7279static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
7280				   unsigned long addr_hint,
7281				   struct vm_area_struct *vma,
7282				   unsigned int nr_pages)
7283{
7284	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst));
7285	struct page *dst_page;
7286	struct page *src_page;
7287	int i;
7288
7289	for (i = 0; i < nr_pages; i++) {
7290		dst_page = folio_page(dst, i);
7291		src_page = folio_page(src, i);
7292
7293		cond_resched();
7294		if (copy_mc_user_highpage(dst_page, src_page,
7295					  addr + i*PAGE_SIZE, vma))
7296			return -EHWPOISON;
7297	}
7298	return 0;
7299}
7300
7301struct copy_subpage_arg {
7302	struct folio *dst;
7303	struct folio *src;
7304	struct vm_area_struct *vma;
7305};
7306
7307static int copy_subpage(unsigned long addr, int idx, void *arg)
7308{
7309	struct copy_subpage_arg *copy_arg = arg;
7310	struct page *dst = folio_page(copy_arg->dst, idx);
7311	struct page *src = folio_page(copy_arg->src, idx);
7312
7313	if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma))
7314		return -EHWPOISON;
7315	return 0;
7316}
7317
7318int copy_user_large_folio(struct folio *dst, struct folio *src,
7319			  unsigned long addr_hint, struct vm_area_struct *vma)
7320{
7321	unsigned int nr_pages = folio_nr_pages(dst);
7322	struct copy_subpage_arg arg = {
7323		.dst = dst,
7324		.src = src,
7325		.vma = vma,
7326	};
7327
7328	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
7329		return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages);
7330
7331	return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg);
7332}
7333
7334long copy_folio_from_user(struct folio *dst_folio,
7335			   const void __user *usr_src,
7336			   bool allow_pagefault)
7337{
7338	void *kaddr;
7339	unsigned long i, rc = 0;
7340	unsigned int nr_pages = folio_nr_pages(dst_folio);
7341	unsigned long ret_val = nr_pages * PAGE_SIZE;
7342	struct page *subpage;
7343
7344	for (i = 0; i < nr_pages; i++) {
7345		subpage = folio_page(dst_folio, i);
7346		kaddr = kmap_local_page(subpage);
7347		if (!allow_pagefault)
7348			pagefault_disable();
7349		rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
7350		if (!allow_pagefault)
7351			pagefault_enable();
7352		kunmap_local(kaddr);
7353
7354		ret_val -= (PAGE_SIZE - rc);
7355		if (rc)
7356			break;
7357
7358		flush_dcache_page(subpage);
7359
7360		cond_resched();
7361	}
7362	return ret_val;
7363}
7364#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
7365
7366#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
7367
7368static struct kmem_cache *page_ptl_cachep;
7369
7370void __init ptlock_cache_init(void)
7371{
7372	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
7373			SLAB_PANIC, NULL);
7374}
7375
7376bool ptlock_alloc(struct ptdesc *ptdesc)
7377{
7378	spinlock_t *ptl;
7379
7380	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
7381	if (!ptl)
7382		return false;
7383	ptdesc->ptl = ptl;
7384	return true;
7385}
7386
7387void ptlock_free(struct ptdesc *ptdesc)
7388{
7389	if (ptdesc->ptl)
7390		kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
7391}
7392#endif
7393
/*
 * Called before walking the page tables of @vma: for a hugetlb VMA the
 * hugetlb VMA lock is taken for read; other VMAs need nothing.
 */
void vma_pgtable_walk_begin(struct vm_area_struct *vma)
{
	if (!is_vm_hugetlb_page(vma))
		return;

	hugetlb_vma_lock_read(vma);
}
7399
/*
 * Called after walking the page tables of @vma: for a hugetlb VMA the
 * read side of the hugetlb VMA lock is released; other VMAs need nothing.
 */
void vma_pgtable_walk_end(struct vm_area_struct *vma)
{
	if (!is_vm_hugetlb_page(vma))
		return;

	hugetlb_vma_unlock_read(vma);
}