mm/madvise.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / madvise.c
at master 61 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *	linux/mm/madvise.c
   4 *
   5 * Copyright (C) 1999  Linus Torvalds
   6 * Copyright (C) 2002  Christoph Hellwig
   7 */
   8
   9#include <linux/mman.h>
  10#include <linux/pagemap.h>
  11#include <linux/syscalls.h>
  12#include <linux/mempolicy.h>
  13#include <linux/page-isolation.h>
  14#include <linux/page_idle.h>
  15#include <linux/userfaultfd_k.h>
  16#include <linux/hugetlb.h>
  17#include <linux/falloc.h>
  18#include <linux/fadvise.h>
  19#include <linux/sched.h>
  20#include <linux/sched/mm.h>
  21#include <linux/mm_inline.h>
  22#include <linux/mmu_context.h>
  23#include <linux/string.h>
  24#include <linux/uio.h>
  25#include <linux/ksm.h>
  26#include <linux/fs.h>
  27#include <linux/file.h>
  28#include <linux/blkdev.h>
  29#include <linux/backing-dev.h>
  30#include <linux/pagewalk.h>
  31#include <linux/swap.h>
  32#include <linux/leafops.h>
  33#include <linux/shmem_fs.h>
  34#include <linux/mmu_notifier.h>
  35
  36#include <asm/tlb.h>
  37
  38#include "internal.h"
  39#include "swap.h"
  40
  41#define __MADV_SET_ANON_VMA_NAME (-1)
  42
  43/*
  44 * Maximum number of attempts we make to install guard pages before we give up
  45 * and return -ERESTARTNOINTR to have userspace try again.
  46 */
  47#define MAX_MADVISE_GUARD_RETRIES 3
  48
  49struct madvise_walk_private {
  50	struct mmu_gather *tlb;
  51	bool pageout;
  52};
  53
  54enum madvise_lock_mode {
  55	MADVISE_NO_LOCK,
  56	MADVISE_MMAP_READ_LOCK,
  57	MADVISE_MMAP_WRITE_LOCK,
  58	MADVISE_VMA_READ_LOCK,
  59};
  60
  61struct madvise_behavior_range {
  62	unsigned long start;
  63	unsigned long end;
  64};
  65
  66struct madvise_behavior {
  67	struct mm_struct *mm;
  68	int behavior;
  69	struct mmu_gather *tlb;
  70	enum madvise_lock_mode lock_mode;
  71	struct anon_vma_name *anon_name;
  72
  73	/*
  74	 * The range over which the behaviour is currently being applied. If
  75	 * traversing multiple VMAs, this is updated for each.
  76	 */
  77	struct madvise_behavior_range range;
  78	/* The VMA and VMA preceding it (if applicable) currently targeted. */
  79	struct vm_area_struct *prev;
  80	struct vm_area_struct *vma;
  81	bool lock_dropped;
  82};
  83
  84#ifdef CONFIG_ANON_VMA_NAME
  85static int madvise_walk_vmas(struct madvise_behavior *madv_behavior);
  86
  87struct anon_vma_name *anon_vma_name_alloc(const char *name)
  88{
  89	struct anon_vma_name *anon_name;
  90	size_t count;
  91
  92	/* Add 1 for NUL terminator at the end of the anon_name->name */
  93	count = strlen(name) + 1;
  94	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
  95	if (anon_name) {
  96		kref_init(&anon_name->kref);
  97		memcpy(anon_name->name, name, count);
  98	}
  99
 100	return anon_name;
 101}
 102
 103void anon_vma_name_free(struct kref *kref)
 104{
 105	struct anon_vma_name *anon_name =
 106			container_of(kref, struct anon_vma_name, kref);
 107	kfree(anon_name);
 108}
 109
 110struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
 111{
 112	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
 113		vma_assert_locked(vma);
 114
 115	return vma->anon_name;
 116}
 117
 118/* mmap_lock should be write-locked */
 119static int replace_anon_vma_name(struct vm_area_struct *vma,
 120				 struct anon_vma_name *anon_name)
 121{
 122	struct anon_vma_name *orig_name = anon_vma_name(vma);
 123
 124	if (!anon_name) {
 125		vma->anon_name = NULL;
 126		anon_vma_name_put(orig_name);
 127		return 0;
 128	}
 129
 130	if (anon_vma_name_eq(orig_name, anon_name))
 131		return 0;
 132
 133	vma->anon_name = anon_vma_name_reuse(anon_name);
 134	anon_vma_name_put(orig_name);
 135
 136	return 0;
 137}
 138#else /* CONFIG_ANON_VMA_NAME */
 139static int replace_anon_vma_name(struct vm_area_struct *vma,
 140				 struct anon_vma_name *anon_name)
 141{
 142	if (anon_name)
 143		return -EINVAL;
 144
 145	return 0;
 146}
 147#endif /* CONFIG_ANON_VMA_NAME */
 148/*
 149 * Update the vm_flags or anon_name on region of a vma, splitting it or merging
 150 * it as necessary. Must be called with mmap_lock held for writing.
 151 */
 152static int madvise_update_vma(vm_flags_t new_flags,
 153		struct madvise_behavior *madv_behavior)
 154{
 155	struct vm_area_struct *vma = madv_behavior->vma;
 156	struct madvise_behavior_range *range = &madv_behavior->range;
 157	struct anon_vma_name *anon_name = madv_behavior->anon_name;
 158	bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME;
 159	VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
 160
 161	if (new_flags == vma->vm_flags && (!set_new_anon_name ||
 162			anon_vma_name_eq(anon_vma_name(vma), anon_name)))
 163		return 0;
 164
 165	if (set_new_anon_name)
 166		vma = vma_modify_name(&vmi, madv_behavior->prev, vma,
 167			range->start, range->end, anon_name);
 168	else
 169		vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
 170			range->start, range->end, &new_flags);
 171
 172	if (IS_ERR(vma))
 173		return PTR_ERR(vma);
 174
 175	madv_behavior->vma = vma;
 176
 177	/* vm_flags is protected by the mmap_lock held in write mode. */
 178	vma_start_write(vma);
 179	vm_flags_reset(vma, new_flags);
 180	if (set_new_anon_name)
 181		return replace_anon_vma_name(vma, anon_name);
 182
 183	return 0;
 184}
 185
 186#ifdef CONFIG_SWAP
 187static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 188		unsigned long end, struct mm_walk *walk)
 189{
 190	struct vm_area_struct *vma = walk->private;
 191	struct swap_iocb *splug = NULL;
 192	pte_t *ptep = NULL;
 193	spinlock_t *ptl;
 194	unsigned long addr;
 195
 196	for (addr = start; addr < end; addr += PAGE_SIZE) {
 197		pte_t pte;
 198		softleaf_t entry;
 199		struct folio *folio;
 200
 201		if (!ptep++) {
 202			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 203			if (!ptep)
 204				break;
 205		}
 206
 207		pte = ptep_get(ptep);
 208		entry = softleaf_from_pte(pte);
 209		if (unlikely(!softleaf_is_swap(entry)))
 210			continue;
 211
 212		pte_unmap_unlock(ptep, ptl);
 213		ptep = NULL;
 214
 215		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
 216					     vma, addr, &splug);
 217		if (folio)
 218			folio_put(folio);
 219	}
 220
 221	if (ptep)
 222		pte_unmap_unlock(ptep, ptl);
 223	swap_read_unplug(splug);
 224	cond_resched();
 225
 226	return 0;
 227}
 228
 229static const struct mm_walk_ops swapin_walk_ops = {
 230	.pmd_entry		= swapin_walk_pmd_entry,
 231	.walk_lock		= PGWALK_RDLOCK,
 232};
 233
 234static void shmem_swapin_range(struct vm_area_struct *vma,
 235		unsigned long start, unsigned long end,
 236		struct address_space *mapping)
 237{
 238	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
 239	pgoff_t end_index = linear_page_index(vma, end) - 1;
 240	struct folio *folio;
 241	struct swap_iocb *splug = NULL;
 242
 243	rcu_read_lock();
 244	xas_for_each(&xas, folio, end_index) {
 245		unsigned long addr;
 246		swp_entry_t entry;
 247
 248		if (!xa_is_value(folio))
 249			continue;
 250		entry = radix_to_swp_entry(folio);
 251		/* There might be swapin error entries in shmem mapping. */
 252		if (!softleaf_is_swap(entry))
 253			continue;
 254
 255		addr = vma->vm_start +
 256			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
 257		xas_pause(&xas);
 258		rcu_read_unlock();
 259
 260		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
 261					     vma, addr, &splug);
 262		if (folio)
 263			folio_put(folio);
 264
 265		rcu_read_lock();
 266	}
 267	rcu_read_unlock();
 268	swap_read_unplug(splug);
 269}
 270#endif		/* CONFIG_SWAP */
 271
 272static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior)
 273{
 274	VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
 275	madv_behavior->lock_dropped = true;
 276}
 277
 278/*
 279 * Schedule all required I/O operations.  Do not wait for completion.
 280 */
 281static long madvise_willneed(struct madvise_behavior *madv_behavior)
 282{
 283	struct vm_area_struct *vma = madv_behavior->vma;
 284	struct mm_struct *mm = madv_behavior->mm;
 285	struct file *file = vma->vm_file;
 286	unsigned long start = madv_behavior->range.start;
 287	unsigned long end = madv_behavior->range.end;
 288	loff_t offset;
 289
 290#ifdef CONFIG_SWAP
 291	if (!file) {
 292		walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
 293		lru_add_drain(); /* Push any new pages onto the LRU now */
 294		return 0;
 295	}
 296
 297	if (shmem_mapping(file->f_mapping)) {
 298		shmem_swapin_range(vma, start, end, file->f_mapping);
 299		lru_add_drain(); /* Push any new pages onto the LRU now */
 300		return 0;
 301	}
 302#else
 303	if (!file)
 304		return -EBADF;
 305#endif
 306
 307	if (IS_DAX(file_inode(file))) {
 308		/* no bad return value, but ignore advice */
 309		return 0;
 310	}
 311
 312	/*
 313	 * Filesystem's fadvise may need to take various locks.  We need to
 314	 * explicitly grab a reference because the vma (and hence the
 315	 * vma's reference to the file) can go away as soon as we drop
 316	 * mmap_lock.
 317	 */
 318	mark_mmap_lock_dropped(madv_behavior);
 319	get_file(file);
 320	offset = (loff_t)(start - vma->vm_start)
 321			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 322	mmap_read_unlock(mm);
 323	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
 324	fput(file);
 325	mmap_read_lock(mm);
 326	return 0;
 327}
 328
 329static inline bool can_do_file_pageout(struct vm_area_struct *vma)
 330{
 331	if (!vma->vm_file)
 332		return false;
 333	/*
 334	 * paging out pagecache only for non-anonymous mappings that correspond
 335	 * to the files the calling process could (if tried) open for writing;
 336	 * otherwise we'd be including shared non-exclusive mappings, which
 337	 * opens a side channel.
 338	 */
 339	return inode_owner_or_capable(&nop_mnt_idmap,
 340				      file_inode(vma->vm_file)) ||
 341	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 342}
 343
 344static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
 345					  struct folio *folio, pte_t *ptep,
 346					  pte_t *ptentp)
 347{
 348	int max_nr = (end - addr) / PAGE_SIZE;
 349
 350	return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr,
 351				     FPB_MERGE_YOUNG_DIRTY);
 352}
 353
 354static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 355				unsigned long addr, unsigned long end,
 356				struct mm_walk *walk)
 357{
 358	struct madvise_walk_private *private = walk->private;
 359	struct mmu_gather *tlb = private->tlb;
 360	bool pageout = private->pageout;
 361	struct mm_struct *mm = tlb->mm;
 362	struct vm_area_struct *vma = walk->vma;
 363	pte_t *start_pte, *pte, ptent;
 364	spinlock_t *ptl;
 365	struct folio *folio = NULL;
 366	LIST_HEAD(folio_list);
 367	bool pageout_anon_only_filter;
 368	unsigned int batch_count = 0;
 369	int nr;
 370
 371	if (fatal_signal_pending(current))
 372		return -EINTR;
 373
 374	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
 375					!can_do_file_pageout(vma);
 376
 377#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 378	if (pmd_trans_huge(*pmd)) {
 379		pmd_t orig_pmd;
 380		unsigned long next = pmd_addr_end(addr, end);
 381
 382		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 383		ptl = pmd_trans_huge_lock(pmd, vma);
 384		if (!ptl)
 385			return 0;
 386
 387		orig_pmd = *pmd;
 388		if (is_huge_zero_pmd(orig_pmd))
 389			goto huge_unlock;
 390
 391		if (unlikely(!pmd_present(orig_pmd))) {
 392			VM_BUG_ON(thp_migration_supported() &&
 393					!pmd_is_migration_entry(orig_pmd));
 394			goto huge_unlock;
 395		}
 396
 397		folio = pmd_folio(orig_pmd);
 398
 399		/* Do not interfere with other mappings of this folio */
 400		if (folio_maybe_mapped_shared(folio))
 401			goto huge_unlock;
 402
 403		if (pageout_anon_only_filter && !folio_test_anon(folio))
 404			goto huge_unlock;
 405
 406		if (next - addr != HPAGE_PMD_SIZE) {
 407			int err;
 408
 409			folio_get(folio);
 410			spin_unlock(ptl);
 411			folio_lock(folio);
 412			err = split_folio(folio);
 413			folio_unlock(folio);
 414			folio_put(folio);
 415			if (!err)
 416				goto regular_folio;
 417			return 0;
 418		}
 419
 420		if (!pageout && pmd_young(orig_pmd)) {
 421			pmdp_invalidate(vma, addr, pmd);
 422			orig_pmd = pmd_mkold(orig_pmd);
 423
 424			set_pmd_at(mm, addr, pmd, orig_pmd);
 425			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 426		}
 427
 428		folio_clear_referenced(folio);
 429		folio_test_clear_young(folio);
 430		if (folio_test_active(folio))
 431			folio_set_workingset(folio);
 432		if (pageout) {
 433			if (folio_isolate_lru(folio)) {
 434				if (folio_test_unevictable(folio))
 435					folio_putback_lru(folio);
 436				else
 437					list_add(&folio->lru, &folio_list);
 438			}
 439		} else
 440			folio_deactivate(folio);
 441huge_unlock:
 442		spin_unlock(ptl);
 443		if (pageout)
 444			reclaim_pages(&folio_list);
 445		return 0;
 446	}
 447
 448regular_folio:
 449#endif
 450	tlb_change_page_size(tlb, PAGE_SIZE);
 451restart:
 452	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 453	if (!start_pte)
 454		return 0;
 455	flush_tlb_batched_pending(mm);
 456	arch_enter_lazy_mmu_mode();
 457	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
 458		nr = 1;
 459		ptent = ptep_get(pte);
 460
 461		if (++batch_count == SWAP_CLUSTER_MAX) {
 462			batch_count = 0;
 463			if (need_resched()) {
 464				arch_leave_lazy_mmu_mode();
 465				pte_unmap_unlock(start_pte, ptl);
 466				cond_resched();
 467				goto restart;
 468			}
 469		}
 470
 471		if (pte_none(ptent))
 472			continue;
 473
 474		if (!pte_present(ptent))
 475			continue;
 476
 477		folio = vm_normal_folio(vma, addr, ptent);
 478		if (!folio || folio_is_zone_device(folio))
 479			continue;
 480
 481		/*
 482		 * If we encounter a large folio, only split it if it is not
 483		 * fully mapped within the range we are operating on. Otherwise
 484		 * leave it as is so that it can be swapped out whole. If we
 485		 * fail to split a folio, leave it in place and advance to the
 486		 * next pte in the range.
 487		 */
 488		if (folio_test_large(folio)) {
 489			nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
 490			if (nr < folio_nr_pages(folio)) {
 491				int err;
 492
 493				if (folio_maybe_mapped_shared(folio))
 494					continue;
 495				if (pageout_anon_only_filter && !folio_test_anon(folio))
 496					continue;
 497				if (!folio_trylock(folio))
 498					continue;
 499				folio_get(folio);
 500				arch_leave_lazy_mmu_mode();
 501				pte_unmap_unlock(start_pte, ptl);
 502				start_pte = NULL;
 503				err = split_folio(folio);
 504				folio_unlock(folio);
 505				folio_put(folio);
 506				start_pte = pte =
 507					pte_offset_map_lock(mm, pmd, addr, &ptl);
 508				if (!start_pte)
 509					break;
 510				flush_tlb_batched_pending(mm);
 511				arch_enter_lazy_mmu_mode();
 512				if (!err)
 513					nr = 0;
 514				continue;
 515			}
 516		}
 517
 518		/*
 519		 * Do not interfere with other mappings of this folio and
 520		 * non-LRU folio. If we have a large folio at this point, we
 521		 * know it is fully mapped so if its mapcount is the same as its
 522		 * number of pages, it must be exclusive.
 523		 */
 524		if (!folio_test_lru(folio) ||
 525		    folio_mapcount(folio) != folio_nr_pages(folio))
 526			continue;
 527
 528		if (pageout_anon_only_filter && !folio_test_anon(folio))
 529			continue;
 530
 531		if (!pageout && pte_young(ptent)) {
 532			clear_young_dirty_ptes(vma, addr, pte, nr,
 533					       CYDP_CLEAR_YOUNG);
 534			tlb_remove_tlb_entries(tlb, pte, nr, addr);
 535		}
 536
 537		/*
 538		 * We are deactivating a folio for accelerating reclaiming.
 539		 * VM couldn't reclaim the folio unless we clear PG_young.
 540		 * As a side effect, it makes confuse idle-page tracking
 541		 * because they will miss recent referenced history.
 542		 */
 543		folio_clear_referenced(folio);
 544		folio_test_clear_young(folio);
 545		if (folio_test_active(folio))
 546			folio_set_workingset(folio);
 547		if (pageout) {
 548			if (folio_isolate_lru(folio)) {
 549				if (folio_test_unevictable(folio))
 550					folio_putback_lru(folio);
 551				else
 552					list_add(&folio->lru, &folio_list);
 553			}
 554		} else
 555			folio_deactivate(folio);
 556	}
 557
 558	if (start_pte) {
 559		arch_leave_lazy_mmu_mode();
 560		pte_unmap_unlock(start_pte, ptl);
 561	}
 562	if (pageout)
 563		reclaim_pages(&folio_list);
 564	cond_resched();
 565
 566	return 0;
 567}
 568
 569static const struct mm_walk_ops cold_walk_ops = {
 570	.pmd_entry = madvise_cold_or_pageout_pte_range,
 571	.walk_lock = PGWALK_RDLOCK,
 572};
 573
 574static void madvise_cold_page_range(struct mmu_gather *tlb,
 575		struct madvise_behavior *madv_behavior)
 576
 577{
 578	struct vm_area_struct *vma = madv_behavior->vma;
 579	struct madvise_behavior_range *range = &madv_behavior->range;
 580	struct madvise_walk_private walk_private = {
 581		.pageout = false,
 582		.tlb = tlb,
 583	};
 584
 585	tlb_start_vma(tlb, vma);
 586	walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
 587			&walk_private);
 588	tlb_end_vma(tlb, vma);
 589}
 590
 591static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 592{
 593	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
 594}
 595
 596static long madvise_cold(struct madvise_behavior *madv_behavior)
 597{
 598	struct vm_area_struct *vma = madv_behavior->vma;
 599	struct mmu_gather tlb;
 600
 601	if (!can_madv_lru_vma(vma))
 602		return -EINVAL;
 603
 604	lru_add_drain();
 605	tlb_gather_mmu(&tlb, madv_behavior->mm);
 606	madvise_cold_page_range(&tlb, madv_behavior);
 607	tlb_finish_mmu(&tlb);
 608
 609	return 0;
 610}
 611
 612static void madvise_pageout_page_range(struct mmu_gather *tlb,
 613		struct vm_area_struct *vma,
 614		struct madvise_behavior_range *range)
 615{
 616	struct madvise_walk_private walk_private = {
 617		.pageout = true,
 618		.tlb = tlb,
 619	};
 620
 621	tlb_start_vma(tlb, vma);
 622	walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
 623			    &walk_private);
 624	tlb_end_vma(tlb, vma);
 625}
 626
 627static long madvise_pageout(struct madvise_behavior *madv_behavior)
 628{
 629	struct mmu_gather tlb;
 630	struct vm_area_struct *vma = madv_behavior->vma;
 631
 632	if (!can_madv_lru_vma(vma))
 633		return -EINVAL;
 634
 635	/*
 636	 * If the VMA belongs to a private file mapping, there can be private
 637	 * dirty pages which can be paged out if even this process is neither
 638	 * owner nor write capable of the file. We allow private file mappings
 639	 * further to pageout dirty anon pages.
 640	 */
 641	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
 642				(vma->vm_flags & VM_MAYSHARE)))
 643		return 0;
 644
 645	lru_add_drain();
 646	tlb_gather_mmu(&tlb, madv_behavior->mm);
 647	madvise_pageout_page_range(&tlb, vma, &madv_behavior->range);
 648	tlb_finish_mmu(&tlb);
 649
 650	return 0;
 651}
 652
 653static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 654				unsigned long end, struct mm_walk *walk)
 655
 656{
 657	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
 658	struct mmu_gather *tlb = walk->private;
 659	struct mm_struct *mm = tlb->mm;
 660	struct vm_area_struct *vma = walk->vma;
 661	spinlock_t *ptl;
 662	pte_t *start_pte, *pte, ptent;
 663	struct folio *folio;
 664	int nr_swap = 0;
 665	unsigned long next;
 666	int nr, max_nr;
 667
 668	next = pmd_addr_end(addr, end);
 669	if (pmd_trans_huge(*pmd))
 670		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 671			return 0;
 672
 673	tlb_change_page_size(tlb, PAGE_SIZE);
 674	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 675	if (!start_pte)
 676		return 0;
 677	flush_tlb_batched_pending(mm);
 678	arch_enter_lazy_mmu_mode();
 679	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
 680		nr = 1;
 681		ptent = ptep_get(pte);
 682
 683		if (pte_none(ptent))
 684			continue;
 685		/*
 686		 * If the pte has swp_entry, just clear page table to
 687		 * prevent swap-in which is more expensive rather than
 688		 * (page allocation + zeroing).
 689		 */
 690		if (!pte_present(ptent)) {
 691			softleaf_t entry = softleaf_from_pte(ptent);
 692
 693			if (softleaf_is_swap(entry)) {
 694				max_nr = (end - addr) / PAGE_SIZE;
 695				nr = swap_pte_batch(pte, max_nr, ptent);
 696				nr_swap -= nr;
 697				free_swap_and_cache_nr(entry, nr);
 698				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
 699			} else if (softleaf_is_hwpoison(entry) ||
 700				   softleaf_is_poison_marker(entry)) {
 701				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 702			}
 703			continue;
 704		}
 705
 706		folio = vm_normal_folio(vma, addr, ptent);
 707		if (!folio || folio_is_zone_device(folio))
 708			continue;
 709
 710		/*
 711		 * If we encounter a large folio, only split it if it is not
 712		 * fully mapped within the range we are operating on. Otherwise
 713		 * leave it as is so that it can be marked as lazyfree. If we
 714		 * fail to split a folio, leave it in place and advance to the
 715		 * next pte in the range.
 716		 */
 717		if (folio_test_large(folio)) {
 718			nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent);
 719			if (nr < folio_nr_pages(folio)) {
 720				int err;
 721
 722				if (folio_maybe_mapped_shared(folio))
 723					continue;
 724				if (!folio_trylock(folio))
 725					continue;
 726				folio_get(folio);
 727				arch_leave_lazy_mmu_mode();
 728				pte_unmap_unlock(start_pte, ptl);
 729				start_pte = NULL;
 730				err = split_folio(folio);
 731				folio_unlock(folio);
 732				folio_put(folio);
 733				pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 734				start_pte = pte;
 735				if (!start_pte)
 736					break;
 737				flush_tlb_batched_pending(mm);
 738				arch_enter_lazy_mmu_mode();
 739				if (!err)
 740					nr = 0;
 741				continue;
 742			}
 743		}
 744
 745		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
 746			if (!folio_trylock(folio))
 747				continue;
 748			/*
 749			 * If we have a large folio at this point, we know it is
 750			 * fully mapped so if its mapcount is the same as its
 751			 * number of pages, it must be exclusive.
 752			 */
 753			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
 754				folio_unlock(folio);
 755				continue;
 756			}
 757
 758			if (folio_test_swapcache(folio) &&
 759			    !folio_free_swap(folio)) {
 760				folio_unlock(folio);
 761				continue;
 762			}
 763
 764			folio_clear_dirty(folio);
 765			folio_unlock(folio);
 766		}
 767
 768		if (pte_young(ptent) || pte_dirty(ptent)) {
 769			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
 770			tlb_remove_tlb_entries(tlb, pte, nr, addr);
 771		}
 772		folio_mark_lazyfree(folio);
 773	}
 774
 775	if (nr_swap)
 776		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
 777	if (start_pte) {
 778		arch_leave_lazy_mmu_mode();
 779		pte_unmap_unlock(start_pte, ptl);
 780	}
 781	cond_resched();
 782
 783	return 0;
 784}
 785
 786static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
 787{
 788	switch (mode) {
 789	case MADVISE_VMA_READ_LOCK:
 790		return PGWALK_VMA_RDLOCK_VERIFY;
 791	case MADVISE_MMAP_READ_LOCK:
 792		return PGWALK_RDLOCK;
 793	default:
 794		/* Other modes don't require fixing up the walk_lock */
 795		WARN_ON_ONCE(1);
 796		return PGWALK_RDLOCK;
 797	}
 798}
 799
 800static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
 801{
 802	struct mm_struct *mm = madv_behavior->mm;
 803	struct vm_area_struct *vma = madv_behavior->vma;
 804	unsigned long start_addr = madv_behavior->range.start;
 805	unsigned long end_addr = madv_behavior->range.end;
 806	struct mmu_notifier_range range;
 807	struct mmu_gather *tlb = madv_behavior->tlb;
 808	struct mm_walk_ops walk_ops = {
 809		.pmd_entry		= madvise_free_pte_range,
 810	};
 811
 812	/* MADV_FREE works for only anon vma at the moment */
 813	if (!vma_is_anonymous(vma))
 814		return -EINVAL;
 815
 816	range.start = max(vma->vm_start, start_addr);
 817	if (range.start >= vma->vm_end)
 818		return -EINVAL;
 819	range.end = min(vma->vm_end, end_addr);
 820	if (range.end <= vma->vm_start)
 821		return -EINVAL;
 822	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 823				range.start, range.end);
 824
 825	lru_add_drain();
 826	update_hiwater_rss(mm);
 827
 828	mmu_notifier_invalidate_range_start(&range);
 829	tlb_start_vma(tlb, vma);
 830	walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
 831	walk_page_range_vma(vma, range.start, range.end,
 832			&walk_ops, tlb);
 833	tlb_end_vma(tlb, vma);
 834	mmu_notifier_invalidate_range_end(&range);
 835	return 0;
 836}
 837
 838/*
 839 * Application no longer needs these pages.  If the pages are dirty,
 840 * it's OK to just throw them away.  The app will be more careful about
 841 * data it wants to keep.  Be sure to free swap resources too.  The
 842 * zap_page_range_single call sets things up for shrink_active_list to actually
 843 * free these pages later if no one else has touched them in the meantime,
 844 * although we could add these pages to a global reuse list for
 845 * shrink_active_list to pick up before reclaiming other pages.
 846 *
 847 * NB: This interface discards data rather than pushes it out to swap,
 848 * as some implementations do.  This has performance implications for
 849 * applications like large transactional databases which want to discard
 850 * pages in anonymous maps after committing to backing store the data
 851 * that was kept in them.  There is no reason to write this data out to
 852 * the swap area if the application is discarding it.
 853 *
 854 * An interface that causes the system to free clean pages and flush
 855 * dirty pages is already available as msync(MS_INVALIDATE).
 856 */
 857static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
 858
 859{
 860	struct madvise_behavior_range *range = &madv_behavior->range;
 861	struct zap_details details = {
 862		.reclaim_pt = true,
 863		.even_cows = true,
 864	};
 865
 866	zap_page_range_single_batched(
 867			madv_behavior->tlb, madv_behavior->vma, range->start,
 868			range->end - range->start, &details);
 869	return 0;
 870}
 871
 872static
 873bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
 874{
 875	struct vm_area_struct *vma = madv_behavior->vma;
 876	int behavior = madv_behavior->behavior;
 877	struct madvise_behavior_range *range = &madv_behavior->range;
 878
 879	if (!is_vm_hugetlb_page(vma)) {
 880		unsigned int forbidden = VM_PFNMAP;
 881
 882		if (behavior != MADV_DONTNEED_LOCKED)
 883			forbidden |= VM_LOCKED;
 884
 885		return !(vma->vm_flags & forbidden);
 886	}
 887
 888	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
 889		return false;
 890	if (range->start & ~huge_page_mask(hstate_vma(vma)))
 891		return false;
 892
 893	/*
 894	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
 895	 * boundaries, and may be unaware that this VMA uses huge pages.
 896	 * Avoid unexpected data loss by rounding down the number of
 897	 * huge pages freed.
 898	 */
 899	range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma)));
 900
 901	return true;
 902}
 903
 904static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
 905{
 906	struct mm_struct *mm = madv_behavior->mm;
 907	struct madvise_behavior_range *range = &madv_behavior->range;
 908	int behavior = madv_behavior->behavior;
 909
 910	if (!madvise_dontneed_free_valid_vma(madv_behavior))
 911		return -EINVAL;
 912
 913	if (range->start == range->end)
 914		return 0;
 915
 916	if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
 917		struct vm_area_struct *vma;
 918
 919		mark_mmap_lock_dropped(madv_behavior);
 920		mmap_read_lock(mm);
 921		madv_behavior->vma = vma = vma_lookup(mm, range->start);
 922		if (!vma)
 923			return -ENOMEM;
 924		/*
 925		 * Potential end adjustment for hugetlb vma is OK as
 926		 * the check below keeps end within vma.
 927		 */
 928		if (!madvise_dontneed_free_valid_vma(madv_behavior))
 929			return -EINVAL;
 930		if (range->end > vma->vm_end) {
 931			/*
 932			 * Don't fail if end > vma->vm_end. If the old
 933			 * vma was split while the mmap_lock was
 934			 * released the effect of the concurrent
 935			 * operation may not cause madvise() to
 936			 * have an undefined result. There may be an
 937			 * adjacent next vma that we'll walk
 938			 * next. userfaultfd_remove() will generate an
 939			 * UFFD_EVENT_REMOVE repetition on the
 940			 * end-vma->vm_end range, but the manager can
 941			 * handle a repetition fine.
 942			 */
 943			range->end = vma->vm_end;
 944		}
 945		/*
 946		 * If the memory region between start and end was
 947		 * originally backed by 4kB pages and then remapped to
 948		 * be backed by hugepages while mmap_lock was dropped,
 949		 * the adjustment for hugetlb vma above may have rounded
 950		 * end down to the start address.
 951		 */
 952		if (range->start == range->end)
 953			return 0;
 954		VM_WARN_ON(range->start > range->end);
 955	}
 956
 957	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
 958		return madvise_dontneed_single_vma(madv_behavior);
 959	else if (behavior == MADV_FREE)
 960		return madvise_free_single_vma(madv_behavior);
 961	else
 962		return -EINVAL;
 963}
 964
 965static long madvise_populate(struct madvise_behavior *madv_behavior)
 966{
 967	struct mm_struct *mm = madv_behavior->mm;
 968	const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE;
 969	int locked = 1;
 970	unsigned long start = madv_behavior->range.start;
 971	unsigned long end = madv_behavior->range.end;
 972	long pages;
 973
 974	while (start < end) {
 975		/* Populate (prefault) page tables readable/writable. */
 976		pages = faultin_page_range(mm, start, end, write, &locked);
 977		if (!locked) {
 978			mmap_read_lock(mm);
 979			locked = 1;
 980		}
 981		if (pages < 0) {
 982			switch (pages) {
 983			case -EINTR:
 984				return -EINTR;
 985			case -EINVAL: /* Incompatible mappings / permissions. */
 986				return -EINVAL;
 987			case -EHWPOISON:
 988				return -EHWPOISON;
 989			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
 990				return -EFAULT;
 991			default:
 992				pr_warn_once("%s: unhandled return value: %ld\n",
 993					     __func__, pages);
 994				fallthrough;
 995			case -ENOMEM: /* No VMA or out of memory. */
 996				return -ENOMEM;
 997			}
 998		}
 999		start += pages * PAGE_SIZE;
1000	}
1001	return 0;
1002}
1003
1004/*
1005 * Application wants to free up the pages and associated backing store.
1006 * This is effectively punching a hole into the middle of a file.
1007 */
1008static long madvise_remove(struct madvise_behavior *madv_behavior)
1009{
1010	loff_t offset;
1011	int error;
1012	struct file *f;
1013	struct mm_struct *mm = madv_behavior->mm;
1014	struct vm_area_struct *vma = madv_behavior->vma;
1015	unsigned long start = madv_behavior->range.start;
1016	unsigned long end = madv_behavior->range.end;
1017
1018	mark_mmap_lock_dropped(madv_behavior);
1019
1020	if (vma->vm_flags & VM_LOCKED)
1021		return -EINVAL;
1022
1023	f = vma->vm_file;
1024
1025	if (!f || !f->f_mapping || !f->f_mapping->host) {
1026			return -EINVAL;
1027	}
1028
1029	if (!vma_is_shared_maywrite(vma))
1030		return -EACCES;
1031
1032	offset = (loff_t)(start - vma->vm_start)
1033			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
1034
1035	/*
1036	 * Filesystem's fallocate may need to take i_rwsem.  We need to
1037	 * explicitly grab a reference because the vma (and hence the
1038	 * vma's reference to the file) can go away as soon as we drop
1039	 * mmap_lock.
1040	 */
1041	get_file(f);
1042	if (userfaultfd_remove(vma, start, end)) {
1043		/* mmap_lock was not released by userfaultfd_remove() */
1044		mmap_read_unlock(mm);
1045	}
1046	error = vfs_fallocate(f,
1047				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1048				offset, end - start);
1049	fput(f);
1050	mmap_read_lock(mm);
1051	return error;
1052}
1053
1054static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
1055{
1056	vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
1057
1058	/*
1059	 * A user could lock after setting a guard range but that's fine, as
1060	 * they'd not be able to fault in. The issue arises when we try to zap
1061	 * existing locked VMAs. We don't want to do that.
1062	 */
1063	if (!allow_locked)
1064		disallowed |= VM_LOCKED;
1065
1066	return !(vma->vm_flags & disallowed);
1067}
1068
1069static bool is_guard_pte_marker(pte_t ptent)
1070{
1071	const softleaf_t entry = softleaf_from_pte(ptent);
1072
1073	return softleaf_is_guard_marker(entry);
1074}
1075
1076static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
1077				   unsigned long next, struct mm_walk *walk)
1078{
1079	pud_t pudval = pudp_get(pud);
1080
1081	/* If huge return >0 so we abort the operation + zap. */
1082	return pud_trans_huge(pudval);
1083}
1084
1085static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
1086				   unsigned long next, struct mm_walk *walk)
1087{
1088	pmd_t pmdval = pmdp_get(pmd);
1089
1090	/* If huge return >0 so we abort the operation + zap. */
1091	return pmd_trans_huge(pmdval);
1092}
1093
1094static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
1095				   unsigned long next, struct mm_walk *walk)
1096{
1097	pte_t pteval = ptep_get(pte);
1098	unsigned long *nr_pages = (unsigned long *)walk->private;
1099
1100	/* If there is already a guard page marker, we have nothing to do. */
1101	if (is_guard_pte_marker(pteval)) {
1102		(*nr_pages)++;
1103
1104		return 0;
1105	}
1106
1107	/* If populated return >0 so we abort the operation + zap. */
1108	return 1;
1109}
1110
1111static int guard_install_set_pte(unsigned long addr, unsigned long next,
1112				 pte_t *ptep, struct mm_walk *walk)
1113{
1114	unsigned long *nr_pages = (unsigned long *)walk->private;
1115
1116	/* Simply install a PTE marker, this causes segfault on access. */
1117	*ptep = make_pte_marker(PTE_MARKER_GUARD);
1118	(*nr_pages)++;
1119
1120	return 0;
1121}
1122
/*
 * MADV_GUARD_INSTALL: place guard markers over madv_behavior->range within
 * madv_behavior->vma.
 *
 * Returns 0 on success, -EINVAL if is_valid_guard_vma() rejects the VMA, a
 * negative error from anon_vma_prepare() or from the page table walk, or the
 * value of restart_syscall() when MAX_MADVISE_GUARD_RETRIES attempts in a row
 * each found something that had to be zapped.
 */
static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct madvise_behavior_range *range = &madv_behavior->range;
	struct mm_walk_ops walk_ops = {
		.pud_entry	= guard_install_pud_entry,
		.pmd_entry	= guard_install_pmd_entry,
		.pte_entry	= guard_install_pte_entry,
		.install_pte	= guard_install_set_pte,
		.walk_lock	= get_walk_lock(madv_behavior->lock_mode),
	};
	long err;
	int i;

	/* Installing may zap existing pages, so mlock()'d VMAs are refused. */
	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
		return -EINVAL;

	/*
	 * Set atomically under read lock. All pertinent readers will need to
	 * acquire an mmap/VMA write lock to read it. All remaining readers may
	 * or may not see the flag set, but we don't care.
	 */
	vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);

	/*
	 * If anonymous and we are establishing page tables the VMA ought to
	 * have an anon_vma associated with it.
	 *
	 * We will hold an mmap read lock if this is necessary, this is checked
	 * as part of the VMA lock logic.
	 */
	if (vma_is_anonymous(vma)) {
		VM_WARN_ON_ONCE(!vma->anon_vma &&
				madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK);

		err = anon_vma_prepare(vma);
		if (err)
			return err;
	}

	/*
	 * Optimistically try to install the guard marker pages first. If any
	 * non-guard pages or THP huge pages are encountered, give up and zap
	 * the range before trying again.
	 *
	 * We try a few times before giving up and releasing back to userland to
	 * loop around, releasing locks in the process to avoid contention.
	 *
	 * This would only happen due to races with e.g. page faults or
	 * khugepaged.
	 *
	 * In most cases we should simply install the guard markers immediately
	 * with no zap or looping.
	 */
	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
		/* Count of guard pages seen or installed during this attempt. */
		unsigned long nr_pages = 0;

		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
		if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK)
			err = walk_page_range_vma_unsafe(madv_behavior->vma,
					range->start, range->end, &walk_ops,
					&nr_pages);
		else
			err = walk_page_range_mm_unsafe(vma->vm_mm, range->start,
					range->end, &walk_ops, &nr_pages);
		if (err < 0)
			return err;

		if (err == 0) {
			unsigned long nr_expected_pages =
				PHYS_PFN(range->end - range->start);

			/* A clean walk should have covered every page in range. */
			VM_WARN_ON(nr_pages != nr_expected_pages);
			return 0;
		}

		/*
		 * OK some of the range have non-guard pages mapped, zap
		 * them. This leaves existing guard pages in place.
		 */
		zap_page_range_single(vma, range->start,
				range->end - range->start, NULL);
	}

	/*
	 * We were unable to install the guard pages, return to userspace and
	 * immediately retry, relieving lock contention.
	 */
	return restart_syscall();
}
1213
1214static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
1215				  unsigned long next, struct mm_walk *walk)
1216{
1217	pud_t pudval = pudp_get(pud);
1218
1219	/* If huge, cannot have guard pages present, so no-op - skip. */
1220	if (pud_trans_huge(pudval))
1221		walk->action = ACTION_CONTINUE;
1222
1223	return 0;
1224}
1225
1226static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
1227				  unsigned long next, struct mm_walk *walk)
1228{
1229	pmd_t pmdval = pmdp_get(pmd);
1230
1231	/* If huge, cannot have guard pages present, so no-op - skip. */
1232	if (pmd_trans_huge(pmdval))
1233		walk->action = ACTION_CONTINUE;
1234
1235	return 0;
1236}
1237
1238static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
1239				  unsigned long next, struct mm_walk *walk)
1240{
1241	pte_t ptent = ptep_get(pte);
1242
1243	if (is_guard_pte_marker(ptent)) {
1244		/* Simply clear the PTE marker. */
1245		pte_clear_not_present_full(walk->mm, addr, pte, false);
1246		update_mmu_cache(walk->vma, addr, pte);
1247	}
1248
1249	return 0;
1250}
1251
1252static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
1253{
1254	struct vm_area_struct *vma = madv_behavior->vma;
1255	struct madvise_behavior_range *range = &madv_behavior->range;
1256	struct mm_walk_ops wallk_ops = {
1257		.pud_entry = guard_remove_pud_entry,
1258		.pmd_entry = guard_remove_pmd_entry,
1259		.pte_entry = guard_remove_pte_entry,
1260		.walk_lock = get_walk_lock(madv_behavior->lock_mode),
1261	};
1262
1263	/*
1264	 * We're ok with removing guards in mlock()'d ranges, as this is a
1265	 * non-destructive action.
1266	 */
1267	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
1268		return -EINVAL;
1269
1270	return walk_page_range_vma(vma, range->start, range->end,
1271				   &wallk_ops, NULL);
1272}
1273
1274#ifdef CONFIG_64BIT
1275/* Does the madvise operation result in discarding of mapped data? */
1276static bool is_discard(int behavior)
1277{
1278	switch (behavior) {
1279	case MADV_FREE:
1280	case MADV_DONTNEED:
1281	case MADV_DONTNEED_LOCKED:
1282	case MADV_REMOVE:
1283	case MADV_DONTFORK:
1284	case MADV_WIPEONFORK:
1285	case MADV_GUARD_INSTALL:
1286		return true;
1287	}
1288
1289	return false;
1290}
1291
1292/*
1293 * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
1294 * circumstances - discarding of data from read-only anonymous SEALED mappings.
1295 *
1296 * This is because users cannot trivally discard data from these VMAs, and may
1297 * only do so via an appropriate madvise() call.
1298 */
1299static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
1300{
1301	struct vm_area_struct *vma = madv_behavior->vma;
1302
1303	/* If the VMA isn't sealed we're good. */
1304	if (!vma_is_sealed(vma))
1305		return true;
1306
1307	/* For a sealed VMA, we only care about discard operations. */
1308	if (!is_discard(madv_behavior->behavior))
1309		return true;
1310
1311	/*
1312	 * We explicitly permit all file-backed mappings, whether MAP_SHARED or
1313	 * MAP_PRIVATE.
1314	 *
1315	 * The latter causes some complications. Because now, one can mmap()
1316	 * read/write a MAP_PRIVATE mapping, write to it, then mprotect()
1317	 * read-only, mseal() and a discard will be permitted.
1318	 *
1319	 * However, in order to avoid issues with potential use of madvise(...,
1320	 * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being,
1321	 * permit this.
1322	 */
1323	if (!vma_is_anonymous(vma))
1324		return true;
1325
1326	/* If the user could write to the mapping anyway, then this is fine. */
1327	if ((vma->vm_flags & VM_WRITE) &&
1328	    arch_vma_access_permitted(vma, /* write= */ true,
1329			/* execute= */ false, /* foreign= */ false))
1330		return true;
1331
1332	/* Otherwise, we are not permitted to perform this operation. */
1333	return false;
1334}
1335#else
/*
 * !CONFIG_64BIT variant: the sealed-VMA check is compiled out, so every
 * madvise operation is allowed.
 */
static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
{
	return true;
}
1340#endif
1341
/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 *
 * Behaviours that act on page contents are dispatched to their own handler
 * and return its result directly. The remaining behaviours only compute a new
 * vm_flags value and hand it to madvise_update_vma().
 *
 * Returns 0 on success or a negative errno: -EPERM if can_madvise_modify()
 * refuses the operation. On the flag-updating path (anything reaching the
 * "out" label) -ENOMEM is reported as -EAGAIN.
 */
static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
{
	int behavior = madv_behavior->behavior;
	struct vm_area_struct *vma = madv_behavior->vma;
	vm_flags_t new_flags = vma->vm_flags;
	struct madvise_behavior_range *range = &madv_behavior->range;
	int error;

	if (unlikely(!can_madvise_modify(madv_behavior)))
		return -EPERM;

	switch (behavior) {
	/* These behaviours have dedicated handlers and return from here. */
	case MADV_REMOVE:
		return madvise_remove(madv_behavior);
	case MADV_WILLNEED:
		return madvise_willneed(madv_behavior);
	case MADV_COLD:
		return madvise_cold(madv_behavior);
	case MADV_PAGEOUT:
		return madvise_pageout(madv_behavior);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(madv_behavior);
	case MADV_COLLAPSE:
		return madvise_collapse(vma, range->start, range->end,
			&madv_behavior->lock_dropped);
	case MADV_GUARD_INSTALL:
		return madvise_guard_install(madv_behavior);
	case MADV_GUARD_REMOVE:
		return madvise_guard_remove(madv_behavior);

	/* The below behaviours update VMAs via madvise_update_vma(). */

	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		/* VM_IO mappings may not have VM_DONTCOPY cleared. */
		if (new_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || new_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		/* VM_DROPPABLE mappings may not have VM_WIPEONFORK cleared. */
		if (new_flags & VM_DROPPABLE)
			return -EINVAL;
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		/*
		 * Refused for VM_SPECIAL mappings unless they are hugetlb, and
		 * for VM_DROPPABLE mappings.
		 */
		if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) ||
		    (new_flags & VM_DROPPABLE))
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		/* ksm_madvise() may adjust new_flags through the pointer. */
		error = ksm_madvise(vma, range->start, range->end,
				behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		/* hugepage_madvise() may adjust new_flags through the pointer. */
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case __MADV_SET_ANON_VMA_NAME:
		/* Only anonymous mappings can be named */
		if (vma->vm_file && !vma_is_anon_shmem(vma))
			return -EBADF;
		break;
	}

	/* This is a write operation.*/
	VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);

	error = madvise_update_vma(new_flags, madv_behavior);
out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}
1451
1452#ifdef CONFIG_MEMORY_FAILURE
1453/*
1454 * Error injection support for memory error handling.
1455 */
1456static int madvise_inject_error(struct madvise_behavior *madv_behavior)
1457{
1458	unsigned long size;
1459	unsigned long start = madv_behavior->range.start;
1460	unsigned long end = madv_behavior->range.end;
1461
1462	if (!capable(CAP_SYS_ADMIN))
1463		return -EPERM;
1464
1465	for (; start < end; start += size) {
1466		unsigned long pfn;
1467		struct page *page;
1468		int ret;
1469
1470		ret = get_user_pages_fast(start, 1, 0, &page);
1471		if (ret != 1)
1472			return ret;
1473		pfn = page_to_pfn(page);
1474
1475		/*
1476		 * When soft offlining hugepages, after migrating the page
1477		 * we dissolve it, therefore in the second loop "page" will
1478		 * no longer be a compound page.
1479		 */
1480		size = page_size(compound_head(page));
1481
1482		if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
1483			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
1484				 pfn, start);
1485			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1486		} else {
1487			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
1488				 pfn, start);
1489			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
1490			if (ret == -EOPNOTSUPP)
1491				ret = 0;
1492		}
1493
1494		if (ret)
1495			return ret;
1496	}
1497
1498	return 0;
1499}
1500
1501static bool is_memory_failure(struct madvise_behavior *madv_behavior)
1502{
1503	switch (madv_behavior->behavior) {
1504	case MADV_HWPOISON:
1505	case MADV_SOFT_OFFLINE:
1506		return true;
1507	default:
1508		return false;
1509	}
1510}
1511
1512#else
1513
/* !CONFIG_MEMORY_FAILURE stub: nothing is injected and 0 is returned. */
static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
	return 0;
}
1518
/* !CONFIG_MEMORY_FAILURE stub: no behaviour counts as error injection. */
static bool is_memory_failure(struct madvise_behavior *madv_behavior)
{
	return false;
}
1523
1524#endif	/* CONFIG_MEMORY_FAILURE */
1525
1526static bool
1527madvise_behavior_valid(int behavior)
1528{
1529	switch (behavior) {
1530	case MADV_DOFORK:
1531	case MADV_DONTFORK:
1532	case MADV_NORMAL:
1533	case MADV_SEQUENTIAL:
1534	case MADV_RANDOM:
1535	case MADV_REMOVE:
1536	case MADV_WILLNEED:
1537	case MADV_DONTNEED:
1538	case MADV_DONTNEED_LOCKED:
1539	case MADV_FREE:
1540	case MADV_COLD:
1541	case MADV_PAGEOUT:
1542	case MADV_POPULATE_READ:
1543	case MADV_POPULATE_WRITE:
1544#ifdef CONFIG_KSM
1545	case MADV_MERGEABLE:
1546	case MADV_UNMERGEABLE:
1547#endif
1548#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1549	case MADV_HUGEPAGE:
1550	case MADV_NOHUGEPAGE:
1551	case MADV_COLLAPSE:
1552#endif
1553	case MADV_DONTDUMP:
1554	case MADV_DODUMP:
1555	case MADV_WIPEONFORK:
1556	case MADV_KEEPONFORK:
1557	case MADV_GUARD_INSTALL:
1558	case MADV_GUARD_REMOVE:
1559#ifdef CONFIG_MEMORY_FAILURE
1560	case MADV_SOFT_OFFLINE:
1561	case MADV_HWPOISON:
1562#endif
1563		return true;
1564
1565	default:
1566		return false;
1567	}
1568}
1569
1570/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
1571static bool process_madvise_remote_valid(int behavior)
1572{
1573	switch (behavior) {
1574	case MADV_COLD:
1575	case MADV_PAGEOUT:
1576	case MADV_WILLNEED:
1577	case MADV_COLLAPSE:
1578		return true;
1579	default:
1580		return false;
1581	}
1582}
1583
1584/* Does this operation invoke anon_vma_prepare()? */
1585static bool prepares_anon_vma(int behavior)
1586{
1587	switch (behavior) {
1588	case MADV_GUARD_INSTALL:
1589		return true;
1590	default:
1591		return false;
1592	}
1593}
1594
1595/*
1596 * We have acquired a VMA read lock, is the VMA valid to be madvise'd under VMA
1597 * read lock only now we have a VMA to examine?
1598 */
1599static bool is_vma_lock_sufficient(struct vm_area_struct *vma,
1600		struct madvise_behavior *madv_behavior)
1601{
1602	/* Must span only a single VMA.*/
1603	if (madv_behavior->range.end > vma->vm_end)
1604		return false;
1605	/* Remote processes unsupported. */
1606	if (current->mm != vma->vm_mm)
1607		return false;
1608	/* Userfaultfd unsupported. */
1609	if (userfaultfd_armed(vma))
1610		return false;
1611	/*
1612	 * anon_vma_prepare() explicitly requires an mmap lock for
1613	 * serialisation, so we cannot use a VMA lock in this case.
1614	 *
1615	 * Note we might race with anon_vma being set, however this makes this
1616	 * check overly paranoid which is safe.
1617	 */
1618	if (vma_is_anonymous(vma) &&
1619	    prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma)
1620		return false;
1621
1622	return true;
1623}
1624
1625/*
1626 * Try to acquire a VMA read lock if possible.
1627 *
1628 * We only support this lock over a single VMA, which the input range must
1629 * span either partially or fully.
1630 *
1631 * This function always returns with an appropriate lock held. If a VMA read
1632 * lock could be acquired, we return true and set madv_behavior state
1633 * accordingly.
1634 *
1635 * If a VMA read lock could not be acquired, we return false and expect caller to
1636 * fallback to mmap lock behaviour.
1637 */
1638static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
1639{
1640	struct mm_struct *mm = madv_behavior->mm;
1641	struct vm_area_struct *vma;
1642
1643	vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
1644	if (!vma)
1645		goto take_mmap_read_lock;
1646
1647	if (!is_vma_lock_sufficient(vma, madv_behavior)) {
1648		vma_end_read(vma);
1649		goto take_mmap_read_lock;
1650	}
1651
1652	madv_behavior->vma = vma;
1653	return true;
1654
1655take_mmap_read_lock:
1656	mmap_read_lock(mm);
1657	madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
1658	return false;
1659}
1660
/*
 * Walk the vmas in range [start,end), and call the madvise_vma_behavior
 * function on each one.  The function will get start and end parameters that
 * cover the overlap between the current vma and the original range.  Any
 * unmapped regions in the original range will result in this function returning
 * -ENOMEM while still calling the madvise_vma_behavior function on all of the
 * existing vmas in the range.  Must be called with the mmap_lock held for
 * reading or writing.
 *
 * The exception is MADVISE_VMA_READ_LOCK mode, in which the lock is acquired
 * here through try_vma_read_lock().
 *
 * Returns the first error from madvise_vma_behavior(), otherwise -ENOMEM if
 * any part of the range was unmapped, otherwise 0.
 */
static
int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
{
	struct mm_struct *mm = madv_behavior->mm;
	struct madvise_behavior_range *range = &madv_behavior->range;
	/* range is updated to span each VMA, so store end of entire range. */
	unsigned long last_end = range->end;
	int unmapped_error = 0;
	int error;
	struct vm_area_struct *prev, *vma;

	/*
	 * If VMA read lock is supported, apply madvise to a single VMA
	 * tentatively, avoiding walking VMAs.
	 *
	 * If try_vma_read_lock() fails it has taken the mmap read lock and
	 * switched lock_mode to MADVISE_MMAP_READ_LOCK, so the walk below
	 * runs under that lock.
	 */
	if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
	    try_vma_read_lock(madv_behavior)) {
		error = madvise_vma_behavior(madv_behavior);
		vma_end_read(madv_behavior->vma);
		return error;
	}

	vma = find_vma_prev(mm, range->start, &prev);
	/* If the range starts inside the VMA, prev is the VMA itself. */
	if (vma && range->start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (last_end|vma->vm_end). */
		if (range->start < vma->vm_start) {
			/*
			 * This indicates a gap between VMAs in the input
			 * range. This does not cause the operation to abort,
			 * rather we simply return -ENOMEM to indicate that this
			 * has happened, but carry on.
			 */
			unmapped_error = -ENOMEM;
			range->start = vma->vm_start;
			if (range->start >= last_end)
				break;
		}

		/* Here vma->vm_start <= range->start < (last_end|vma->vm_end) */
		range->end = min(vma->vm_end, last_end);

		/* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */
		madv_behavior->prev = prev;
		madv_behavior->vma = vma;
		error = madvise_vma_behavior(madv_behavior);
		if (error)
			return error;
		if (madv_behavior->lock_dropped) {
			/* We dropped the mmap lock, we can't ref the VMA. */
			prev = NULL;
			vma = NULL;
			madv_behavior->lock_dropped = false;
		} else {
			/* Re-read the VMA from madv_behavior, not the local copy. */
			vma = madv_behavior->vma;
			prev = vma;
		}

		/* If the VMA now ends beyond range->end, move range->end up. */
		if (vma && range->end < vma->vm_end)
			range->end = vma->vm_end;
		if (range->end >= last_end)
			break;

		/* With no usable VMA pointer, look up again by address. */
		vma = find_vma(mm, vma ? vma->vm_end : range->end);
		range->start = range->end;
	}

	return unmapped_error;
}
1745
1746/*
1747 * Any behaviour which results in changes to the vma->vm_flags needs to
1748 * take mmap_lock for writing. Others, which simply traverse vmas, need
1749 * to only take it for reading.
1750 */
1751static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
1752{
1753	if (is_memory_failure(madv_behavior))
1754		return MADVISE_NO_LOCK;
1755
1756	switch (madv_behavior->behavior) {
1757	case MADV_REMOVE:
1758	case MADV_WILLNEED:
1759	case MADV_COLD:
1760	case MADV_PAGEOUT:
1761	case MADV_POPULATE_READ:
1762	case MADV_POPULATE_WRITE:
1763	case MADV_COLLAPSE:
1764		return MADVISE_MMAP_READ_LOCK;
1765	case MADV_GUARD_INSTALL:
1766	case MADV_GUARD_REMOVE:
1767	case MADV_DONTNEED:
1768	case MADV_DONTNEED_LOCKED:
1769	case MADV_FREE:
1770		return MADVISE_VMA_READ_LOCK;
1771	default:
1772		return MADVISE_MMAP_WRITE_LOCK;
1773	}
1774}
1775
1776static int madvise_lock(struct madvise_behavior *madv_behavior)
1777{
1778	struct mm_struct *mm = madv_behavior->mm;
1779	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
1780
1781	switch (lock_mode) {
1782	case MADVISE_NO_LOCK:
1783		break;
1784	case MADVISE_MMAP_WRITE_LOCK:
1785		if (mmap_write_lock_killable(mm))
1786			return -EINTR;
1787		break;
1788	case MADVISE_MMAP_READ_LOCK:
1789		mmap_read_lock(mm);
1790		break;
1791	case MADVISE_VMA_READ_LOCK:
1792		/* We will acquire the lock per-VMA in madvise_walk_vmas(). */
1793		break;
1794	}
1795
1796	madv_behavior->lock_mode = lock_mode;
1797	return 0;
1798}
1799
1800static void madvise_unlock(struct madvise_behavior *madv_behavior)
1801{
1802	struct mm_struct *mm = madv_behavior->mm;
1803
1804	switch (madv_behavior->lock_mode) {
1805	case  MADVISE_NO_LOCK:
1806		return;
1807	case MADVISE_MMAP_WRITE_LOCK:
1808		mmap_write_unlock(mm);
1809		break;
1810	case MADVISE_MMAP_READ_LOCK:
1811		mmap_read_unlock(mm);
1812		break;
1813	case MADVISE_VMA_READ_LOCK:
1814		/* We will drop the lock per-VMA in madvise_walk_vmas(). */
1815		break;
1816	}
1817
1818	madv_behavior->lock_mode = MADVISE_NO_LOCK;
1819}
1820
1821static bool madvise_batch_tlb_flush(int behavior)
1822{
1823	switch (behavior) {
1824	case MADV_DONTNEED:
1825	case MADV_DONTNEED_LOCKED:
1826	case MADV_FREE:
1827		return true;
1828	default:
1829		return false;
1830	}
1831}
1832
1833static void madvise_init_tlb(struct madvise_behavior *madv_behavior)
1834{
1835	if (madvise_batch_tlb_flush(madv_behavior->behavior))
1836		tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm);
1837}
1838
1839static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
1840{
1841	if (madvise_batch_tlb_flush(madv_behavior->behavior))
1842		tlb_finish_mmu(madv_behavior->tlb);
1843}
1844
1845static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
1846{
1847	size_t len;
1848
1849	if (!madvise_behavior_valid(behavior))
1850		return false;
1851
1852	if (!PAGE_ALIGNED(start))
1853		return false;
1854	len = PAGE_ALIGN(len_in);
1855
1856	/* Check to see whether len was rounded up from small -ve to zero */
1857	if (len_in && !len)
1858		return false;
1859
1860	if (start + len < start)
1861		return false;
1862
1863	return true;
1864}
1865
1866/*
1867 * madvise_should_skip() - Return if the request is invalid or nothing.
1868 * @start:	Start address of madvise-requested address range.
1869 * @len_in:	Length of madvise-requested address range.
1870 * @behavior:	Requested madvise behavor.
1871 * @err:	Pointer to store an error code from the check.
1872 *
1873 * If the specified behaviour is invalid or nothing would occur, we skip the
1874 * operation.  This function returns true in the cases, otherwise false.  In
1875 * the former case we store an error on @err.
1876 */
1877static bool madvise_should_skip(unsigned long start, size_t len_in,
1878		int behavior, int *err)
1879{
1880	if (!is_valid_madvise(start, len_in, behavior)) {
1881		*err = -EINVAL;
1882		return true;
1883	}
1884	if (start + PAGE_ALIGN(len_in) == start) {
1885		*err = 0;
1886		return true;
1887	}
1888	return false;
1889}
1890
1891static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
1892{
1893	switch (madv_behavior->behavior) {
1894	case MADV_POPULATE_READ:
1895	case MADV_POPULATE_WRITE:
1896		return true;
1897	default:
1898		return false;
1899	}
1900}
1901
1902/*
1903 * untagged_addr_remote() assumes mmap_lock is already held. On
1904 * architectures like x86 and RISC-V, tagging is tricky because each
1905 * mm may have a different tagging mask. However, we might only hold
1906 * the per-VMA lock (currently only local processes are supported),
1907 * so untagged_addr is used to avoid the mmap_lock assertion for
1908 * local processes.
1909 */
1910static inline unsigned long get_untagged_addr(struct mm_struct *mm,
1911		unsigned long start)
1912{
1913	return current->mm == mm ? untagged_addr(start) :
1914				   untagged_addr_remote(mm, start);
1915}
1916
1917static int madvise_do_behavior(unsigned long start, size_t len_in,
1918		struct madvise_behavior *madv_behavior)
1919{
1920	struct blk_plug plug;
1921	int error;
1922	struct madvise_behavior_range *range = &madv_behavior->range;
1923
1924	if (is_memory_failure(madv_behavior)) {
1925		range->start = start;
1926		range->end = start + len_in;
1927		return madvise_inject_error(madv_behavior);
1928	}
1929
1930	range->start = get_untagged_addr(madv_behavior->mm, start);
1931	range->end = range->start + PAGE_ALIGN(len_in);
1932
1933	blk_start_plug(&plug);
1934	if (is_madvise_populate(madv_behavior))
1935		error = madvise_populate(madv_behavior);
1936	else
1937		error = madvise_walk_vmas(madv_behavior);
1938	blk_finish_plug(&plug);
1939	return error;
1940}
1941
1942/*
1943 * The madvise(2) system call.
1944 *
1945 * Applications can use madvise() to advise the kernel how it should
1946 * handle paging I/O in this VM area.  The idea is to help the kernel
1947 * use appropriate read-ahead and caching techniques.  The information
1948 * provided is advisory only, and can be safely disregarded by the
1949 * kernel without affecting the correct operation of the application.
1950 *
1951 * behavior values:
1952 *  MADV_NORMAL - the default behavior is to read clusters.  This
1953 *		results in some read-ahead and read-behind.
1954 *  MADV_RANDOM - the system should read the minimum amount of data
1955 *		on any access, since it is unlikely that the appli-
1956 *		cation will need more than what it asks for.
1957 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
1958 *		once, so they can be aggressively read ahead, and
1959 *		can be freed soon after they are accessed.
1960 *  MADV_WILLNEED - the application is notifying the system to read
1961 *		some pages ahead.
1962 *  MADV_DONTNEED - the application is finished with the given range,
1963 *		so the kernel can free resources associated with it.
1964 *  MADV_FREE - the application marks pages in the given range as lazy free,
1965 *		where actual purges are postponed until memory pressure happens.
1966 *  MADV_REMOVE - the application wants to free up the given range of
1967 *		pages and associated backing store.
1968 *  MADV_DONTFORK - omit this area from child's address space when forking:
1969 *		typically, to avoid COWing pages pinned by get_user_pages().
1970 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1971 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
1972 *              range after a fork.
1973 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1974 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
1975 *		were corrupted by unrecoverable hardware memory failure.
1976 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1977 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1978 *		this area with pages of identical content from other such areas.
1979 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1980 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
1981 *		huge pages in the future. Existing pages might be coalesced and
1982 *		new pages might be allocated as THP.
1983 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1984 *		transparent huge pages so the existing pages will not be
1985 *		coalesced into THP and new pages will not be allocated as THP.
1986 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
1987 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
1988 *		from being included in its core dump.
1989 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1990 *  MADV_COLD - the application is not expected to use this memory soon,
1991 *		deactivate pages in this range so that they can be reclaimed
1992 *		easily if memory pressure happens.
1993 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
1994 *		page out the pages in this range immediately.
1995 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
1996 *		triggering read faults if required
1997 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
1998 *		triggering write faults if required
1999 *
2000 * return values:
2001 *  zero    - success
2002 *  -EINVAL - start + len < 0, start is not page-aligned,
2003 *		"behavior" is not a valid value, or application
2004 *		is attempting to release locked or shared pages,
2005 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
2007 *  -ENOMEM - addresses in the specified range are not currently
2008 *		mapped, or are outside the AS of the process.
2009 *  -EIO    - an I/O error occurred while paging in data.
2010 *  -EBADF  - map exists, but area maps something that isn't a file.
2011 *  -EAGAIN - a kernel resource was temporarily unavailable.
2012 *  -EPERM  - memory is sealed.
2013 */
2014int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
2015{
2016	int error;
2017	struct mmu_gather tlb;
2018	struct madvise_behavior madv_behavior = {
2019		.mm = mm,
2020		.behavior = behavior,
2021		.tlb = &tlb,
2022	};
2023
2024	if (madvise_should_skip(start, len_in, behavior, &error))
2025		return error;
2026	error = madvise_lock(&madv_behavior);
2027	if (error)
2028		return error;
2029	madvise_init_tlb(&madv_behavior);
2030	error = madvise_do_behavior(start, len_in, &madv_behavior);
2031	madvise_finish_tlb(&madv_behavior);
2032	madvise_unlock(&madv_behavior);
2033
2034	return error;
2035}
2036
/* madvise(2) entry point: run do_madvise() against the caller's own mm. */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
2041
/*
 * Perform an madvise operation over a vector of addresses and lengths.
 *
 * Each iovec segment in @iter is handled in turn under a single lock
 * acquisition; processing stops at the first segment that fails.
 *
 * Returns the number of bytes consumed from @iter if that is non-zero,
 * otherwise the result of the last operation attempted (0 or a negative
 * errno).
 */
static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
			      int behavior)
{
	ssize_t ret = 0;
	size_t total_len;
	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.mm = mm,
		.behavior = behavior,
		.tlb = &tlb,
	};

	/* Remember the initial size so progress can be computed at the end. */
	total_len = iov_iter_count(iter);

	ret = madvise_lock(&madv_behavior);
	if (ret)
		return ret;
	madvise_init_tlb(&madv_behavior);

	while (iov_iter_count(iter)) {
		unsigned long start = (unsigned long)iter_iov_addr(iter);
		size_t len_in = iter_iov_len(iter);
		int error;

		/* A skipped segment still yields a result: 0 or -EINVAL. */
		if (madvise_should_skip(start, len_in, behavior, &error))
			ret = error;
		else
			ret = madvise_do_behavior(start, len_in, &madv_behavior);
		/*
		 * An madvise operation is attempting to restart the syscall,
		 * but we cannot proceed as it would not be correct to repeat
		 * the operation in aggregate, and would be surprising to the
		 * user.
		 *
		 * We drop and reacquire locks so it is safe to just loop and
		 * try again. We check for fatal signals in case we need exit
		 * early anyway.
		 */
		if (ret == -ERESTARTNOINTR) {
			if (fatal_signal_pending(current)) {
				ret = -EINTR;
				break;
			}

			/* Drop and reacquire lock to unwind race. */
			madvise_finish_tlb(&madv_behavior);
			madvise_unlock(&madv_behavior);
			ret = madvise_lock(&madv_behavior);
			/* Relocking failed: nothing is held, so skip the unlock. */
			if (ret)
				goto out;
			madvise_init_tlb(&madv_behavior);
			/* Retry the same segment; the iterator has not advanced. */
			continue;
		}
		if (ret < 0)
			break;
		iov_iter_advance(iter, iter_iov_len(iter));
	}
	madvise_finish_tlb(&madv_behavior);
	madvise_unlock(&madv_behavior);

out:
	/* Report bytes consumed if any, otherwise keep ret as it stands. */
	ret = (total_len - iov_iter_count(iter)) ? : ret;

	return ret;
}
2108
2109SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
2110		size_t, vlen, int, behavior, unsigned int, flags)
2111{
2112	ssize_t ret;
2113	struct iovec iovstack[UIO_FASTIOV];
2114	struct iovec *iov = iovstack;
2115	struct iov_iter iter;
2116	struct task_struct *task;
2117	struct mm_struct *mm;
2118	unsigned int f_flags;
2119
2120	if (flags != 0) {
2121		ret = -EINVAL;
2122		goto out;
2123	}
2124
2125	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
2126	if (ret < 0)
2127		goto out;
2128
2129	task = pidfd_get_task(pidfd, &f_flags);
2130	if (IS_ERR(task)) {
2131		ret = PTR_ERR(task);
2132		goto free_iov;
2133	}
2134
2135	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
2136	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
2137	if (IS_ERR(mm)) {
2138		ret = PTR_ERR(mm);
2139		goto release_task;
2140	}
2141
2142	/*
2143	 * We need only perform this check if we are attempting to manipulate a
2144	 * remote process's address space.
2145	 */
2146	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
2147		ret = -EINVAL;
2148		goto release_mm;
2149	}
2150
2151	/*
2152	 * Require CAP_SYS_NICE for influencing process performance. Note that
2153	 * only non-destructive hints are currently supported for remote
2154	 * processes.
2155	 */
2156	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
2157		ret = -EPERM;
2158		goto release_mm;
2159	}
2160
2161	ret = vector_madvise(mm, &iter, behavior);
2162
2163release_mm:
2164	mmput(mm);
2165release_task:
2166	put_task_struct(task);
2167free_iov:
2168	kfree(iov);
2169out:
2170	return ret;
2171}
2172
2173#ifdef CONFIG_ANON_VMA_NAME
2174
#define ANON_VMA_NAME_MAX_LEN		80
#define ANON_VMA_NAME_INVALID_CHARS	"\\`$[]"

/*
 * A name character is valid when it is printable ASCII (0x20..0x7e) and is
 * not one of the characters listed in ANON_VMA_NAME_INVALID_CHARS.
 */
static inline bool is_valid_name_char(char ch)
{
	/* Reject control characters, DEL and anything outside 7-bit ASCII. */
	if (ch <= 0x1f || ch >= 0x7f)
		return false;

	return strchr(ANON_VMA_NAME_INVALID_CHARS, ch) == NULL;
}
2184
2185static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
2186		unsigned long len_in, struct anon_vma_name *anon_name)
2187{
2188	unsigned long end;
2189	unsigned long len;
2190	int error;
2191	struct madvise_behavior madv_behavior = {
2192		.mm = mm,
2193		.behavior = __MADV_SET_ANON_VMA_NAME,
2194		.anon_name = anon_name,
2195	};
2196
2197	if (start & ~PAGE_MASK)
2198		return -EINVAL;
2199	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
2200
2201	/* Check to see whether len was rounded up from small -ve to zero */
2202	if (len_in && !len)
2203		return -EINVAL;
2204
2205	end = start + len;
2206	if (end < start)
2207		return -EINVAL;
2208
2209	if (end == start)
2210		return 0;
2211
2212	madv_behavior.range.start = start;
2213	madv_behavior.range.end = end;
2214
2215	error = madvise_lock(&madv_behavior);
2216	if (error)
2217		return error;
2218	error = madvise_walk_vmas(&madv_behavior);
2219	madvise_unlock(&madv_behavior);
2220
2221	return error;
2222}
2223
2224int set_anon_vma_name(unsigned long addr, unsigned long size,
2225		      const char __user *uname)
2226{
2227	struct anon_vma_name *anon_name = NULL;
2228	struct mm_struct *mm = current->mm;
2229	int error;
2230
2231	if (uname) {
2232		char *name, *pch;
2233
2234		name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
2235		if (IS_ERR(name))
2236			return PTR_ERR(name);
2237
2238		for (pch = name; *pch != '\0'; pch++) {
2239			if (!is_valid_name_char(*pch)) {
2240				kfree(name);
2241				return -EINVAL;
2242			}
2243		}
2244		/* anon_vma has its own copy */
2245		anon_name = anon_vma_name_alloc(name);
2246		kfree(name);
2247		if (!anon_name)
2248			return -ENOMEM;
2249	}
2250
2251	error = madvise_set_anon_name(mm, addr, size, anon_name);
2252	anon_vma_name_put(anon_name);
2253
2254	return error;
2255}
2256#endif