mm/migrate.c at v5.18-rc7 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / migrate.c
at v5.18-rc7 2571 lines 68 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Memory Migration functionality - linux/mm/migrate.c
   4 *
   5 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   6 *
   7 * Page migration was first developed in the context of the memory hotplug
   8 * project. The main authors of the migration code are:
   9 *
  10 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  11 * Hirokazu Takahashi <taka@valinux.co.jp>
  12 * Dave Hansen <haveblue@us.ibm.com>
  13 * Christoph Lameter
  14 */
  15
  16#include <linux/migrate.h>
  17#include <linux/export.h>
  18#include <linux/swap.h>
  19#include <linux/swapops.h>
  20#include <linux/pagemap.h>
  21#include <linux/buffer_head.h>
  22#include <linux/mm_inline.h>
  23#include <linux/nsproxy.h>
  24#include <linux/pagevec.h>
  25#include <linux/ksm.h>
  26#include <linux/rmap.h>
  27#include <linux/topology.h>
  28#include <linux/cpu.h>
  29#include <linux/cpuset.h>
  30#include <linux/writeback.h>
  31#include <linux/mempolicy.h>
  32#include <linux/vmalloc.h>
  33#include <linux/security.h>
  34#include <linux/backing-dev.h>
  35#include <linux/compaction.h>
  36#include <linux/syscalls.h>
  37#include <linux/compat.h>
  38#include <linux/hugetlb.h>
  39#include <linux/hugetlb_cgroup.h>
  40#include <linux/gfp.h>
  41#include <linux/pfn_t.h>
  42#include <linux/memremap.h>
  43#include <linux/userfaultfd_k.h>
  44#include <linux/balloon_compaction.h>
  45#include <linux/page_idle.h>
  46#include <linux/page_owner.h>
  47#include <linux/sched/mm.h>
  48#include <linux/ptrace.h>
  49#include <linux/oom.h>
  50#include <linux/memory.h>
  51#include <linux/random.h>
  52#include <linux/sched/sysctl.h>
  53
  54#include <asm/tlbflush.h>
  55
  56#include <trace/events/migrate.h>
  57
  58#include "internal.h"
  59
  60int isolate_movable_page(struct page *page, isolate_mode_t mode)
  61{
  62	struct address_space *mapping;
  63
  64	/*
  65	 * Avoid burning cycles with pages that are yet under __free_pages(),
  66	 * or just got freed under us.
  67	 *
  68	 * In case we 'win' a race for a movable page being freed under us and
  69	 * raise its refcount preventing __free_pages() from doing its job
  70	 * the put_page() at the end of this block will take care of
  71	 * release this page, thus avoiding a nasty leakage.
  72	 */
  73	if (unlikely(!get_page_unless_zero(page)))
  74		goto out;
  75
  76	/*
  77	 * Check PageMovable before holding a PG_lock because page's owner
  78	 * assumes anybody doesn't touch PG_lock of newly allocated page
  79	 * so unconditionally grabbing the lock ruins page's owner side.
  80	 */
  81	if (unlikely(!__PageMovable(page)))
  82		goto out_putpage;
  83	/*
  84	 * As movable pages are not isolated from LRU lists, concurrent
  85	 * compaction threads can race against page migration functions
  86	 * as well as race against the releasing a page.
  87	 *
  88	 * In order to avoid having an already isolated movable page
  89	 * being (wrongly) re-isolated while it is under migration,
  90	 * or to avoid attempting to isolate pages being released,
  91	 * lets be sure we have the page lock
  92	 * before proceeding with the movable page isolation steps.
  93	 */
  94	if (unlikely(!trylock_page(page)))
  95		goto out_putpage;
  96
  97	if (!PageMovable(page) || PageIsolated(page))
  98		goto out_no_isolated;
  99
 100	mapping = page_mapping(page);
 101	VM_BUG_ON_PAGE(!mapping, page);
 102
 103	if (!mapping->a_ops->isolate_page(page, mode))
 104		goto out_no_isolated;
 105
 106	/* Driver shouldn't use PG_isolated bit of page->flags */
 107	WARN_ON_ONCE(PageIsolated(page));
 108	SetPageIsolated(page);
 109	unlock_page(page);
 110
 111	return 0;
 112
 113out_no_isolated:
 114	unlock_page(page);
 115out_putpage:
 116	put_page(page);
 117out:
 118	return -EBUSY;
 119}
 120
 121static void putback_movable_page(struct page *page)
 122{
 123	struct address_space *mapping;
 124
 125	mapping = page_mapping(page);
 126	mapping->a_ops->putback_page(page);
 127	ClearPageIsolated(page);
 128}
 129
 130/*
 131 * Put previously isolated pages back onto the appropriate lists
 132 * from where they were once taken off for compaction/migration.
 133 *
 134 * This function shall be used whenever the isolated pageset has been
 135 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
 136 * and isolate_huge_page().
 137 */
 138void putback_movable_pages(struct list_head *l)
 139{
 140	struct page *page;
 141	struct page *page2;
 142
 143	list_for_each_entry_safe(page, page2, l, lru) {
 144		if (unlikely(PageHuge(page))) {
 145			putback_active_hugepage(page);
 146			continue;
 147		}
 148		list_del(&page->lru);
 149		/*
 150		 * We isolated non-lru movable page so here we can use
 151		 * __PageMovable because LRU page's mapping cannot have
 152		 * PAGE_MAPPING_MOVABLE.
 153		 */
 154		if (unlikely(__PageMovable(page))) {
 155			VM_BUG_ON_PAGE(!PageIsolated(page), page);
 156			lock_page(page);
 157			if (PageMovable(page))
 158				putback_movable_page(page);
 159			else
 160				ClearPageIsolated(page);
 161			unlock_page(page);
 162			put_page(page);
 163		} else {
 164			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
 165					page_is_file_lru(page), -thp_nr_pages(page));
 166			putback_lru_page(page);
 167		}
 168	}
 169}
 170
/*
 * Restore a potential migration pte to a working pte entry.
 *
 * Installed as the rmap_one callback by remove_migration_ptes(). For every
 * position that page_vma_mapped_walk() reports for @old in @vma (the walk is
 * set up with PVMW_SYNC | PVMW_MIGRATION) a new entry for the matching page
 * of @folio is built and installed.
 *
 * Always returns true.
 */
static bool remove_migration_pte(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *old)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);

	while (page_vma_mapped_walk(&pvmw)) {
		pte_t pte;
		swp_entry_t entry;
		struct page *new;
		unsigned long idx = 0;

		/* pgoff is invalid for ksm pages, but they are never large */
		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
		new = folio_page(folio, idx);

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte) {
			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
					!folio_test_pmd_mappable(folio), folio);
			remove_migration_pmd(&pvmw, new);
			continue;
		}
#endif

		/* Take a folio reference before building the replacement pte */
		folio_get(folio);
		/* The new pte starts out old; soft-dirty follows the old entry */
		pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
		if (pte_swp_soft_dirty(*pvmw.pte))
			pte = pte_mksoft_dirty(pte);

		/*
		 * Recheck VMA as permissions can change since migration started
		 */
		entry = pte_to_swp_entry(*pvmw.pte);
		if (is_writable_migration_entry(entry))
			pte = maybe_mkwrite(pte, vma);
		else if (pte_swp_uffd_wp(*pvmw.pte))
			pte = pte_mkuffd_wp(pte);

		/*
		 * A device-private page is mapped through a device-private
		 * swap entry instead of the pte built above; the soft-dirty
		 * and uffd-wp bits of the old entry are carried over to it.
		 */
		if (unlikely(is_device_private_page(new))) {
			if (pte_write(pte))
				entry = make_writable_device_private_entry(
							page_to_pfn(new));
			else
				entry = make_readable_device_private_entry(
							page_to_pfn(new));
			pte = swp_entry_to_pte(entry);
			if (pte_swp_soft_dirty(*pvmw.pte))
				pte = pte_swp_mksoft_dirty(pte);
			if (pte_swp_uffd_wp(*pvmw.pte))
				pte = pte_swp_mkuffd_wp(pte);
		}

		/* Add the rmap first, then install the entry (hugetlb or regular) */
#ifdef CONFIG_HUGETLB_PAGE
		if (folio_test_hugetlb(folio)) {
			unsigned int shift = huge_page_shift(hstate_vma(vma));

			pte = pte_mkhuge(pte);
			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
			if (folio_test_anon(folio))
				hugepage_add_anon_rmap(new, vma, pvmw.address);
			else
				page_dup_rmap(new, true);
			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
		} else
#endif
		{
			if (folio_test_anon(folio))
				page_add_anon_rmap(new, vma, pvmw.address, false);
			else
				page_add_file_rmap(new, vma, false);
			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
		}
		if (vma->vm_flags & VM_LOCKED)
			mlock_page_drain_local();

		trace_remove_migration_pte(pvmw.address, pte_val(pte),
					   compound_order(new));

		/* No need to invalidate - it was non-present before */
		update_mmu_cache(vma, pvmw.address, pvmw.pte);
	}

	return true;
}
 260
 261/*
 262 * Get rid of all migration entries and replace them by
 263 * references to the indicated page.
 264 */
 265void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
 266{
 267	struct rmap_walk_control rwc = {
 268		.rmap_one = remove_migration_pte,
 269		.arg = src,
 270	};
 271
 272	if (locked)
 273		rmap_walk_locked(dst, &rwc);
 274	else
 275		rmap_walk(dst, &rwc);
 276}
 277
 278/*
 279 * Something used the pte of a page under migration. We need to
 280 * get to the page and wait until migration is finished.
 281 * When we return from this function the fault will be retried.
 282 */
 283void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 284				spinlock_t *ptl)
 285{
 286	pte_t pte;
 287	swp_entry_t entry;
 288
 289	spin_lock(ptl);
 290	pte = *ptep;
 291	if (!is_swap_pte(pte))
 292		goto out;
 293
 294	entry = pte_to_swp_entry(pte);
 295	if (!is_migration_entry(entry))
 296		goto out;
 297
 298	migration_entry_wait_on_locked(entry, ptep, ptl);
 299	return;
 300out:
 301	pte_unmap_unlock(ptep, ptl);
 302}
 303
 304void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 305				unsigned long address)
 306{
 307	spinlock_t *ptl = pte_lockptr(mm, pmd);
 308	pte_t *ptep = pte_offset_map(pmd, address);
 309	__migration_entry_wait(mm, ptep, ptl);
 310}
 311
 312void migration_entry_wait_huge(struct vm_area_struct *vma,
 313		struct mm_struct *mm, pte_t *pte)
 314{
 315	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
 316	__migration_entry_wait(mm, pte, ptl);
 317}
 318
 319#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 320void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 321{
 322	spinlock_t *ptl;
 323
 324	ptl = pmd_lock(mm, pmd);
 325	if (!is_pmd_migration_entry(*pmd))
 326		goto unlock;
 327	migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
 328	return;
 329unlock:
 330	spin_unlock(ptl);
 331}
 332#endif
 333
 334static int expected_page_refs(struct address_space *mapping, struct page *page)
 335{
 336	int expected_count = 1;
 337
 338	if (mapping)
 339		expected_count += compound_nr(page) + page_has_private(page);
 340	return expected_count;
 341}
 342
/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be @extra_count plus
 * (see expected_page_refs()):
 * 1 for anonymous pages without a mapping
 * 1 + compound_nr() for pages with a mapping (2 for a single page)
 * one more than that for pages with a mapping and PagePrivate/PagePrivate2
 * set.
 *
 * Returns MIGRATEPAGE_SUCCESS, or -EAGAIN when the reference count of @folio
 * does not match the expected value.
 */
int folio_migrate_mapping(struct address_space *mapping,
		struct folio *newfolio, struct folio *folio, int extra_count)
{
	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
	struct zone *oldzone, *newzone;
	int dirty;
	int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
	long nr = folio_nr_pages(folio);

	if (!mapping) {
		/* Anonymous page without mapping */
		if (folio_ref_count(folio) != expected_count)
			return -EAGAIN;

		/* No turning back from here */
		newfolio->index = folio->index;
		newfolio->mapping = folio->mapping;
		if (folio_test_swapbacked(folio))
			__folio_set_swapbacked(newfolio);

		return MIGRATEPAGE_SUCCESS;
	}

	oldzone = folio_zone(folio);
	newzone = folio_zone(newfolio);

	/* irqs stay off until the local_irq_enable() at the end */
	xas_lock_irq(&xas);
	if (!folio_ref_freeze(folio, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the folio:
	 * no turning back from here.
	 */
	newfolio->index = folio->index;
	newfolio->mapping = folio->mapping;
	folio_ref_add(newfolio, nr); /* add cache reference */
	if (folio_test_swapbacked(folio)) {
		__folio_set_swapbacked(newfolio);
		if (folio_test_swapcache(folio)) {
			folio_set_swapcache(newfolio);
			newfolio->private = folio_get_private(folio);
		}
	} else {
		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	}

	/* Move dirty while page refs frozen and newpage not yet exposed */
	dirty = folio_test_dirty(folio);
	if (dirty) {
		folio_clear_dirty(folio);
		folio_set_dirty(newfolio);
	}

	xas_store(&xas, newfolio);

	/*
	 * Drop the cache references from the old folio by unfreezing
	 * to nr fewer references.
	 * We know this isn't the last reference.
	 */
	folio_ref_unfreeze(folio, expected_count - nr);

	xas_unlock(&xas);
	/* Leave irq disabled to prevent preemption while updating stats */

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
	 * are mapped to swap space.
	 */
	if (newzone != oldzone) {
		struct lruvec *old_lruvec, *new_lruvec;
		struct mem_cgroup *memcg;

		memcg = folio_memcg(folio);
		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);

		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
		}
#ifdef CONFIG_SWAP
		if (folio_test_swapcache(folio)) {
			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
		}
#endif
		if (dirty && mapping_can_writeback(mapping)) {
			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
		}
	}
	/* Pairs with the irq-disabling xas_lock_irq() above */
	local_irq_enable();

	return MIGRATEPAGE_SUCCESS;
}
 460EXPORT_SYMBOL(folio_migrate_mapping);
 461
 462/*
 463 * The expected number of remaining references is the same as that
 464 * of folio_migrate_mapping().
 465 */
 466int migrate_huge_page_move_mapping(struct address_space *mapping,
 467				   struct page *newpage, struct page *page)
 468{
 469	XA_STATE(xas, &mapping->i_pages, page_index(page));
 470	int expected_count;
 471
 472	xas_lock_irq(&xas);
 473	expected_count = 2 + page_has_private(page);
 474	if (page_count(page) != expected_count || xas_load(&xas) != page) {
 475		xas_unlock_irq(&xas);
 476		return -EAGAIN;
 477	}
 478
 479	if (!page_ref_freeze(page, expected_count)) {
 480		xas_unlock_irq(&xas);
 481		return -EAGAIN;
 482	}
 483
 484	newpage->index = page->index;
 485	newpage->mapping = page->mapping;
 486
 487	get_page(newpage);
 488
 489	xas_store(&xas, newpage);
 490
 491	page_ref_unfreeze(page, expected_count - 1);
 492
 493	xas_unlock_irq(&xas);
 494
 495	return MIGRATEPAGE_SUCCESS;
 496}
 497
/*
 * Copy the flags and some other ancillary information
 *
 * @newfolio: destination folio, receives the state.
 * @folio:    source folio; its active/unevictable, swapcache and private
 *            state is cleared as part of the transfer.
 *
 * The order of the steps below matters (see the individual comments).
 */
void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{
	int cpupid;

	if (folio_test_error(folio))
		folio_set_error(newfolio);
	if (folio_test_referenced(folio))
		folio_set_referenced(newfolio);
	if (folio_test_uptodate(folio))
		folio_mark_uptodate(newfolio);
	/* Active and unevictable are moved (test-and-clear), not copied */
	if (folio_test_clear_active(folio)) {
		VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
		folio_set_active(newfolio);
	} else if (folio_test_clear_unevictable(folio))
		folio_set_unevictable(newfolio);
	if (folio_test_workingset(folio))
		folio_set_workingset(newfolio);
	if (folio_test_checked(folio))
		folio_set_checked(newfolio);
	if (folio_test_mappedtodisk(folio))
		folio_set_mappedtodisk(newfolio);

	/* Move dirty on pages not done by folio_migrate_mapping() */
	if (folio_test_dirty(folio))
		folio_set_dirty(newfolio);

	if (folio_test_young(folio))
		folio_set_young(newfolio);
	if (folio_test_idle(folio))
		folio_set_idle(newfolio);

	/*
	 * Copy NUMA information to the new page, to prevent over-eager
	 * future migrations of this same page.
	 */
	cpupid = page_cpupid_xchg_last(&folio->page, -1);
	page_cpupid_xchg_last(&newfolio->page, cpupid);

	folio_migrate_ksm(newfolio, folio);
	/*
	 * Please do not reorder this without considering how mm/ksm.c's
	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
	 */
	if (folio_test_swapcache(folio))
		folio_clear_swapcache(folio);
	folio_clear_private(folio);

	/* page->private contains hugetlb specific flags */
	if (!folio_test_hugetlb(folio))
		folio->private = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (folio_test_writeback(newfolio))
		folio_end_writeback(newfolio);

	/*
	 * PG_readahead shares the same bit with PG_reclaim.  The above
	 * folio_end_writeback() may clear PG_readahead mistakenly, so set the
	 * bit after that.
	 */
	if (folio_test_readahead(folio))
		folio_set_readahead(newfolio);

	folio_copy_owner(newfolio, folio);

	/* hugetlb folios are skipped for the memcg hand-over */
	if (!folio_test_hugetlb(folio))
		mem_cgroup_migrate(folio, newfolio);
}
 572EXPORT_SYMBOL(folio_migrate_flags);
 573
/*
 * Copy @folio into @newfolio with folio_copy() and then transfer the flags
 * and ancillary state with folio_migrate_flags().
 */
void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
{
	folio_copy(newfolio, folio);
	folio_migrate_flags(newfolio, folio);
}
 579EXPORT_SYMBOL(folio_migrate_copy);
 580
 581/************************************************************
 582 *                    Migration functions
 583 ***********************************************************/
 584
 585/*
 586 * Common logic to directly migrate a single LRU page suitable for
 587 * pages that do not use PagePrivate/PagePrivate2.
 588 *
 589 * Pages are locked upon entry and exit.
 590 */
 591int migrate_page(struct address_space *mapping,
 592		struct page *newpage, struct page *page,
 593		enum migrate_mode mode)
 594{
 595	struct folio *newfolio = page_folio(newpage);
 596	struct folio *folio = page_folio(page);
 597	int rc;
 598
 599	BUG_ON(folio_test_writeback(folio));	/* Writeback must be complete */
 600
 601	rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
 602
 603	if (rc != MIGRATEPAGE_SUCCESS)
 604		return rc;
 605
 606	if (mode != MIGRATE_SYNC_NO_COPY)
 607		folio_migrate_copy(newfolio, folio);
 608	else
 609		folio_migrate_flags(newfolio, folio);
 610	return MIGRATEPAGE_SUCCESS;
 611}
 612EXPORT_SYMBOL(migrate_page);
 613
 614#ifdef CONFIG_BLOCK
 615/* Returns true if all buffers are successfully locked */
 616static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 617							enum migrate_mode mode)
 618{
 619	struct buffer_head *bh = head;
 620
 621	/* Simple case, sync compaction */
 622	if (mode != MIGRATE_ASYNC) {
 623		do {
 624			lock_buffer(bh);
 625			bh = bh->b_this_page;
 626
 627		} while (bh != head);
 628
 629		return true;
 630	}
 631
 632	/* async case, we cannot block on lock_buffer so use trylock_buffer */
 633	do {
 634		if (!trylock_buffer(bh)) {
 635			/*
 636			 * We failed to lock the buffer and cannot stall in
 637			 * async migration. Release the taken locks
 638			 */
 639			struct buffer_head *failed_bh = bh;
 640			bh = head;
 641			while (bh != failed_bh) {
 642				unlock_buffer(bh);
 643				bh = bh->b_this_page;
 644			}
 645			return false;
 646		}
 647
 648		bh = bh->b_this_page;
 649	} while (bh != head);
 650	return true;
 651}
 652
/*
 * Common implementation behind buffer_migrate_page() and
 * buffer_migrate_page_norefs().
 *
 * A page without buffers is handed straight to migrate_page(). Otherwise
 * all buffer heads of @page are locked, the mapping is moved to @newpage,
 * the private data and every buffer head are re-pointed at @newpage, and
 * the buffer locks are dropped again.
 *
 * @check_refs: when true, additionally require every buffer head to have a
 *		zero b_count (checked under mapping->private_lock, with one
 *		retry after invalidate_bh_lrus()).
 *
 * Returns MIGRATEPAGE_SUCCESS, -EAGAIN (unexpected page references, buffer
 * lock failure or busy buffer heads), or the error from
 * migrate_page_move_mapping().
 */
static int __buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode,
		bool check_refs)
{
	struct buffer_head *bh, *head;
	int rc;
	int expected_count;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page, mode);

	/* Check whether page does not have extra refs before we do more work */
	expected_count = expected_page_refs(mapping, page);
	if (page_count(page) != expected_count)
		return -EAGAIN;

	head = page_buffers(page);
	if (!buffer_migrate_lock_buffers(head, mode))
		return -EAGAIN;

	if (check_refs) {
		bool busy;
		bool invalidated = false;

recheck_buffers:
		busy = false;
		spin_lock(&mapping->private_lock);
		bh = head;
		do {
			if (atomic_read(&bh->b_count)) {
				busy = true;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (busy) {
			if (invalidated) {
				/* private_lock is dropped at unlock_buffers */
				rc = -EAGAIN;
				goto unlock_buffers;
			}
			spin_unlock(&mapping->private_lock);
			invalidate_bh_lrus();
			invalidated = true;
			goto recheck_buffers;
		}
		/* Not busy: private_lock stays held until unlock_buffers */
	}

	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
	if (rc != MIGRATEPAGE_SUCCESS)
		goto unlock_buffers;

	attach_page_private(newpage, detach_page_private(page));

	/* Re-point every buffer head at the new page */
	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);

	rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
	if (check_refs)
		spin_unlock(&mapping->private_lock);
	bh = head;
	do {
		unlock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return rc;
}
 731
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist. For example attached buffer heads are accessed only under page lock.
 *
 * Calls __buffer_migrate_page() with check_refs == false, i.e. buffer head
 * reference counts are not inspected.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
 742EXPORT_SYMBOL(buffer_migrate_page);
 743
/*
 * Same as buffer_migrate_page() except that this variant is more careful and
 * checks that there are also no buffer head references (it calls
 * __buffer_migrate_page() with check_refs == true). This function is the
 * right one for mappings where buffer heads are directly looked up and
 * referenced (such as block device mappings).
 */
int buffer_migrate_page_norefs(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	return __buffer_migrate_page(mapping, newpage, page, mode, true);
}
 755#endif
 756
/*
 * Writeback a page to clean the dirty state
 *
 * Never reports success, because the current migration attempt cannot
 * complete once writeout has been started. Returns:
 *   -EINVAL  the mapping has no ->writepage method
 *   -EAGAIN  clear_page_dirty_for_io() failed, or ->writepage returned a
 *            non-negative value
 *   -EIO     ->writepage returned a negative value
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct folio *folio = page_folio(page);
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	/* Source and destination are the same folio here */
	remove_migration_ptes(folio, folio, false);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}
 798
 799/*
 800 * Default handling if a filesystem does not provide a migration function.
 801 */
 802static int fallback_migrate_page(struct address_space *mapping,
 803	struct page *newpage, struct page *page, enum migrate_mode mode)
 804{
 805	if (PageDirty(page)) {
 806		/* Only writeback pages in full synchronous migration */
 807		switch (mode) {
 808		case MIGRATE_SYNC:
 809		case MIGRATE_SYNC_NO_COPY:
 810			break;
 811		default:
 812			return -EBUSY;
 813		}
 814		return writeout(mapping, page);
 815	}
 816
 817	/*
 818	 * Buffers may be managed in a filesystem specific way.
 819	 * We must have no buffers or drop them.
 820	 */
 821	if (page_has_private(page) &&
 822	    !try_to_release_page(page, GFP_KERNEL))
 823		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
 824
 825	return migrate_page(mapping, newpage, page, mode);
 826}
 827
/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * LRU pages go through migrate_page() (no mapping), the mapping's
 * ->migratepage callback, or fallback_migrate_page(); non-LRU movable pages
 * always go through the mapping's ->migratepage callback.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
				enum migrate_mode mode)
{
	struct address_space *mapping;
	int rc = -EAGAIN;
	bool is_lru = !__PageMovable(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

	mapping = page_mapping(page);

	if (likely(is_lru)) {
		if (!mapping)
			rc = migrate_page(mapping, newpage, page, mode);
		else if (mapping->a_ops->migratepage)
			/*
			 * Most pages have a mapping and most filesystems
			 * provide a migratepage callback. Anonymous pages
			 * are part of swap space which also has its own
			 * migratepage callback. This is the most common path
			 * for page migration.
			 */
			rc = mapping->a_ops->migratepage(mapping, newpage,
							page, mode);
		else
			rc = fallback_migrate_page(mapping, newpage,
							page, mode);
	} else {
		/*
		 * In case of non-lru page, it could be released after
		 * isolation step. In that case, we shouldn't try migration.
		 */
		VM_BUG_ON_PAGE(!PageIsolated(page), page);
		if (!PageMovable(page)) {
			/* Nothing to migrate: report success and un-isolate */
			rc = MIGRATEPAGE_SUCCESS;
			ClearPageIsolated(page);
			goto out;
		}

		rc = mapping->a_ops->migratepage(mapping, newpage,
						page, mode);
		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
			!PageIsolated(page));
	}

	/*
	 * When successful, old pagecache page->mapping must be cleared before
	 * page is freed; but stats require that PageAnon be left as PageAnon.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (__PageMovable(page)) {
			VM_BUG_ON_PAGE(!PageIsolated(page), page);

			/*
			 * We clear PG_movable under page_lock so any compactor
			 * cannot try to migrate this page.
			 *
			 * NOTE(review): the statement below clears
			 * PG_isolated, not PG_movable; presumably PG_movable
			 * is cleared elsewhere (e.g. by the driver's
			 * migratepage callback) - confirm.
			 */
			ClearPageIsolated(page);
		}

		/*
		 * Anonymous and movable page->mapping will be cleared by
		 * free_pages_prepare so don't reset it here for keeping
		 * the type to work PageAnon, for example.
		 */
		if (!PageMappingFlags(page))
			page->mapping = NULL;

		if (likely(!is_zone_device_page(newpage)))
			flush_dcache_folio(page_folio(newpage));
	}
out:
	return rc;
}
 914
/*
 * Lock @page and @newpage, replace the mappings of @page by migration
 * entries (try_to_migrate()), move it with move_to_new_page() and finally
 * turn the migration entries back into mappings of whichever page holds the
 * data afterwards.
 *
 * @force: when zero, neither sleep on the page lock nor wait for writeback.
 * @mode:  MIGRATE_ASYNC never sleeps on the page lock; only MIGRATE_SYNC and
 *         MIGRATE_SYNC_NO_COPY wait for writeback.
 *
 * Returns MIGRATEPAGE_SUCCESS, -EBUSY (page under writeback in a mode that
 * does not wait for it), -EAGAIN (the default for all other early exits) or
 * the result of move_to_new_page(). On success one reference on @newpage is
 * dropped before returning.
 */
static int __unmap_and_move(struct page *page, struct page *newpage,
				int force, enum migrate_mode mode)
{
	struct folio *folio = page_folio(page);
	struct folio *dst = page_folio(newpage);
	int rc = -EAGAIN;
	bool page_was_mapped = false;
	struct anon_vma *anon_vma = NULL;
	bool is_lru = !__PageMovable(page);

	if (!trylock_page(page)) {
		if (!force || mode == MIGRATE_ASYNC)
			goto out;

		/*
		 * It's not safe for direct compaction to call lock_page.
		 * For example, during page readahead pages are added locked
		 * to the LRU. Later, when the IO completes the pages are
		 * marked uptodate and unlocked. However, the queueing
		 * could be merging multiple pages for one bio (e.g.
		 * mpage_readahead). If an allocation happens for the
		 * second or third page, the process can end up locking
		 * the same page twice and deadlocking. Rather than
		 * trying to be clever about what pages can be locked,
		 * avoid the use of lock_page for direct compaction
		 * altogether.
		 */
		if (current->flags & PF_MEMALLOC)
			goto out;

		lock_page(page);
	}

	if (PageWriteback(page)) {
		/*
		 * Only in the case of a full synchronous migration is it
		 * necessary to wait for PageWriteback. In the async case,
		 * the retry loop is too short and in the sync-light case,
		 * the overhead of stalling is too much
		 */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			rc = -EBUSY;
			goto out_unlock;
		}
		if (!force)
			goto out_unlock;
		wait_on_page_writeback(page);
	}

	/*
	 * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that anon_vma is freed while we migrates a page.
	 * This get_anon_vma() delays freeing anon_vma pointer until the end
	 * of migration. File cache pages are no problem because of page_lock()
	 * File Caches may use write_page() or lock_page() in migration, then,
	 * just care Anon page here.
	 *
	 * Only page_get_anon_vma() understands the subtleties of
	 * getting a hold on an anon_vma from outside one of its mms.
	 * But if we cannot get anon_vma, then we won't need it anyway,
	 * because that implies that the anon page is no longer mapped
	 * (and cannot be remapped so long as we hold the page lock).
	 */
	if (PageAnon(page) && !PageKsm(page))
		anon_vma = page_get_anon_vma(page);

	/*
	 * Block others from accessing the new page when we get around to
	 * establishing additional references. We are usually the only one
	 * holding a reference to newpage at this point. We used to have a BUG
	 * here if trylock_page(newpage) fails, but would like to allow for
	 * cases where there might be a race with the previous use of newpage.
	 * This is much like races on refcount of oldpage: just don't BUG().
	 */
	if (unlikely(!trylock_page(newpage)))
		goto out_unlock;

	/* Non-LRU movable pages skip the unmap/remap steps entirely */
	if (unlikely(!is_lru)) {
		rc = move_to_new_page(newpage, page, mode);
		goto out_unlock_both;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read into, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_cleanup_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		VM_BUG_ON_PAGE(PageAnon(page), page);
		if (page_has_private(page)) {
			try_to_free_buffers(page);
			goto out_unlock_both;
		}
	} else if (page_mapped(page)) {
		/* Establish migration ptes */
		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				page);
		try_to_migrate(folio, 0);
		page_was_mapped = true;
	}

	/* Only move the page if no mapping of it is left */
	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page, mode);

	/*
	 * When successful, push newpage to LRU immediately: so that if it
	 * turns out to be an mlocked page, remove_migration_ptes() will
	 * automatically build up the correct newpage->mlock_count for it.
	 *
	 * We would like to do something similar for the old page, when
	 * unsuccessful, and other cases when a page has been temporarily
	 * isolated from the unevictable LRU: but this case is the easiest.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		lru_cache_add(newpage);
		if (page_was_mapped)
			lru_add_drain();
	}

	/* Point the migration entries at the new folio, or back at the old one */
	if (page_was_mapped)
		remove_migration_ptes(folio,
			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);

out_unlock_both:
	unlock_page(newpage);
out_unlock:
	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);
	unlock_page(page);
out:
	/*
	 * If migration is successful, decrease refcount of the newpage,
	 * which will not free the page because new page owner increased
	 * refcounter.
	 */
	if (rc == MIGRATEPAGE_SUCCESS)
		put_page(newpage);

	return rc;
}
1067
/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 *
 * @get_new_page:	callback that allocates the target page for @page.
 * @put_new_page:	callback that releases an unused target page; when it
 *			is NULL the target is released with put_page().
 * @private:		opaque value handed to both callbacks.
 * @page:		the page to migrate.
 * @force:		passed through to __unmap_and_move().
 * @mode:		passed through to __unmap_and_move().
 * @reason:		passed to set_page_owner_migrate_reason() for the new
 *			page on success; MR_MEMORY_FAILURE additionally keeps
 *			the reference on @page.
 * @ret:		list that receives @page when migration fails with
 *			anything other than -EAGAIN.
 *
 * Returns -ENOSYS if @page is a THP and THP migration is not supported,
 * -ENOMEM if get_new_page() returned NULL, MIGRATEPAGE_SUCCESS if @page had
 * only one reference left, and otherwise the result of __unmap_and_move().
 */
static int unmap_and_move(new_page_t get_new_page,
				   free_page_t put_new_page,
				   unsigned long private, struct page *page,
				   int force, enum migrate_mode mode,
				   enum migrate_reason reason,
				   struct list_head *ret)
{
	int rc = MIGRATEPAGE_SUCCESS;
	struct page *newpage = NULL;

	/* Early exits: @page stays on the caller's list, nothing allocated. */
	if (!thp_migration_supported() && PageTransHuge(page))
		return -ENOSYS;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		ClearPageActive(page);
		ClearPageUnevictable(page);
		if (unlikely(__PageMovable(page))) {
			lock_page(page);
			if (!PageMovable(page))
				ClearPageIsolated(page);
			unlock_page(page);
		}
		/* rc is still MIGRATEPAGE_SUCCESS and newpage is still NULL. */
		goto out;
	}

	newpage = get_new_page(page, private);
	if (!newpage)
		return -ENOMEM;

	rc = __unmap_and_move(page, newpage, force, mode);
	if (rc == MIGRATEPAGE_SUCCESS)
		set_page_owner_migrate_reason(newpage, reason);

out:
	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be restored.
		 */
		list_del(&page->lru);
	}

	/*
	 * If migration is successful, releases reference grabbed during
	 * isolation. Otherwise, restore the page to right list unless
	 * we want to retry.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		/*
		 * Compaction can migrate also non-LRU pages which are
		 * not accounted to NR_ISOLATED_*. They can be recognized
		 * as __PageMovable
		 */
		if (likely(!__PageMovable(page)))
			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
					page_is_file_lru(page), -thp_nr_pages(page));

		if (reason != MR_MEMORY_FAILURE)
			/*
			 * We release the page in page_handle_poison.
			 */
			put_page(page);
	} else {
		/* Permanent failure: hand the source page back via @ret. */
		if (rc != -EAGAIN)
			list_add_tail(&page->lru, ret);

		/* The unused target page is released on every failure. */
		if (put_new_page)
			put_new_page(newpage, private);
		else
			put_page(newpage);
	}

	return rc;
}
1148
/*
 * Counterpart of unmap_and_move() for hugepage migration.
 *
 * This function doesn't wait the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and writeback status of all subpages are counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then pte is replaced with migration swap entry and direct I/O code
 * will wait in the page fault for migration to complete.
 *
 * @get_new_page:	callback that allocates the target huge page.
 * @put_new_page:	callback that releases the target huge page; when it
 *			is NULL (or after a successful migration) the target
 *			goes through putback_active_hugepage() instead.
 * @private:		opaque value handed to both callbacks.
 * @hpage:		the huge page to migrate.
 * @force:		when non-zero and @mode is MIGRATE_SYNC or
 *			MIGRATE_SYNC_NO_COPY, sleep for the page lock instead
 *			of giving up if trylock fails.
 * @mode:		migration mode, also passed to move_to_new_page().
 * @reason:		passed to move_hugetlb_state() on success.
 * @ret:		list that receives @hpage on a failure other than
 *			-EAGAIN.
 *
 * Returns MIGRATEPAGE_SUCCESS, -ENOSYS if migration of this hstate is not
 * supported, -ENOMEM if no target page could be allocated, -EBUSY if the page
 * has a subpool but no mapping, -EAGAIN if a lock could not be taken or the
 * page is still mapped, or the result of move_to_new_page().
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				free_page_t put_new_page, unsigned long private,
				struct page *hpage, int force,
				enum migrate_mode mode, int reason,
				struct list_head *ret)
{
	struct folio *dst, *src = page_folio(hpage);
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct page *new_hpage;
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;

	/*
	 * Migratability of hugepages depends on architectures and their size.
	 * This check is necessary because some callers of hugepage migration
	 * like soft offline and memory hotremove don't walk through page
	 * tables or check whether the hugepage is pmd-based or not before
	 * kicking migration.
	 */
	if (!hugepage_migration_supported(page_hstate(hpage))) {
		list_move_tail(&hpage->lru, ret);
		return -ENOSYS;
	}

	if (page_count(hpage) == 1) {
		/* page was freed from under us. So we are done. */
		putback_active_hugepage(hpage);
		return MIGRATEPAGE_SUCCESS;
	}

	new_hpage = get_new_page(hpage, private);
	if (!new_hpage)
		return -ENOMEM;
	dst = page_folio(new_hpage);

	/* Only block on the source page lock for forced synchronous modes. */
	if (!trylock_page(hpage)) {
		if (!force)
			goto out;
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			goto out;
		}
		lock_page(hpage);
	}

	/*
	 * Check for pages which are in the process of being freed.  Without
	 * page_mapping() set, hugetlbfs specific move page routine will not
	 * be called and we could leak usage counts for subpools.
	 */
	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
		rc = -EBUSY;
		goto out_unlock;
	}

	if (PageAnon(hpage))
		anon_vma = page_get_anon_vma(hpage);

	/* rc is still -EAGAIN here, so a busy target page means "retry". */
	if (unlikely(!trylock_page(new_hpage)))
		goto put_anon;

	if (page_mapped(hpage)) {
		bool mapping_locked = false;
		enum ttu_flags ttu = 0;

		if (!PageAnon(hpage)) {
			/*
			 * In shared mappings, try_to_unmap could potentially
			 * call huge_pmd_unshare.  Because of this, take
			 * semaphore in write mode here and set TTU_RMAP_LOCKED
			 * to let lower levels know we have taken the lock.
			 */
			mapping = hugetlb_page_mapping_lock_write(hpage);
			if (unlikely(!mapping))
				goto unlock_put_anon;

			mapping_locked = true;
			ttu |= TTU_RMAP_LOCKED;
		}

		try_to_migrate(src, ttu);
		page_was_mapped = 1;

		if (mapping_locked)
			i_mmap_unlock_write(mapping);
	}

	/* Only move the contents once no mappings of the source remain. */
	if (!page_mapped(hpage))
		rc = move_to_new_page(new_hpage, hpage, mode);

	/* Point the migration entries at the new page, or back at the old. */
	if (page_was_mapped)
		remove_migration_ptes(src,
			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);

unlock_put_anon:
	unlock_page(new_hpage);

put_anon:
	if (anon_vma)
		put_anon_vma(anon_vma);

	if (rc == MIGRATEPAGE_SUCCESS) {
		move_hugetlb_state(hpage, new_hpage, reason);
		/* The target is now in use: do not hand it to the callback. */
		put_new_page = NULL;
	}

out_unlock:
	unlock_page(hpage);
out:
	if (rc == MIGRATEPAGE_SUCCESS)
		putback_active_hugepage(hpage);
	else if (rc != -EAGAIN)
		list_move_tail(&hpage->lru, ret);

	/*
	 * If migration was not successful and there's a freeing callback, use
	 * it.  Otherwise, putback_active_hugepage() is called on the new page.
	 */
	if (put_new_page)
		put_new_page(new_hpage, private);
	else
		putback_active_hugepage(new_hpage);

	return rc;
}
1297
1298static inline int try_split_thp(struct page *page, struct page **page2,
1299				struct list_head *from)
1300{
1301	int rc = 0;
1302
1303	lock_page(page);
1304	rc = split_huge_page_to_list(page, from);
1305	unlock_page(page);
1306	if (!rc)
1307		list_safe_reset_next(page, *page2, lru);
1308
1309	return rc;
1310}
1311
1312/*
1313 * migrate_pages - migrate the pages specified in a list, to the free pages
1314 *		   supplied as the target for the page migration
1315 *
1316 * @from:		The list of pages to be migrated.
1317 * @get_new_page:	The function used to allocate free pages to be used
1318 *			as the target of the page migration.
1319 * @put_new_page:	The function used to free target pages if migration
1320 *			fails, or NULL if no special handling is necessary.
1321 * @private:		Private data to be passed on to get_new_page()
1322 * @mode:		The migration mode that specifies the constraints for
1323 *			page migration, if any.
1324 * @reason:		The reason for page migration.
1325 * @ret_succeeded:	Set to the number of normal pages migrated successfully if
1326 *			the caller passes a non-NULL pointer.
1327 *
1328 * The function returns after 10 attempts or if no pages are movable any more
1329 * because the list has become empty or no retryable pages exist any more.
1330 * It is caller's responsibility to call putback_movable_pages() to return pages
1331 * to the LRU or free list only if ret != 0.
1332 *
1333 * Returns the number of {normal page, THP, hugetlb} that were not migrated, or
1334 * an error code. The number of THP splits will be considered as the number of
1335 * non-migrated THP, no matter how many subpages of the THP are migrated successfully.
1336 */
1337int migrate_pages(struct list_head *from, new_page_t get_new_page,
1338		free_page_t put_new_page, unsigned long private,
1339		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
1340{
1341	int retry = 1;
1342	int thp_retry = 1;
1343	int nr_failed = 0;
1344	int nr_failed_pages = 0;
1345	int nr_succeeded = 0;
1346	int nr_thp_succeeded = 0;
1347	int nr_thp_failed = 0;
1348	int nr_thp_split = 0;
1349	int pass = 0;
1350	bool is_thp = false;
1351	struct page *page;
1352	struct page *page2;
1353	int rc, nr_subpages;
1354	LIST_HEAD(ret_pages);
1355	LIST_HEAD(thp_split_pages);
1356	bool nosplit = (reason == MR_NUMA_MISPLACED);
1357	bool no_subpage_counting = false;
1358
1359	trace_mm_migrate_pages_start(mode, reason);
1360
1361thp_subpage_migration:
1362	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
1363		retry = 0;
1364		thp_retry = 0;
1365
1366		list_for_each_entry_safe(page, page2, from, lru) {
1367retry:
1368			/*
1369			 * THP statistics is based on the source huge page.
1370			 * Capture required information that might get lost
1371			 * during migration.
1372			 */
1373			is_thp = PageTransHuge(page) && !PageHuge(page);
1374			nr_subpages = compound_nr(page);
1375			cond_resched();
1376
1377			if (PageHuge(page))
1378				rc = unmap_and_move_huge_page(get_new_page,
1379						put_new_page, private, page,
1380						pass > 2, mode, reason,
1381						&ret_pages);
1382			else
1383				rc = unmap_and_move(get_new_page, put_new_page,
1384						private, page, pass > 2, mode,
1385						reason, &ret_pages);
1386			/*
1387			 * The rules are:
1388			 *	Success: non hugetlb page will be freed, hugetlb
1389			 *		 page will be put back
1390			 *	-EAGAIN: stay on the from list
1391			 *	-ENOMEM: stay on the from list
1392			 *	Other errno: put on ret_pages list then splice to
1393			 *		     from list
1394			 */
1395			switch(rc) {
1396			/*
1397			 * THP migration might be unsupported or the
1398			 * allocation could've failed so we should
1399			 * retry on the same page with the THP split
1400			 * to base pages.
1401			 *
1402			 * Head page is retried immediately and tail
1403			 * pages are added to the tail of the list so
1404			 * we encounter them after the rest of the list
1405			 * is processed.
1406			 */
1407			case -ENOSYS:
1408				/* THP migration is unsupported */
1409				if (is_thp) {
1410					nr_thp_failed++;
1411					if (!try_split_thp(page, &page2, &thp_split_pages)) {
1412						nr_thp_split++;
1413						goto retry;
1414					}
1415
1416					nr_failed_pages += nr_subpages;
1417					break;
1418				}
1419
1420				/* Hugetlb migration is unsupported */
1421				if (!no_subpage_counting)
1422					nr_failed++;
1423				nr_failed_pages += nr_subpages;
1424				break;
1425			case -ENOMEM:
1426				/*
1427				 * When memory is low, don't bother to try to migrate
1428				 * other pages, just exit.
1429				 * THP NUMA faulting doesn't split THP to retry.
1430				 */
1431				if (is_thp && !nosplit) {
1432					nr_thp_failed++;
1433					if (!try_split_thp(page, &page2, &thp_split_pages)) {
1434						nr_thp_split++;
1435						goto retry;
1436					}
1437
1438					nr_failed_pages += nr_subpages;
1439					goto out;
1440				}
1441
1442				if (!no_subpage_counting)
1443					nr_failed++;
1444				nr_failed_pages += nr_subpages;
1445				goto out;
1446			case -EAGAIN:
1447				if (is_thp) {
1448					thp_retry++;
1449					break;
1450				}
1451				retry++;
1452				break;
1453			case MIGRATEPAGE_SUCCESS:
1454				nr_succeeded += nr_subpages;
1455				if (is_thp) {
1456					nr_thp_succeeded++;
1457					break;
1458				}
1459				break;
1460			default:
1461				/*
1462				 * Permanent failure (-EBUSY, etc.):
1463				 * unlike -EAGAIN case, the failed page is
1464				 * removed from migration page list and not
1465				 * retried in the next outer loop.
1466				 */
1467				if (is_thp) {
1468					nr_thp_failed++;
1469					nr_failed_pages += nr_subpages;
1470					break;
1471				}
1472
1473				if (!no_subpage_counting)
1474					nr_failed++;
1475				nr_failed_pages += nr_subpages;
1476				break;
1477			}
1478		}
1479	}
1480	nr_failed += retry;
1481	nr_thp_failed += thp_retry;
1482	/*
1483	 * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
1484	 * counting in this round, since all subpages of a THP is counted
1485	 * as 1 failure in the first round.
1486	 */
1487	if (!list_empty(&thp_split_pages)) {
1488		/*
1489		 * Move non-migrated pages (after 10 retries) to ret_pages
1490		 * to avoid migrating them again.
1491		 */
1492		list_splice_init(from, &ret_pages);
1493		list_splice_init(&thp_split_pages, from);
1494		no_subpage_counting = true;
1495		retry = 1;
1496		goto thp_subpage_migration;
1497	}
1498
1499	rc = nr_failed + nr_thp_failed;
1500out:
1501	/*
1502	 * Put the permanent failure page back to migration list, they
1503	 * will be put back to the right list by the caller.
1504	 */
1505	list_splice(&ret_pages, from);
1506
1507	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1508	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
1509	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1510	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1511	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1512	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
1513			       nr_thp_failed, nr_thp_split, mode, reason);
1514
1515	if (ret_succeeded)
1516		*ret_succeeded = nr_succeeded;
1517
1518	return rc;
1519}
1520
1521struct page *alloc_migration_target(struct page *page, unsigned long private)
1522{
1523	struct folio *folio = page_folio(page);
1524	struct migration_target_control *mtc;
1525	gfp_t gfp_mask;
1526	unsigned int order = 0;
1527	struct folio *new_folio = NULL;
1528	int nid;
1529	int zidx;
1530
1531	mtc = (struct migration_target_control *)private;
1532	gfp_mask = mtc->gfp_mask;
1533	nid = mtc->nid;
1534	if (nid == NUMA_NO_NODE)
1535		nid = folio_nid(folio);
1536
1537	if (folio_test_hugetlb(folio)) {
1538		struct hstate *h = page_hstate(&folio->page);
1539
1540		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1541		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1542	}
1543
1544	if (folio_test_large(folio)) {
1545		/*
1546		 * clear __GFP_RECLAIM to make the migration callback
1547		 * consistent with regular THP allocations.
1548		 */
1549		gfp_mask &= ~__GFP_RECLAIM;
1550		gfp_mask |= GFP_TRANSHUGE;
1551		order = folio_order(folio);
1552	}
1553	zidx = zone_idx(folio_zone(folio));
1554	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1555		gfp_mask |= __GFP_HIGHMEM;
1556
1557	new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
1558
1559	return &new_folio->page;
1560}
1561
1562#ifdef CONFIG_NUMA
1563
1564static int store_status(int __user *status, int start, int value, int nr)
1565{
1566	while (nr-- > 0) {
1567		if (put_user(value, status + start))
1568			return -EFAULT;
1569		start++;
1570	}
1571
1572	return 0;
1573}
1574
1575static int do_move_pages_to_node(struct mm_struct *mm,
1576		struct list_head *pagelist, int node)
1577{
1578	int err;
1579	struct migration_target_control mtc = {
1580		.nid = node,
1581		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1582	};
1583
1584	err = migrate_pages(pagelist, alloc_migration_target, NULL,
1585		(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1586	if (err)
1587		putback_movable_pages(pagelist);
1588	return err;
1589}
1590
1591/*
1592 * Resolves the given address to a struct page, isolates it from the LRU and
1593 * puts it to the given pagelist.
1594 * Returns:
1595 *     errno - if the page cannot be found/isolated
1596 *     0 - when it doesn't have to be migrated because it is already on the
1597 *         target node
1598 *     1 - when it has been queued
1599 */
1600static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1601		int node, struct list_head *pagelist, bool migrate_all)
1602{
1603	struct vm_area_struct *vma;
1604	struct page *page;
1605	int err;
1606
1607	mmap_read_lock(mm);
1608	err = -EFAULT;
1609	vma = find_vma(mm, addr);
1610	if (!vma || addr < vma->vm_start || !vma_migratable(vma))
1611		goto out;
1612
1613	/* FOLL_DUMP to ignore special (like zero) pages */
1614	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
1615
1616	err = PTR_ERR(page);
1617	if (IS_ERR(page))
1618		goto out;
1619
1620	err = -ENOENT;
1621	if (!page)
1622		goto out;
1623
1624	err = 0;
1625	if (page_to_nid(page) == node)
1626		goto out_putpage;
1627
1628	err = -EACCES;
1629	if (page_mapcount(page) > 1 && !migrate_all)
1630		goto out_putpage;
1631
1632	if (PageHuge(page)) {
1633		if (PageHead(page)) {
1634			isolate_huge_page(page, pagelist);
1635			err = 1;
1636		}
1637	} else {
1638		struct page *head;
1639
1640		head = compound_head(page);
1641		err = isolate_lru_page(head);
1642		if (err)
1643			goto out_putpage;
1644
1645		err = 1;
1646		list_add_tail(&head->lru, pagelist);
1647		mod_node_page_state(page_pgdat(head),
1648			NR_ISOLATED_ANON + page_is_file_lru(head),
1649			thp_nr_pages(head));
1650	}
1651out_putpage:
1652	/*
1653	 * Either remove the duplicate refcount from
1654	 * isolate_lru_page() or drop the page ref if it was
1655	 * not isolated.
1656	 */
1657	put_page(page);
1658out:
1659	mmap_read_unlock(mm);
1660	return err;
1661}
1662
1663static int move_pages_and_store_status(struct mm_struct *mm, int node,
1664		struct list_head *pagelist, int __user *status,
1665		int start, int i, unsigned long nr_pages)
1666{
1667	int err;
1668
1669	if (list_empty(pagelist))
1670		return 0;
1671
1672	err = do_move_pages_to_node(mm, pagelist, node);
1673	if (err) {
1674		/*
1675		 * Positive err means the number of failed
1676		 * pages to migrate.  Since we are going to
1677		 * abort and return the number of non-migrated
1678		 * pages, so need to include the rest of the
1679		 * nr_pages that have not been attempted as
1680		 * well.
1681		 */
1682		if (err > 0)
1683			err += nr_pages - i - 1;
1684		return err;
1685	}
1686	return store_status(status, start, node, i - start);
1687}
1688
/*
 * Migrate an array of page address onto an array of nodes and fill
 * the corresponding array of status.
 *
 * @mm:		address space the addresses belong to.
 * @task_nodes:	nodes the request may target; any other node yields -EACCES.
 * @nr_pages:	number of entries in @pages, @nodes and @status.
 * @pages:	user array of page addresses.
 * @nodes:	user array of target nodes, one per address.
 * @status:	user array that receives the per-page result.
 * @flags:	only MPOL_MF_MOVE_ALL is looked at here.
 *
 * Consecutive entries with the same target node are collected on one list
 * and migrated as a batch. A batch is flushed when the target node changes,
 * when a page could not be queued, and once more at the end.
 *
 * Returns 0 on success, a negative errno, or a positive count of pages
 * that were not migrated.
 */
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	/* Node of the batch being built; NUMA_NO_NODE means no open batch. */
	int current_node = NUMA_NO_NODE;
	LIST_HEAD(pagelist);
	/* start is the index of the first entry of the current batch. */
	int start, i;
	int err = 0, err1;

	lru_cache_disable();

	for (i = start = 0; i < nr_pages; i++) {
		const void __user *p;
		unsigned long addr;
		int node;

		err = -EFAULT;
		if (get_user(p, pages + i))
			goto out_flush;
		if (get_user(node, nodes + i))
			goto out_flush;
		addr = (unsigned long)untagged_addr(p);

		err = -ENODEV;
		if (node < 0 || node >= MAX_NUMNODES)
			goto out_flush;
		if (!node_state(node, N_MEMORY))
			goto out_flush;

		err = -EACCES;
		if (!node_isset(node, task_nodes))
			goto out_flush;

		if (current_node == NUMA_NO_NODE) {
			/* Open a new batch at this entry. */
			current_node = node;
			start = i;
		} else if (node != current_node) {
			/* Target changed: flush the old batch, start anew. */
			err = move_pages_and_store_status(mm, current_node,
					&pagelist, status, start, i, nr_pages);
			if (err)
				goto out;
			start = i;
			current_node = node;
		}

		/*
		 * Errors in the page lookup or isolation are not fatal and we simply
		 * report them via status
		 */
		err = add_page_for_migration(mm, addr, current_node,
				&pagelist, flags & MPOL_MF_MOVE_ALL);

		if (err > 0) {
			/* The page is successfully queued for migration */
			continue;
		}

		/*
		 * The move_pages() man page does not have an -EEXIST choice, so
		 * use -EFAULT instead.
		 */
		if (err == -EEXIST)
			err = -EFAULT;

		/*
		 * If the page is already on the target node (!err), store the
		 * node, otherwise, store the err.
		 */
		err = store_status(status, i, err ? : current_node, 1);
		if (err)
			goto out_flush;

		/* Entry i is settled; flush the pages queued before it. */
		err = move_pages_and_store_status(mm, current_node, &pagelist,
				status, start, i, nr_pages);
		if (err)
			goto out;
		current_node = NUMA_NO_NODE;
	}
out_flush:
	/* Make sure we do not overwrite the existing error */
	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
				status, start, i, nr_pages);
	if (err >= 0)
		err = err1;
out:
	lru_cache_enable();
	return err;
}
1783
1784/*
1785 * Determine the nodes of an array of pages and store it in an array of status.
1786 */
1787static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1788				const void __user **pages, int *status)
1789{
1790	unsigned long i;
1791
1792	mmap_read_lock(mm);
1793
1794	for (i = 0; i < nr_pages; i++) {
1795		unsigned long addr = (unsigned long)(*pages);
1796		struct vm_area_struct *vma;
1797		struct page *page;
1798		int err = -EFAULT;
1799
1800		vma = vma_lookup(mm, addr);
1801		if (!vma)
1802			goto set_status;
1803
1804		/* FOLL_DUMP to ignore special (like zero) pages */
1805		page = follow_page(vma, addr, FOLL_DUMP);
1806
1807		err = PTR_ERR(page);
1808		if (IS_ERR(page))
1809			goto set_status;
1810
1811		err = page ? page_to_nid(page) : -ENOENT;
1812set_status:
1813		*status = err;
1814
1815		pages++;
1816		status++;
1817	}
1818
1819	mmap_read_unlock(mm);
1820}
1821
1822static int get_compat_pages_array(const void __user *chunk_pages[],
1823				  const void __user * __user *pages,
1824				  unsigned long chunk_nr)
1825{
1826	compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
1827	compat_uptr_t p;
1828	int i;
1829
1830	for (i = 0; i < chunk_nr; i++) {
1831		if (get_user(p, pages32 + i))
1832			return -EFAULT;
1833		chunk_pages[i] = compat_ptr(p);
1834	}
1835
1836	return 0;
1837}
1838
1839/*
1840 * Determine the nodes of a user array of pages and store it in
1841 * a user array of status.
1842 */
1843static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1844			 const void __user * __user *pages,
1845			 int __user *status)
1846{
1847#define DO_PAGES_STAT_CHUNK_NR 16
1848	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1849	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1850
1851	while (nr_pages) {
1852		unsigned long chunk_nr;
1853
1854		chunk_nr = nr_pages;
1855		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1856			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1857
1858		if (in_compat_syscall()) {
1859			if (get_compat_pages_array(chunk_pages, pages,
1860						   chunk_nr))
1861				break;
1862		} else {
1863			if (copy_from_user(chunk_pages, pages,
1864				      chunk_nr * sizeof(*chunk_pages)))
1865				break;
1866		}
1867
1868		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1869
1870		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1871			break;
1872
1873		pages += chunk_nr;
1874		status += chunk_nr;
1875		nr_pages -= chunk_nr;
1876	}
1877	return nr_pages ? -EFAULT : 0;
1878}
1879
1880static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1881{
1882	struct task_struct *task;
1883	struct mm_struct *mm;
1884
1885	/*
1886	 * There is no need to check if current process has the right to modify
1887	 * the specified process when they are same.
1888	 */
1889	if (!pid) {
1890		mmget(current->mm);
1891		*mem_nodes = cpuset_mems_allowed(current);
1892		return current->mm;
1893	}
1894
1895	/* Find the mm_struct */
1896	rcu_read_lock();
1897	task = find_task_by_vpid(pid);
1898	if (!task) {
1899		rcu_read_unlock();
1900		return ERR_PTR(-ESRCH);
1901	}
1902	get_task_struct(task);
1903
1904	/*
1905	 * Check if this process has the right to modify the specified
1906	 * process. Use the regular "ptrace_may_access()" checks.
1907	 */
1908	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1909		rcu_read_unlock();
1910		mm = ERR_PTR(-EPERM);
1911		goto out;
1912	}
1913	rcu_read_unlock();
1914
1915	mm = ERR_PTR(security_task_movememory(task));
1916	if (IS_ERR(mm))
1917		goto out;
1918	*mem_nodes = cpuset_mems_allowed(task);
1919	mm = get_task_mm(task);
1920out:
1921	put_task_struct(task);
1922	if (!mm)
1923		mm = ERR_PTR(-EINVAL);
1924	return mm;
1925}
1926
1927/*
1928 * Move a list of pages in the address space of the currently executing
1929 * process.
1930 */
1931static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
1932			     const void __user * __user *pages,
1933			     const int __user *nodes,
1934			     int __user *status, int flags)
1935{
1936	struct mm_struct *mm;
1937	int err;
1938	nodemask_t task_nodes;
1939
1940	/* Check flags */
1941	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1942		return -EINVAL;
1943
1944	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1945		return -EPERM;
1946
1947	mm = find_mm_struct(pid, &task_nodes);
1948	if (IS_ERR(mm))
1949		return PTR_ERR(mm);
1950
1951	if (nodes)
1952		err = do_pages_move(mm, task_nodes, nr_pages, pages,
1953				    nodes, status, flags);
1954	else
1955		err = do_pages_stat(mm, nr_pages, pages, status);
1956
1957	mmput(mm);
1958	return err;
1959}
1960
/*
 * move_pages system call entry point: hands all six arguments unchanged to
 * kernel_move_pages() and returns its result.
 */
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
1968
1969#ifdef CONFIG_NUMA_BALANCING
1970/*
1971 * Returns true if this is a safe migration target node for misplaced NUMA
1972 * pages. Currently it only checks the watermarks which crude
1973 */
1974static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1975				   unsigned long nr_migrate_pages)
1976{
1977	int z;
1978
1979	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1980		struct zone *zone = pgdat->node_zones + z;
1981
1982		if (!populated_zone(zone))
1983			continue;
1984
1985		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
1986		if (!zone_watermark_ok(zone, 0,
1987				       high_wmark_pages(zone) +
1988				       nr_migrate_pages,
1989				       ZONE_MOVABLE, 0))
1990			continue;
1991		return true;
1992	}
1993	return false;
1994}
1995
1996static struct page *alloc_misplaced_dst_page(struct page *page,
1997					   unsigned long data)
1998{
1999	int nid = (int) data;
2000	int order = compound_order(page);
2001	gfp_t gfp = __GFP_THISNODE;
2002	struct folio *new;
2003
2004	if (order > 0)
2005		gfp |= GFP_TRANSHUGE_LIGHT;
2006	else {
2007		gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
2008			__GFP_NOWARN;
2009		gfp &= ~__GFP_RECLAIM;
2010	}
2011	new = __folio_alloc_node(gfp, order, nid);
2012
2013	return &new->page;
2014}
2015
2016static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
2017{
2018	int page_lru;
2019	int nr_pages = thp_nr_pages(page);
2020	int order = compound_order(page);
2021
2022	VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
2023
2024	/* Do not migrate THP mapped by multiple processes */
2025	if (PageTransHuge(page) && total_mapcount(page) > 1)
2026		return 0;
2027
2028	/* Avoid migrating to a node that is nearly full */
2029	if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
2030		int z;
2031
2032		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
2033			return 0;
2034		for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2035			if (populated_zone(pgdat->node_zones + z))
2036				break;
2037		}
2038		wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
2039		return 0;
2040	}
2041
2042	if (isolate_lru_page(page))
2043		return 0;
2044
2045	page_lru = page_is_file_lru(page);
2046	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
2047			    nr_pages);
2048
2049	/*
2050	 * Isolating the page has taken another reference, so the
2051	 * caller's reference can be safely dropped without the page
2052	 * disappearing underneath us during migration.
2053	 */
2054	put_page(page);
2055	return 1;
2056}
2057
2058/*
2059 * Attempt to migrate a misplaced page to the specified destination
2060 * node. Caller is expected to have an elevated reference count on
2061 * the page that will be dropped by this function before returning.
2062 */
2063int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2064			   int node)
2065{
2066	pg_data_t *pgdat = NODE_DATA(node);
2067	int isolated;
2068	int nr_remaining;
2069	unsigned int nr_succeeded;
2070	LIST_HEAD(migratepages);
2071	int nr_pages = thp_nr_pages(page);
2072
2073	/*
2074	 * Don't migrate file pages that are mapped in multiple processes
2075	 * with execute permissions as they are probably shared libraries.
2076	 */
2077	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2078	    (vma->vm_flags & VM_EXEC))
2079		goto out;
2080
2081	/*
2082	 * Also do not migrate dirty pages as not all filesystems can move
2083	 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
2084	 */
2085	if (page_is_file_lru(page) && PageDirty(page))
2086		goto out;
2087
2088	isolated = numamigrate_isolate_page(pgdat, page);
2089	if (!isolated)
2090		goto out;
2091
2092	list_add(&page->lru, &migratepages);
2093	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
2094				     NULL, node, MIGRATE_ASYNC,
2095				     MR_NUMA_MISPLACED, &nr_succeeded);
2096	if (nr_remaining) {
2097		if (!list_empty(&migratepages)) {
2098			list_del(&page->lru);
2099			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2100					page_is_file_lru(page), -nr_pages);
2101			putback_lru_page(page);
2102		}
2103		isolated = 0;
2104	}
2105	if (nr_succeeded) {
2106		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
2107		if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
2108			mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
2109					    nr_succeeded);
2110	}
2111	BUG_ON(!list_empty(&migratepages));
2112	return isolated;
2113
2114out:
2115	put_page(page);
2116	return 0;
2117}
2118#endif /* CONFIG_NUMA_BALANCING */
2119#endif /* CONFIG_NUMA */
2120
/*
 * node_demotion[] example:
 *
 * Consider a system with two sockets.  Each socket has
 * three classes of memory attached: fast, medium and slow.
 * Each memory class is placed in its own NUMA node.  The
 * CPUs are placed in the node with the "fast" memory.  The
 * 6 NUMA nodes (0-5) might be split among the sockets like
 * this:
 *
 *	Socket A: 0, 1, 2
 *	Socket B: 3, 4, 5
 *
 * When Node 0 fills up, its memory should be migrated to
 * Node 1.  When Node 1 fills up, it should be migrated to
 * Node 2.  The migration path starts on the nodes with the
 * processors (since allocations default to this node) and
 * fast memory, progresses through medium and ends with the
 * slow memory:
 *
 *	0 -> 1 -> 2 -> stop
 *	3 -> 4 -> 5 -> stop
 *
 * This is represented in the node_demotion[] like this:
 *
 *	{  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
 *	{  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
 *	{  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
 *	{  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
 *	{  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
 *	{  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
 *
 * Moreover, some systems may have multiple slow memory nodes.
 * Suppose a system has one socket with 3 memory nodes: node 0
 * is the fast memory type, nodes 1 and 2 are both the slow
 * memory type, and the distance from the fast memory node to
 * each slow memory node is the same.  The migration path should
 * then be:
 *
 *	0 -> 1/2 -> stop
 *
 * This is represented in the node_demotion[] like this:
 *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
 *	{ nr=0, nodes[0]=-1, }, // Node 1 does not migrate
 *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate
 */
2166
/* Default capacity of the per-node demotion target list. */
#define DEFAULT_DEMOTION_TARGET_NODES 15

/*
 * DEMOTION_TARGET_NODES is the size of demotion_nodes.nodes[].  Builds
 * whose MAX_NUMNODES is below the default use MAX_NUMNODES - 1 instead.
 */
#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
#define DEMOTION_TARGET_NODES	(MAX_NUMNODES - 1)
#else
#define DEMOTION_TARGET_NODES	DEFAULT_DEMOTION_TARGET_NODES
#endif

/*
 * Demotion targets of a single source node.
 *
 * nr:    number of entries of nodes[] currently in use; 0 means the
 *        node has no demotion target.
 * nodes: target node ids; slots are reset to NUMA_NO_NODE when the
 *        targets are disabled.
 */
struct demotion_nodes {
	unsigned short nr;
	short nodes[DEMOTION_TARGET_NODES];
};

/*
 * node_demotion[] is indexed by source node id and is allocated with
 * nr_node_ids entries by migrate_on_reclaim_init(); it stays NULL if
 * that allocation never happened, so users must check for NULL.
 *
 * Writes to this array occur without locking.  Cycles are
 * not allowed: Node X demotes to Y which demotes to X...
 *
 * If multiple reads are performed, a single rcu_read_lock()
 * must be held over all reads to ensure that no cycles are
 * observed.
 */
static struct demotion_nodes *node_demotion __read_mostly;
2189
2190/**
2191 * next_demotion_node() - Get the next node in the demotion path
2192 * @node: The starting node to lookup the next node
2193 *
2194 * Return: node id for next memory node in the demotion path hierarchy
2195 * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
2196 * @node online or guarantee that it *continues* to be the next demotion
2197 * target.
2198 */
2199int next_demotion_node(int node)
2200{
2201	struct demotion_nodes *nd;
2202	unsigned short target_nr, index;
2203	int target;
2204
2205	if (!node_demotion)
2206		return NUMA_NO_NODE;
2207
2208	nd = &node_demotion[node];
2209
2210	/*
2211	 * node_demotion[] is updated without excluding this
2212	 * function from running.  RCU doesn't provide any
2213	 * compiler barriers, so the READ_ONCE() is required
2214	 * to avoid compiler reordering or read merging.
2215	 *
2216	 * Make sure to use RCU over entire code blocks if
2217	 * node_demotion[] reads need to be consistent.
2218	 */
2219	rcu_read_lock();
2220	target_nr = READ_ONCE(nd->nr);
2221
2222	switch (target_nr) {
2223	case 0:
2224		target = NUMA_NO_NODE;
2225		goto out;
2226	case 1:
2227		index = 0;
2228		break;
2229	default:
2230		/*
2231		 * If there are multiple target nodes, just select one
2232		 * target node randomly.
2233		 *
2234		 * In addition, we can also use round-robin to select
2235		 * target node, but we should introduce another variable
2236		 * for node_demotion[] to record last selected target node,
2237		 * that may cause cache ping-pong due to the changing of
2238		 * last target node. Or introducing per-cpu data to avoid
2239		 * caching issue, which seems more complicated. So selecting
2240		 * target node randomly seems better until now.
2241		 */
2242		index = get_random_int() % target_nr;
2243		break;
2244	}
2245
2246	target = READ_ONCE(nd->nodes[index]);
2247
2248out:
2249	rcu_read_unlock();
2250	return target;
2251}
2252
2253#if defined(CONFIG_HOTPLUG_CPU)
2254/* Disable reclaim-based migration. */
2255static void __disable_all_migrate_targets(void)
2256{
2257	int node, i;
2258
2259	if (!node_demotion)
2260		return;
2261
2262	for_each_online_node(node) {
2263		node_demotion[node].nr = 0;
2264		for (i = 0; i < DEMOTION_TARGET_NODES; i++)
2265			node_demotion[node].nodes[i] = NUMA_NO_NODE;
2266	}
2267}
2268
/*
 * Clear every online node's demotion targets and then wait for an RCU
 * grace period, so that readers which started before the clear have
 * finished by the time this returns.
 */
static void disable_all_migrate_targets(void)
{
	__disable_all_migrate_targets();

	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after.  They will never see before and
	 * after state together.
	 *
	 * The before+after state together might have cycles and
	 * could cause readers to do things like loop until this
	 * function finishes.  This ensures they can only see a
	 * single "bad" read and would, for instance, only loop
	 * once.
	 */
	synchronize_rcu();
}
2287
2288/*
2289 * Find an automatic demotion target for 'node'.
2290 * Failing here is OK.  It might just indicate
2291 * being at the end of a chain.
2292 */
2293static int establish_migrate_target(int node, nodemask_t *used,
2294				    int best_distance)
2295{
2296	int migration_target, index, val;
2297	struct demotion_nodes *nd;
2298
2299	if (!node_demotion)
2300		return NUMA_NO_NODE;
2301
2302	nd = &node_demotion[node];
2303
2304	migration_target = find_next_best_node(node, used);
2305	if (migration_target == NUMA_NO_NODE)
2306		return NUMA_NO_NODE;
2307
2308	/*
2309	 * If the node has been set a migration target node before,
2310	 * which means it's the best distance between them. Still
2311	 * check if this node can be demoted to other target nodes
2312	 * if they have a same best distance.
2313	 */
2314	if (best_distance != -1) {
2315		val = node_distance(node, migration_target);
2316		if (val > best_distance)
2317			goto out_clear;
2318	}
2319
2320	index = nd->nr;
2321	if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
2322		      "Exceeds maximum demotion target nodes\n"))
2323		goto out_clear;
2324
2325	nd->nodes[index] = migration_target;
2326	nd->nr++;
2327
2328	return migration_target;
2329out_clear:
2330	node_clear(migration_target, *used);
2331	return NUMA_NO_NODE;
2332}
2333
2334/*
2335 * When memory fills up on a node, memory contents can be
2336 * automatically migrated to another node instead of
2337 * discarded at reclaim.
2338 *
2339 * Establish a "migration path" which will start at nodes
2340 * with CPUs and will follow the priorities used to build the
2341 * page allocator zonelists.
2342 *
2343 * The difference here is that cycles must be avoided.  If
2344 * node0 migrates to node1, then neither node1, nor anything
2345 * node1 migrates to can migrate to node0. Also one node can
2346 * be migrated to multiple nodes if the target nodes all have
2347 * a same best-distance against the source node.
2348 *
2349 * This function can run simultaneously with readers of
2350 * node_demotion[].  However, it can not run simultaneously
2351 * with itself.  Exclusion is provided by memory hotplug events
2352 * being single-threaded.
2353 */
2354static void __set_migration_target_nodes(void)
2355{
2356	nodemask_t next_pass	= NODE_MASK_NONE;
2357	nodemask_t this_pass	= NODE_MASK_NONE;
2358	nodemask_t used_targets = NODE_MASK_NONE;
2359	int node, best_distance;
2360
2361	/*
2362	 * Avoid any oddities like cycles that could occur
2363	 * from changes in the topology.  This will leave
2364	 * a momentary gap when migration is disabled.
2365	 */
2366	disable_all_migrate_targets();
2367
2368	/*
2369	 * Allocations go close to CPUs, first.  Assume that
2370	 * the migration path starts at the nodes with CPUs.
2371	 */
2372	next_pass = node_states[N_CPU];
2373again:
2374	this_pass = next_pass;
2375	next_pass = NODE_MASK_NONE;
2376	/*
2377	 * To avoid cycles in the migration "graph", ensure
2378	 * that migration sources are not future targets by
2379	 * setting them in 'used_targets'.  Do this only
2380	 * once per pass so that multiple source nodes can
2381	 * share a target node.
2382	 *
2383	 * 'used_targets' will become unavailable in future
2384	 * passes.  This limits some opportunities for
2385	 * multiple source nodes to share a destination.
2386	 */
2387	nodes_or(used_targets, used_targets, this_pass);
2388
2389	for_each_node_mask(node, this_pass) {
2390		best_distance = -1;
2391
2392		/*
2393		 * Try to set up the migration path for the node, and the target
2394		 * migration nodes can be multiple, so doing a loop to find all
2395		 * the target nodes if they all have a best node distance.
2396		 */
2397		do {
2398			int target_node =
2399				establish_migrate_target(node, &used_targets,
2400							 best_distance);
2401
2402			if (target_node == NUMA_NO_NODE)
2403				break;
2404
2405			if (best_distance == -1)
2406				best_distance = node_distance(node, target_node);
2407
2408			/*
2409			 * Visit targets from this pass in the next pass.
2410			 * Eventually, every node will have been part of
2411			 * a pass, and will become set in 'used_targets'.
2412			 */
2413			node_set(target_node, next_pass);
2414		} while (1);
2415	}
2416	/*
2417	 * 'next_pass' contains nodes which became migration
2418	 * targets in this pass.  Make additional passes until
2419	 * no more migrations targets are available.
2420	 */
2421	if (!nodes_empty(next_pass))
2422		goto again;
2423}
2424
/*
 * Rebuild the demotion targets of all nodes.
 *
 * For callers that do not hold get_online_mems() already: the rebuild
 * is bracketed by get_online_mems()/put_online_mems() here.
 */
void set_migration_target_nodes(void)
{
	get_online_mems();
	__set_migration_target_nodes();
	put_online_mems();
}
2434
2435/*
2436 * This leaves migrate-on-reclaim transiently disabled between
2437 * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
2438 * whether reclaim-based migration is enabled or not, which
2439 * ensures that the user can turn reclaim-based migration at
2440 * any time without needing to recalculate migration targets.
2441 *
2442 * These callbacks already hold get_online_mems().  That is why
2443 * __set_migration_target_nodes() can be used as opposed to
2444 * set_migration_target_nodes().
2445 */
2446static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
2447						 unsigned long action, void *_arg)
2448{
2449	struct memory_notify *arg = _arg;
2450
2451	/*
2452	 * Only update the node migration order when a node is
2453	 * changing status, like online->offline.  This avoids
2454	 * the overhead of synchronize_rcu() in most cases.
2455	 */
2456	if (arg->status_change_nid < 0)
2457		return notifier_from_errno(0);
2458
2459	switch (action) {
2460	case MEM_GOING_OFFLINE:
2461		/*
2462		 * Make sure there are not transient states where
2463		 * an offline node is a migration target.  This
2464		 * will leave migration disabled until the offline
2465		 * completes and the MEM_OFFLINE case below runs.
2466		 */
2467		disable_all_migrate_targets();
2468		break;
2469	case MEM_OFFLINE:
2470	case MEM_ONLINE:
2471		/*
2472		 * Recalculate the target nodes once the node
2473		 * reaches its final state (online or offline).
2474		 */
2475		__set_migration_target_nodes();
2476		break;
2477	case MEM_CANCEL_OFFLINE:
2478		/*
2479		 * MEM_GOING_OFFLINE disabled all the migration
2480		 * targets.  Reenable them.
2481		 */
2482		__set_migration_target_nodes();
2483		break;
2484	case MEM_GOING_ONLINE:
2485	case MEM_CANCEL_ONLINE:
2486		break;
2487	}
2488
2489	return notifier_from_errno(0);
2490}
2491
2492void __init migrate_on_reclaim_init(void)
2493{
2494	node_demotion = kmalloc_array(nr_node_ids,
2495				      sizeof(struct demotion_nodes),
2496				      GFP_KERNEL);
2497	WARN_ON(!node_demotion);
2498
2499	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
2500	/*
2501	 * At this point, all numa nodes with memory/CPus have their state
2502	 * properly set, so we can build the demotion order now.
2503	 * Let us hold the cpu_hotplug lock just, as we could possibily have
2504	 * CPU hotplug events during boot.
2505	 */
2506	cpus_read_lock();
2507	set_migration_target_nodes();
2508	cpus_read_unlock();
2509}
2510#endif /* CONFIG_HOTPLUG_CPU */
2511
/*
 * Global on/off switch for NUMA demotion; off by default.  With
 * CONFIG_SYSFS it is reported and changed through the "demotion_enabled"
 * attribute handlers below.  The code that acts on the flag is not in
 * this part of the file.
 */
bool numa_demotion_enabled = false;
2513
2514#ifdef CONFIG_SYSFS
2515static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
2516					  struct kobj_attribute *attr, char *buf)
2517{
2518	return sysfs_emit(buf, "%s\n",
2519			  numa_demotion_enabled ? "true" : "false");
2520}
2521
2522static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
2523					   struct kobj_attribute *attr,
2524					   const char *buf, size_t count)
2525{
2526	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
2527		numa_demotion_enabled = true;
2528	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
2529		numa_demotion_enabled = false;
2530	else
2531		return -EINVAL;
2532
2533	return count;
2534}
2535
/*
 * The "demotion_enabled" attribute (mode 0644), served by
 * numa_demotion_enabled_show() and numa_demotion_enabled_store().
 */
static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
	       numa_demotion_enabled_store);

/* NULL-terminated list of the attributes that make up numa_attr_group. */
static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

/* Attribute group that numa_init_sysfs() creates on the "numa" kobject. */
static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};
2548
2549static int __init numa_init_sysfs(void)
2550{
2551	int err;
2552	struct kobject *numa_kobj;
2553
2554	numa_kobj = kobject_create_and_add("numa", mm_kobj);
2555	if (!numa_kobj) {
2556		pr_err("failed to create numa kobject\n");
2557		return -ENOMEM;
2558	}
2559	err = sysfs_create_group(numa_kobj, &numa_attr_group);
2560	if (err) {
2561		pr_err("failed to register numa group\n");
2562		goto delete_obj;
2563	}
2564	return 0;
2565
2566delete_obj:
2567	kobject_put(numa_kobj);
2568	return err;
2569}
2570subsys_initcall(numa_init_sysfs);
2571#endif