1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
 8 * NUMA policy allows the user to give hints about which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy an process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * preferred many Try a set of nodes first before normal fallback. This is
35 * similar to preferred without the special case.
36 *
37 * default Allocate on the local node first, or when on a VMA
38 * use the process policy. This is what Linux always did
39 * in a NUMA aware kernel and still does by, ahem, default.
40 *
 41 * The process policy is applied for most non-interrupt memory allocations
42 * in that process' context. Interrupts ignore the policies and always
43 * try to allocate on the local CPU. The VMA policy is only applied for memory
44 * allocations for a VMA in the VM.
45 *
46 * Currently there are a few corner cases in swapping where the policy
47 * is not applied, but the majority should be handled. When process policy
48 * is used it is not remembered over swap outs/swap ins.
49 *
50 * Only the highest zone in the zone hierarchy gets policied. Allocations
51 * requesting a lower zone just use default policy. This implies that
 52 * on systems with highmem, kernel lowmem allocations don't get policied.
53 * Same with GFP_DMA allocations.
54 *
55 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
56 * all users and remembered even when nobody has memory mapped.
57 */
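/*
 * Illustrative userspace sketch of the policies described above. This is
 * not part of this file; it assumes the set_mempolicy(2) and mbind(2)
 * wrappers from libnuma's <numaif.h> and uses minimal error handling.
 *
 *	#include <numaif.h>
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		unsigned long interleave_mask = (1UL << 0) | (1UL << 1);
 *		unsigned long bind_mask = 1UL << 0;
 *		size_t len = 1UL << 20;
 *		void *buf;
 *
 *		// Interleave this task's future allocations over nodes 0 and 1.
 *		if (set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
 *				  8 * sizeof(interleave_mask)))
 *			return 1;
 *
 *		// Bind one page-aligned mapping to node 0 only, no fallback.
 *		buf = aligned_alloc(4096, len);
 *		if (!buf || mbind(buf, len, MPOL_BIND, &bind_mask,
 *				  8 * sizeof(bind_mask), 0))
 *			return 1;
 *		return 0;
 *	}
 */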
58
59/* Notebook:
60 fix mmap readahead to honour policy and enable policy for any page cache
61 object
62 statistics for bigpages
63 global policy for page cache? currently it uses process policy. Requires
64 first item above.
65 handle mremap for shared memory (currently ignored for the policy)
66 grows down?
67 make bind policy root only? It can trigger oom much faster and the
68 kernel is not always grateful with that.
69*/
70
71#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72
73#include <linux/mempolicy.h>
74#include <linux/pagewalk.h>
75#include <linux/highmem.h>
76#include <linux/hugetlb.h>
77#include <linux/kernel.h>
78#include <linux/sched.h>
79#include <linux/sched/mm.h>
80#include <linux/sched/numa_balancing.h>
81#include <linux/sched/task.h>
82#include <linux/nodemask.h>
83#include <linux/cpuset.h>
84#include <linux/slab.h>
85#include <linux/string.h>
86#include <linux/export.h>
87#include <linux/nsproxy.h>
88#include <linux/interrupt.h>
89#include <linux/init.h>
90#include <linux/compat.h>
91#include <linux/ptrace.h>
92#include <linux/swap.h>
93#include <linux/seq_file.h>
94#include <linux/proc_fs.h>
95#include <linux/migrate.h>
96#include <linux/ksm.h>
97#include <linux/rmap.h>
98#include <linux/security.h>
99#include <linux/syscalls.h>
100#include <linux/ctype.h>
101#include <linux/mm_inline.h>
102#include <linux/mmu_notifier.h>
103#include <linux/printk.h>
104#include <linux/swapops.h>
105
106#include <asm/tlbflush.h>
107#include <asm/tlb.h>
108#include <linux/uaccess.h>
109
110#include "internal.h"
111
112/* Internal flags */
113#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
114#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
115
116static struct kmem_cache *policy_cache;
117static struct kmem_cache *sn_cache;
118
 119/* Highest zone. A specific allocation for a zone below that is not
120 policied. */
121enum zone_type policy_zone = 0;
122
123/*
124 * run-time system-wide default policy => local allocation
125 */
126static struct mempolicy default_policy = {
127 .refcnt = ATOMIC_INIT(1), /* never free it */
128 .mode = MPOL_LOCAL,
129};
130
131static struct mempolicy preferred_node_policy[MAX_NUMNODES];
132
133/**
134 * numa_map_to_online_node - Find closest online node
135 * @node: Node id to start the search
136 *
 137 * Look up the next closest node by distance if @node is not online.
138 *
139 * Return: this @node if it is online, otherwise the closest node by distance
140 */
141int numa_map_to_online_node(int node)
142{
143 int min_dist = INT_MAX, dist, n, min_node;
144
145 if (node == NUMA_NO_NODE || node_online(node))
146 return node;
147
148 min_node = node;
149 for_each_online_node(n) {
150 dist = node_distance(node, n);
151 if (dist < min_dist) {
152 min_dist = dist;
153 min_node = n;
154 }
155 }
156
157 return min_node;
158}
159EXPORT_SYMBOL_GPL(numa_map_to_online_node);
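/*
 * Worked example (a sketch, not extra code): if node 2 is offline,
 * node_distance(2, 0) == 20 and node_distance(2, 1) == 10, the loop above
 * settles on node 1, the closest online node. NUMA_NO_NODE and nodes that
 * are already online are returned unchanged.
 */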
160
161struct mempolicy *get_task_policy(struct task_struct *p)
162{
163 struct mempolicy *pol = p->mempolicy;
164 int node;
165
166 if (pol)
167 return pol;
168
169 node = numa_node_id();
170 if (node != NUMA_NO_NODE) {
171 pol = &preferred_node_policy[node];
172 /* preferred_node_policy is not initialised early in boot */
173 if (pol->mode)
174 return pol;
175 }
176
177 return &default_policy;
178}
179
180static const struct mempolicy_operations {
181 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
182 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
183} mpol_ops[MPOL_MAX];
184
185static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
186{
187 return pol->flags & MPOL_MODE_FLAGS;
188}
189
190static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
191 const nodemask_t *rel)
192{
193 nodemask_t tmp;
194 nodes_fold(tmp, *orig, nodes_weight(*rel));
195 nodes_onto(*ret, tmp, *rel);
196}
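/*
 * Worked example (a sketch of the intended semantics, not extra code):
 * with MPOL_F_RELATIVE_NODES the user's node numbers are interpreted
 * relative to the set of allowed nodes. If the user asked for relative
 * nodes {0,2} and the cpuset currently allows {4,5,6} (weight 3),
 * nodes_fold() wraps the request modulo 3 (still {0,2}) and nodes_onto()
 * maps those positions onto the allowed set, yielding {4,6}.
 */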
197
198static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
199{
200 if (nodes_empty(*nodes))
201 return -EINVAL;
202 pol->nodes = *nodes;
203 return 0;
204}
205
206static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
207{
208 if (nodes_empty(*nodes))
209 return -EINVAL;
210
211 nodes_clear(pol->nodes);
212 node_set(first_node(*nodes), pol->nodes);
213 return 0;
214}
215
216/*
217 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
218 * any, for the new policy. mpol_new() has already validated the nodes
219 * parameter with respect to the policy mode and flags.
220 *
221 * Must be called holding task's alloc_lock to protect task's mems_allowed
222 * and mempolicy. May also be called holding the mmap_lock for write.
223 */
224static int mpol_set_nodemask(struct mempolicy *pol,
225 const nodemask_t *nodes, struct nodemask_scratch *nsc)
226{
227 int ret;
228
229 /*
 230 * Default (pol==NULL) and local memory policies are not subject
 231 * to any remapping. They also do not need any special
 232 * constructor.
233 */
234 if (!pol || pol->mode == MPOL_LOCAL)
235 return 0;
236
237 /* Check N_MEMORY */
238 nodes_and(nsc->mask1,
239 cpuset_current_mems_allowed, node_states[N_MEMORY]);
240
241 VM_BUG_ON(!nodes);
242
243 if (pol->flags & MPOL_F_RELATIVE_NODES)
244 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
245 else
246 nodes_and(nsc->mask2, *nodes, nsc->mask1);
247
248 if (mpol_store_user_nodemask(pol))
249 pol->w.user_nodemask = *nodes;
250 else
251 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
252
253 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
254 return ret;
255}
256
257/*
 258 * This function just creates a new policy, does some checks and simple
259 * initialization. You must invoke mpol_set_nodemask() to set nodes.
260 */
261static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
262 nodemask_t *nodes)
263{
264 struct mempolicy *policy;
265
266 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
267 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
268
269 if (mode == MPOL_DEFAULT) {
270 if (nodes && !nodes_empty(*nodes))
271 return ERR_PTR(-EINVAL);
272 return NULL;
273 }
274 VM_BUG_ON(!nodes);
275
276 /*
277 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
278 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
279 * All other modes require a valid pointer to a non-empty nodemask.
280 */
281 if (mode == MPOL_PREFERRED) {
282 if (nodes_empty(*nodes)) {
283 if (((flags & MPOL_F_STATIC_NODES) ||
284 (flags & MPOL_F_RELATIVE_NODES)))
285 return ERR_PTR(-EINVAL);
286
287 mode = MPOL_LOCAL;
288 }
289 } else if (mode == MPOL_LOCAL) {
290 if (!nodes_empty(*nodes) ||
291 (flags & MPOL_F_STATIC_NODES) ||
292 (flags & MPOL_F_RELATIVE_NODES))
293 return ERR_PTR(-EINVAL);
294 } else if (nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
296 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
297 if (!policy)
298 return ERR_PTR(-ENOMEM);
299 atomic_set(&policy->refcnt, 1);
300 policy->mode = mode;
301 policy->flags = flags;
302 policy->home_node = NUMA_NO_NODE;
303
304 return policy;
305}
306
307/* Slow path of a mpol destructor. */
308void __mpol_put(struct mempolicy *p)
309{
310 if (!atomic_dec_and_test(&p->refcnt))
311 return;
312 kmem_cache_free(policy_cache, p);
313}
314
315static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
316{
317}
318
319static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
320{
321 nodemask_t tmp;
322
323 if (pol->flags & MPOL_F_STATIC_NODES)
324 nodes_and(tmp, pol->w.user_nodemask, *nodes);
325 else if (pol->flags & MPOL_F_RELATIVE_NODES)
326 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
327 else {
328 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
329 *nodes);
330 pol->w.cpuset_mems_allowed = *nodes;
331 }
332
333 if (nodes_empty(tmp))
334 tmp = *nodes;
335
336 pol->nodes = tmp;
337}
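/*
 * Worked example (a sketch of the three cases above, not extra code):
 * suppose an MPOL_BIND policy was created with nodes {2,3} while the
 * cpuset allowed {2,3}, and the cpuset is later moved to {6,7}.
 *  - MPOL_F_STATIC_NODES: user_nodemask & {6,7} is empty, so the policy
 *    falls back to the new mems {6,7}.
 *  - MPOL_F_RELATIVE_NODES: the relative {2,3} is folded modulo 2 and
 *    mapped onto the two allowed nodes, giving {6,7}.
 *  - neither flag: nodes_remap() carries the old positions over, so {2,3}
 *    in the old mems become {6,7} in the new ones.
 */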
338
339static void mpol_rebind_preferred(struct mempolicy *pol,
340 const nodemask_t *nodes)
341{
342 pol->w.cpuset_mems_allowed = *nodes;
343}
344
345/*
346 * mpol_rebind_policy - Migrate a policy to a different set of nodes
347 *
348 * Per-vma policies are protected by mmap_lock. Allocations using per-task
349 * policies are protected by task->mems_allowed_seq to prevent a premature
350 * OOM/allocation failure due to parallel nodemask modification.
351 */
352static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
353{
354 if (!pol || pol->mode == MPOL_LOCAL)
355 return;
356 if (!mpol_store_user_nodemask(pol) &&
357 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
358 return;
359
360 mpol_ops[pol->mode].rebind(pol, newmask);
361}
362
363/*
364 * Wrapper for mpol_rebind_policy() that just requires task
365 * pointer, and updates task mempolicy.
366 *
367 * Called with task's alloc_lock held.
368 */
369
370void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
371{
372 mpol_rebind_policy(tsk->mempolicy, new);
373}
374
375/*
376 * Rebind each vma in mm to new nodemask.
377 *
378 * Call holding a reference to mm. Takes mm->mmap_lock during call.
379 */
380
381void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
382{
383 struct vm_area_struct *vma;
384 VMA_ITERATOR(vmi, mm, 0);
385
386 mmap_write_lock(mm);
387 for_each_vma(vmi, vma) {
388 vma_start_write(vma);
389 mpol_rebind_policy(vma->vm_policy, new);
390 }
391 mmap_write_unlock(mm);
392}
393
394static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
395 [MPOL_DEFAULT] = {
396 .rebind = mpol_rebind_default,
397 },
398 [MPOL_INTERLEAVE] = {
399 .create = mpol_new_nodemask,
400 .rebind = mpol_rebind_nodemask,
401 },
402 [MPOL_PREFERRED] = {
403 .create = mpol_new_preferred,
404 .rebind = mpol_rebind_preferred,
405 },
406 [MPOL_BIND] = {
407 .create = mpol_new_nodemask,
408 .rebind = mpol_rebind_nodemask,
409 },
410 [MPOL_LOCAL] = {
411 .rebind = mpol_rebind_default,
412 },
413 [MPOL_PREFERRED_MANY] = {
414 .create = mpol_new_nodemask,
415 .rebind = mpol_rebind_preferred,
416 },
417};
418
419static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
420 unsigned long flags);
421
422struct queue_pages {
423 struct list_head *pagelist;
424 unsigned long flags;
425 nodemask_t *nmask;
426 unsigned long start;
427 unsigned long end;
428 struct vm_area_struct *first;
429 bool has_unmovable;
430};
431
432/*
433 * Check if the folio's nid is in qp->nmask.
434 *
 435 * If MPOL_MF_INVERT is set in qp->flags, check instead that the nid
 436 * is NOT in qp->nmask.
437 */
438static inline bool queue_folio_required(struct folio *folio,
439 struct queue_pages *qp)
440{
441 int nid = folio_nid(folio);
442 unsigned long flags = qp->flags;
443
444 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
445}
446
447/*
448 * queue_folios_pmd() has three possible return values:
449 * 0 - folios are placed on the right node or queued successfully, or
450 * special page is met, i.e. zero page, or unmovable page is found
451 * but continue walking (indicated by queue_pages.has_unmovable).
 452 * -EIO - a migration entry was found, or only MPOL_MF_STRICT was
 453 * specified and an existing folio was already on a node that does
 454 * not follow the policy.
455 */
456static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
457 unsigned long end, struct mm_walk *walk)
458 __releases(ptl)
459{
460 int ret = 0;
461 struct folio *folio;
462 struct queue_pages *qp = walk->private;
463 unsigned long flags;
464
465 if (unlikely(is_pmd_migration_entry(*pmd))) {
466 ret = -EIO;
467 goto unlock;
468 }
469 folio = pfn_folio(pmd_pfn(*pmd));
470 if (is_huge_zero_page(&folio->page)) {
471 walk->action = ACTION_CONTINUE;
472 goto unlock;
473 }
474 if (!queue_folio_required(folio, qp))
475 goto unlock;
476
477 flags = qp->flags;
478 /* go to folio migration */
479 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
480 if (!vma_migratable(walk->vma) ||
481 migrate_folio_add(folio, qp->pagelist, flags)) {
482 qp->has_unmovable = true;
483 goto unlock;
484 }
485 } else
486 ret = -EIO;
487unlock:
488 spin_unlock(ptl);
489 return ret;
490}
491
492/*
493 * Scan through pages checking if pages follow certain conditions,
494 * and move them to the pagelist if they do.
495 *
496 * queue_folios_pte_range() has three possible return values:
497 * 0 - folios are placed on the right node or queued successfully, or
498 * special page is met, i.e. zero page, or unmovable page is found
499 * but continue walking (indicated by queue_pages.has_unmovable).
500 * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
501 * on a node that does not follow the policy.
502 */
503static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
504 unsigned long end, struct mm_walk *walk)
505{
506 struct vm_area_struct *vma = walk->vma;
507 struct folio *folio;
508 struct queue_pages *qp = walk->private;
509 unsigned long flags = qp->flags;
510 pte_t *pte, *mapped_pte;
511 pte_t ptent;
512 spinlock_t *ptl;
513
514 ptl = pmd_trans_huge_lock(pmd, vma);
515 if (ptl)
516 return queue_folios_pmd(pmd, ptl, addr, end, walk);
517
518 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
519 if (!pte) {
520 walk->action = ACTION_AGAIN;
521 return 0;
522 }
523 for (; addr != end; pte++, addr += PAGE_SIZE) {
524 ptent = ptep_get(pte);
525 if (!pte_present(ptent))
526 continue;
527 folio = vm_normal_folio(vma, addr, ptent);
528 if (!folio || folio_is_zone_device(folio))
529 continue;
530 /*
531 * vm_normal_folio() filters out zero pages, but there might
532 * still be reserved folios to skip, perhaps in a VDSO.
533 */
534 if (folio_test_reserved(folio))
535 continue;
536 if (!queue_folio_required(folio, qp))
537 continue;
538 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
539 /*
540 * MPOL_MF_STRICT must be specified if we get here.
541 * Continue walking vmas due to MPOL_MF_MOVE* flags.
542 */
543 if (!vma_migratable(vma))
544 qp->has_unmovable = true;
545
546 /*
547 * Do not abort immediately since there may be
 548 * temporarily off-LRU pages in the range. Still
 549 * need to migrate other LRU pages.
550 */
551 if (migrate_folio_add(folio, qp->pagelist, flags))
552 qp->has_unmovable = true;
553 } else
554 break;
555 }
556 pte_unmap_unlock(mapped_pte, ptl);
557 cond_resched();
558
559 return addr != end ? -EIO : 0;
560}
561
562static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
563 unsigned long addr, unsigned long end,
564 struct mm_walk *walk)
565{
566 int ret = 0;
567#ifdef CONFIG_HUGETLB_PAGE
568 struct queue_pages *qp = walk->private;
569 unsigned long flags = (qp->flags & MPOL_MF_VALID);
570 struct folio *folio;
571 spinlock_t *ptl;
572 pte_t entry;
573
574 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
575 entry = huge_ptep_get(pte);
576 if (!pte_present(entry))
577 goto unlock;
578 folio = pfn_folio(pte_pfn(entry));
579 if (!queue_folio_required(folio, qp))
580 goto unlock;
581
582 if (flags == MPOL_MF_STRICT) {
583 /*
584 * STRICT alone means only detecting misplaced folio and no
585 * need to further check other vma.
586 */
587 ret = -EIO;
588 goto unlock;
589 }
590
591 if (!vma_migratable(walk->vma)) {
592 /*
 593 * Must be STRICT with MOVE*, otherwise .test_walk() would have
 594 * stopped walking the current vma.
 595 * Detect the misplaced folio but allow migrating folios which
 596 * have been queued.
597 */
598 qp->has_unmovable = true;
599 goto unlock;
600 }
601
602 /*
603 * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
604 * is shared it is likely not worth migrating.
605 *
606 * To check if the folio is shared, ideally we want to make sure
607 * every page is mapped to the same process. Doing that is very
608 * expensive, so check the estimated mapcount of the folio instead.
609 */
610 if (flags & (MPOL_MF_MOVE_ALL) ||
611 (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
612 !hugetlb_pmd_shared(pte))) {
613 if (!isolate_hugetlb(folio, qp->pagelist) &&
614 (flags & MPOL_MF_STRICT))
615 /*
616 * Failed to isolate folio but allow migrating pages
617 * which have been queued.
618 */
619 qp->has_unmovable = true;
620 }
621unlock:
622 spin_unlock(ptl);
623#else
624 BUG();
625#endif
626 return ret;
627}
628
629#ifdef CONFIG_NUMA_BALANCING
630/*
631 * This is used to mark a range of virtual addresses to be inaccessible.
632 * These are later cleared by a NUMA hinting fault. Depending on these
633 * faults, pages may be migrated for better NUMA placement.
634 *
635 * This is assuming that NUMA faults are handled using PROT_NONE. If
636 * an architecture makes a different choice, it will need further
637 * changes to the core.
638 */
639unsigned long change_prot_numa(struct vm_area_struct *vma,
640 unsigned long addr, unsigned long end)
641{
642 struct mmu_gather tlb;
643 long nr_updated;
644
645 tlb_gather_mmu(&tlb, vma->vm_mm);
646
647 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
648 if (nr_updated > 0)
649 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
650
651 tlb_finish_mmu(&tlb);
652
653 return nr_updated;
654}
655#else
656static unsigned long change_prot_numa(struct vm_area_struct *vma,
657 unsigned long addr, unsigned long end)
658{
659 return 0;
660}
661#endif /* CONFIG_NUMA_BALANCING */
662
663static int queue_pages_test_walk(unsigned long start, unsigned long end,
664 struct mm_walk *walk)
665{
666 struct vm_area_struct *next, *vma = walk->vma;
667 struct queue_pages *qp = walk->private;
668 unsigned long endvma = vma->vm_end;
669 unsigned long flags = qp->flags;
670
671 /* range check first */
672 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
673
674 if (!qp->first) {
675 qp->first = vma;
676 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
677 (qp->start < vma->vm_start))
678 /* hole at head side of range */
679 return -EFAULT;
680 }
681 next = find_vma(vma->vm_mm, vma->vm_end);
682 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
683 ((vma->vm_end < qp->end) &&
684 (!next || vma->vm_end < next->vm_start)))
685 /* hole at middle or tail of range */
686 return -EFAULT;
687
688 /*
 689 * Need to check MPOL_MF_STRICT to return -EIO if possible
690 * regardless of vma_migratable
691 */
692 if (!vma_migratable(vma) &&
693 !(flags & MPOL_MF_STRICT))
694 return 1;
695
696 if (endvma > end)
697 endvma = end;
698
699 if (flags & MPOL_MF_LAZY) {
700 /* Similar to task_numa_work, skip inaccessible VMAs */
701 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
702 !(vma->vm_flags & VM_MIXEDMAP))
703 change_prot_numa(vma, start, endvma);
704 return 1;
705 }
706
707 /* queue pages from current vma */
708 if (flags & MPOL_MF_VALID)
709 return 0;
710 return 1;
711}
712
713static const struct mm_walk_ops queue_pages_walk_ops = {
714 .hugetlb_entry = queue_folios_hugetlb,
715 .pmd_entry = queue_folios_pte_range,
716 .test_walk = queue_pages_test_walk,
717 .walk_lock = PGWALK_RDLOCK,
718};
719
720static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
721 .hugetlb_entry = queue_folios_hugetlb,
722 .pmd_entry = queue_folios_pte_range,
723 .test_walk = queue_pages_test_walk,
724 .walk_lock = PGWALK_WRLOCK,
725};
726
727/*
728 * Walk through page tables and collect pages to be migrated.
729 *
 730 * If pages found in a given range are on a set of nodes (determined by
 731 * @nodes and @flags), they are isolated and queued to the pagelist which
 732 * is passed via @private.
733 *
734 * queue_pages_range() has three possible return values:
 735 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
736 * specified.
737 * 0 - queue pages successfully or no misplaced page.
738 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
739 * memory range specified by nodemask and maxnode points outside
740 * your accessible address space (-EFAULT)
741 */
742static int
743queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
744 nodemask_t *nodes, unsigned long flags,
745 struct list_head *pagelist, bool lock_vma)
746{
747 int err;
748 struct queue_pages qp = {
749 .pagelist = pagelist,
750 .flags = flags,
751 .nmask = nodes,
752 .start = start,
753 .end = end,
754 .first = NULL,
755 .has_unmovable = false,
756 };
757 const struct mm_walk_ops *ops = lock_vma ?
758 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
759
760 err = walk_page_range(mm, start, end, ops, &qp);
761
762 if (qp.has_unmovable)
763 err = 1;
764 if (!qp.first)
765 /* whole range in hole */
766 err = -EFAULT;
767
768 return err;
769}
770
771/*
772 * Apply policy to a single VMA
773 * This must be called with the mmap_lock held for writing.
774 */
775static int vma_replace_policy(struct vm_area_struct *vma,
776 struct mempolicy *pol)
777{
778 int err;
779 struct mempolicy *old;
780 struct mempolicy *new;
781
782 vma_assert_write_locked(vma);
783
784 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
785 vma->vm_start, vma->vm_end, vma->vm_pgoff,
786 vma->vm_ops, vma->vm_file,
787 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
788
789 new = mpol_dup(pol);
790 if (IS_ERR(new))
791 return PTR_ERR(new);
792
793 if (vma->vm_ops && vma->vm_ops->set_policy) {
794 err = vma->vm_ops->set_policy(vma, new);
795 if (err)
796 goto err_out;
797 }
798
799 old = vma->vm_policy;
800 vma->vm_policy = new; /* protected by mmap_lock */
801 mpol_put(old);
802
803 return 0;
804 err_out:
805 mpol_put(new);
806 return err;
807}
808
809/* Split or merge the VMA (if required) and apply the new policy */
810static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
811 struct vm_area_struct **prev, unsigned long start,
812 unsigned long end, struct mempolicy *new_pol)
813{
814 struct vm_area_struct *merged;
815 unsigned long vmstart, vmend;
816 pgoff_t pgoff;
817 int err;
818
819 vmend = min(end, vma->vm_end);
820 if (start > vma->vm_start) {
821 *prev = vma;
822 vmstart = start;
823 } else {
824 vmstart = vma->vm_start;
825 }
826
827 if (mpol_equal(vma_policy(vma), new_pol)) {
828 *prev = vma;
829 return 0;
830 }
831
832 pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
833 merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
834 vma->anon_vma, vma->vm_file, pgoff, new_pol,
835 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
836 if (merged) {
837 *prev = merged;
838 return vma_replace_policy(merged, new_pol);
839 }
840
841 if (vma->vm_start != vmstart) {
842 err = split_vma(vmi, vma, vmstart, 1);
843 if (err)
844 return err;
845 }
846
847 if (vma->vm_end != vmend) {
848 err = split_vma(vmi, vma, vmend, 0);
849 if (err)
850 return err;
851 }
852
853 *prev = vma;
854 return vma_replace_policy(vma, new_pol);
855}
856
857/* Set the process memory policy */
858static long do_set_mempolicy(unsigned short mode, unsigned short flags,
859 nodemask_t *nodes)
860{
861 struct mempolicy *new, *old;
862 NODEMASK_SCRATCH(scratch);
863 int ret;
864
865 if (!scratch)
866 return -ENOMEM;
867
868 new = mpol_new(mode, flags, nodes);
869 if (IS_ERR(new)) {
870 ret = PTR_ERR(new);
871 goto out;
872 }
873
874 task_lock(current);
875 ret = mpol_set_nodemask(new, nodes, scratch);
876 if (ret) {
877 task_unlock(current);
878 mpol_put(new);
879 goto out;
880 }
881
882 old = current->mempolicy;
883 current->mempolicy = new;
884 if (new && new->mode == MPOL_INTERLEAVE)
885 current->il_prev = MAX_NUMNODES-1;
886 task_unlock(current);
887 mpol_put(old);
888 ret = 0;
889out:
890 NODEMASK_SCRATCH_FREE(scratch);
891 return ret;
892}
893
894/*
895 * Return nodemask for policy for get_mempolicy() query
896 *
897 * Called with task's alloc_lock held
898 */
899static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
900{
901 nodes_clear(*nodes);
902 if (p == &default_policy)
903 return;
904
905 switch (p->mode) {
906 case MPOL_BIND:
907 case MPOL_INTERLEAVE:
908 case MPOL_PREFERRED:
909 case MPOL_PREFERRED_MANY:
910 *nodes = p->nodes;
911 break;
912 case MPOL_LOCAL:
913 /* return empty node mask for local allocation */
914 break;
915 default:
916 BUG();
917 }
918}
919
920static int lookup_node(struct mm_struct *mm, unsigned long addr)
921{
922 struct page *p = NULL;
923 int ret;
924
925 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
926 if (ret > 0) {
927 ret = page_to_nid(p);
928 put_page(p);
929 }
930 return ret;
931}
932
933/* Retrieve NUMA policy */
934static long do_get_mempolicy(int *policy, nodemask_t *nmask,
935 unsigned long addr, unsigned long flags)
936{
937 int err;
938 struct mm_struct *mm = current->mm;
939 struct vm_area_struct *vma = NULL;
940 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
941
942 if (flags &
943 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
944 return -EINVAL;
945
946 if (flags & MPOL_F_MEMS_ALLOWED) {
947 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
948 return -EINVAL;
949 *policy = 0; /* just so it's initialized */
950 task_lock(current);
951 *nmask = cpuset_current_mems_allowed;
952 task_unlock(current);
953 return 0;
954 }
955
956 if (flags & MPOL_F_ADDR) {
957 /*
958 * Do NOT fall back to task policy if the
959 * vma/shared policy at addr is NULL. We
960 * want to return MPOL_DEFAULT in this case.
961 */
962 mmap_read_lock(mm);
963 vma = vma_lookup(mm, addr);
964 if (!vma) {
965 mmap_read_unlock(mm);
966 return -EFAULT;
967 }
968 if (vma->vm_ops && vma->vm_ops->get_policy)
969 pol = vma->vm_ops->get_policy(vma, addr);
970 else
971 pol = vma->vm_policy;
972 } else if (addr)
973 return -EINVAL;
974
975 if (!pol)
976 pol = &default_policy; /* indicates default behavior */
977
978 if (flags & MPOL_F_NODE) {
979 if (flags & MPOL_F_ADDR) {
980 /*
981 * Take a refcount on the mpol, because we are about to
982 * drop the mmap_lock, after which only "pol" remains
983 * valid, "vma" is stale.
984 */
985 pol_refcount = pol;
986 vma = NULL;
987 mpol_get(pol);
988 mmap_read_unlock(mm);
989 err = lookup_node(mm, addr);
990 if (err < 0)
991 goto out;
992 *policy = err;
993 } else if (pol == current->mempolicy &&
994 pol->mode == MPOL_INTERLEAVE) {
995 *policy = next_node_in(current->il_prev, pol->nodes);
996 } else {
997 err = -EINVAL;
998 goto out;
999 }
1000 } else {
1001 *policy = pol == &default_policy ? MPOL_DEFAULT :
1002 pol->mode;
1003 /*
1004 * Internal mempolicy flags must be masked off before exposing
1005 * the policy to userspace.
1006 */
1007 *policy |= (pol->flags & MPOL_MODE_FLAGS);
1008 }
1009
1010 err = 0;
1011 if (nmask) {
1012 if (mpol_store_user_nodemask(pol)) {
1013 *nmask = pol->w.user_nodemask;
1014 } else {
1015 task_lock(current);
1016 get_policy_nodemask(pol, nmask);
1017 task_unlock(current);
1018 }
1019 }
1020
1021 out:
1022 mpol_cond_put(pol);
1023 if (vma)
1024 mmap_read_unlock(mm);
1025 if (pol_refcount)
1026 mpol_put(pol_refcount);
1027 return err;
1028}
1029
1030#ifdef CONFIG_MIGRATION
1031static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1032 unsigned long flags)
1033{
1034 /*
1035 * We try to migrate only unshared folios. If it is shared it
1036 * is likely not worth migrating.
1037 *
1038 * To check if the folio is shared, ideally we want to make sure
1039 * every page is mapped to the same process. Doing that is very
1040 * expensive, so check the estimated mapcount of the folio instead.
1041 */
1042 if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
1043 if (folio_isolate_lru(folio)) {
1044 list_add_tail(&folio->lru, foliolist);
1045 node_stat_mod_folio(folio,
1046 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1047 folio_nr_pages(folio));
1048 } else if (flags & MPOL_MF_STRICT) {
1049 /*
1050 * Non-movable folio may reach here. And, there may be
1051 * temporary off LRU folios or non-LRU movable folios.
1052 * Treat them as unmovable folios since they can't be
1053 * isolated, so they can't be moved at the moment. It
1054 * should return -EIO for this case too.
1055 */
1056 return -EIO;
1057 }
1058 }
1059
1060 return 0;
1061}
1062
1063/*
1064 * Migrate pages from one node to a target node.
1065 * Returns error or the number of pages not migrated.
1066 */
1067static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1068 int flags)
1069{
1070 nodemask_t nmask;
1071 struct vm_area_struct *vma;
1072 LIST_HEAD(pagelist);
1073 int err = 0;
1074 struct migration_target_control mtc = {
1075 .nid = dest,
1076 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1077 };
1078
1079 nodes_clear(nmask);
1080 node_set(source, nmask);
1081
1082 /*
1083 * This does not "check" the range but isolates all pages that
1084 * need migration. Between passing in the full user address
 1085 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1086 */
1087 vma = find_vma(mm, 0);
1088 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1089 queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1090 flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
1091
1092 if (!list_empty(&pagelist)) {
1093 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1094 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1095 if (err)
1096 putback_movable_pages(&pagelist);
1097 }
1098
1099 return err;
1100}
1101
1102/*
1103 * Move pages between the two nodesets so as to preserve the physical
1104 * layout as much as possible.
1105 *
 1106 * Returns an error or the number of pages that could not be moved.
1107 */
1108int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1109 const nodemask_t *to, int flags)
1110{
1111 int busy = 0;
1112 int err = 0;
1113 nodemask_t tmp;
1114
1115 lru_cache_disable();
1116
1117 mmap_read_lock(mm);
1118
1119 /*
1120 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1121 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1122 * bit in 'tmp', and return that <source, dest> pair for migration.
1123 * The pair of nodemasks 'to' and 'from' define the map.
1124 *
 1125 * If no pair of bits is found that way, fall back to picking some
1126 * pair of 'source' and 'dest' bits that are not the same. If the
1127 * 'source' and 'dest' bits are the same, this represents a node
1128 * that will be migrating to itself, so no pages need move.
1129 *
1130 * If no bits are left in 'tmp', or if all remaining bits left
1131 * in 'tmp' correspond to the same bit in 'to', return false
1132 * (nothing left to migrate).
1133 *
1134 * This lets us pick a pair of nodes to migrate between, such that
1135 * if possible the dest node is not already occupied by some other
1136 * source node, minimizing the risk of overloading the memory on a
 1137 * node, which would happen if we migrated incoming memory to a node
 1138 * before migrating the outgoing memory sourced from that same node.
1139 *
1140 * A single scan of tmp is sufficient. As we go, we remember the
1141 * most recent <s, d> pair that moved (s != d). If we find a pair
1142 * that not only moved, but what's better, moved to an empty slot
1143 * (d is not set in tmp), then we break out then, with that pair.
 1144 * Otherwise when we finish scanning tmp, we at least have the
1145 * most recent <s, d> pair that moved. If we get all the way through
1146 * the scan of tmp without finding any node that moved, much less
1147 * moved to an empty node, then there is nothing left worth migrating.
1148 */
1149
1150 tmp = *from;
1151 while (!nodes_empty(tmp)) {
1152 int s, d;
1153 int source = NUMA_NO_NODE;
1154 int dest = 0;
1155
1156 for_each_node_mask(s, tmp) {
1157
1158 /*
1159 * do_migrate_pages() tries to maintain the relative
1160 * node relationship of the pages established between
1161 * threads and memory areas.
1162 *
 1163 * However, if the number of source nodes is not equal to
 1164 * the number of destination nodes we cannot preserve
 1165 * this relative node relationship. In that case, skip
1166 * copying memory from a node that is in the destination
1167 * mask.
1168 *
1169 * Example: [2,3,4] -> [3,4,5] moves everything.
1170 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1171 */
1172
1173 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1174 (node_isset(s, *to)))
1175 continue;
1176
1177 d = node_remap(s, *from, *to);
1178 if (s == d)
1179 continue;
1180
1181 source = s; /* Node moved. Memorize */
1182 dest = d;
1183
1184 /* dest not in remaining from nodes? */
1185 if (!node_isset(dest, tmp))
1186 break;
1187 }
1188 if (source == NUMA_NO_NODE)
1189 break;
1190
1191 node_clear(source, tmp);
1192 err = migrate_to_node(mm, source, dest, flags);
1193 if (err > 0)
1194 busy += err;
1195 if (err < 0)
1196 break;
1197 }
1198 mmap_read_unlock(mm);
1199
1200 lru_cache_enable();
1201 if (err < 0)
1202 return err;
1203 return busy;
1204
1205}
1206
1207/*
1208 * Allocate a new page for page migration based on vma policy.
 1209 * Start by assuming the page is mapped by the same vma that contains @start.
1210 * Search forward from there, if not. N.B., this assumes that the
1211 * list of pages handed to migrate_pages()--which is how we get here--
1212 * is in virtual address order.
1213 */
1214static struct folio *new_folio(struct folio *src, unsigned long start)
1215{
1216 struct vm_area_struct *vma;
1217 unsigned long address;
1218 VMA_ITERATOR(vmi, current->mm, start);
1219 gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
1220
1221 for_each_vma(vmi, vma) {
1222 address = page_address_in_vma(&src->page, vma);
1223 if (address != -EFAULT)
1224 break;
1225 }
1226
1227 if (folio_test_hugetlb(src)) {
1228 return alloc_hugetlb_folio_vma(folio_hstate(src),
1229 vma, address);
1230 }
1231
1232 if (folio_test_large(src))
1233 gfp = GFP_TRANSHUGE;
1234
1235 /*
1236 * if !vma, vma_alloc_folio() will use task or system default policy
1237 */
1238 return vma_alloc_folio(gfp, folio_order(src), vma, address,
1239 folio_test_large(src));
1240}
1241#else
1242
1243static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1244 unsigned long flags)
1245{
1246 return -EIO;
1247}
1248
1249int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1250 const nodemask_t *to, int flags)
1251{
1252 return -ENOSYS;
1253}
1254
1255static struct folio *new_folio(struct folio *src, unsigned long start)
1256{
1257 return NULL;
1258}
1259#endif
1260
1261static long do_mbind(unsigned long start, unsigned long len,
1262 unsigned short mode, unsigned short mode_flags,
1263 nodemask_t *nmask, unsigned long flags)
1264{
1265 struct mm_struct *mm = current->mm;
1266 struct vm_area_struct *vma, *prev;
1267 struct vma_iterator vmi;
1268 struct mempolicy *new;
1269 unsigned long end;
1270 int err;
1271 int ret;
1272 LIST_HEAD(pagelist);
1273
1274 if (flags & ~(unsigned long)MPOL_MF_VALID)
1275 return -EINVAL;
1276 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1277 return -EPERM;
1278
1279 if (start & ~PAGE_MASK)
1280 return -EINVAL;
1281
1282 if (mode == MPOL_DEFAULT)
1283 flags &= ~MPOL_MF_STRICT;
1284
1285 len = PAGE_ALIGN(len);
1286 end = start + len;
1287
1288 if (end < start)
1289 return -EINVAL;
1290 if (end == start)
1291 return 0;
1292
1293 new = mpol_new(mode, mode_flags, nmask);
1294 if (IS_ERR(new))
1295 return PTR_ERR(new);
1296
1297 if (flags & MPOL_MF_LAZY)
1298 new->flags |= MPOL_F_MOF;
1299
1300 /*
1301 * If we are using the default policy then operation
1302 * on discontinuous address spaces is okay after all
1303 */
1304 if (!new)
1305 flags |= MPOL_MF_DISCONTIG_OK;
1306
1307 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1308 start, start + len, mode, mode_flags,
1309 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1310
1311 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1312
1313 lru_cache_disable();
1314 }
1315 {
1316 NODEMASK_SCRATCH(scratch);
1317 if (scratch) {
1318 mmap_write_lock(mm);
1319 err = mpol_set_nodemask(new, nmask, scratch);
1320 if (err)
1321 mmap_write_unlock(mm);
1322 } else
1323 err = -ENOMEM;
1324 NODEMASK_SCRATCH_FREE(scratch);
1325 }
1326 if (err)
1327 goto mpol_out;
1328
1329 /*
1330 * Lock the VMAs before scanning for pages to migrate, to ensure we don't
1331 * miss a concurrently inserted page.
1332 */
1333 ret = queue_pages_range(mm, start, end, nmask,
1334 flags | MPOL_MF_INVERT, &pagelist, true);
1335
1336 if (ret < 0) {
1337 err = ret;
1338 goto up_out;
1339 }
1340
1341 vma_iter_init(&vmi, mm, start);
1342 prev = vma_prev(&vmi);
1343 for_each_vma_range(vmi, vma, end) {
1344 err = mbind_range(&vmi, vma, &prev, start, end, new);
1345 if (err)
1346 break;
1347 }
1348
1349 if (!err) {
1350 int nr_failed = 0;
1351
1352 if (!list_empty(&pagelist)) {
1353 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1354 nr_failed = migrate_pages(&pagelist, new_folio, NULL,
1355 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1356 if (nr_failed)
1357 putback_movable_pages(&pagelist);
1358 }
1359
1360 if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
1361 err = -EIO;
1362 } else {
1363up_out:
1364 if (!list_empty(&pagelist))
1365 putback_movable_pages(&pagelist);
1366 }
1367
1368 mmap_write_unlock(mm);
1369mpol_out:
1370 mpol_put(new);
1371 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1372 lru_cache_enable();
1373 return err;
1374}
1375
1376/*
1377 * User space interface with variable sized bitmaps for nodelists.
1378 */
1379static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1380 unsigned long maxnode)
1381{
1382 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1383 int ret;
1384
1385 if (in_compat_syscall())
1386 ret = compat_get_bitmap(mask,
1387 (const compat_ulong_t __user *)nmask,
1388 maxnode);
1389 else
1390 ret = copy_from_user(mask, nmask,
1391 nlongs * sizeof(unsigned long));
1392
1393 if (ret)
1394 return -EFAULT;
1395
1396 if (maxnode % BITS_PER_LONG)
1397 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1398
1399 return 0;
1400}
1401
1402/* Copy a node mask from user space. */
1403static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1404 unsigned long maxnode)
1405{
1406 --maxnode;
1407 nodes_clear(*nodes);
1408 if (maxnode == 0 || !nmask)
1409 return 0;
1410 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1411 return -EINVAL;
1412
1413 /*
 1414 * When the user specified more nodes than supported, just check
 1415 * that the unsupported part is all zero, one word at a time,
1416 * starting at the end.
1417 */
1418 while (maxnode > MAX_NUMNODES) {
1419 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1420 unsigned long t;
1421
1422 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1423 return -EFAULT;
1424
1425 if (maxnode - bits >= MAX_NUMNODES) {
1426 maxnode -= bits;
1427 } else {
1428 maxnode = MAX_NUMNODES;
1429 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1430 }
1431 if (t)
1432 return -EINVAL;
1433 }
1434
1435 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1436}
1437
1438/* Copy a kernel node mask to user space */
1439static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1440 nodemask_t *nodes)
1441{
1442 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1443 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1444 bool compat = in_compat_syscall();
1445
1446 if (compat)
1447 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1448
1449 if (copy > nbytes) {
1450 if (copy > PAGE_SIZE)
1451 return -EINVAL;
1452 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1453 return -EFAULT;
1454 copy = nbytes;
1455 maxnode = nr_node_ids;
1456 }
1457
1458 if (compat)
1459 return compat_put_bitmap((compat_ulong_t __user *)mask,
1460 nodes_addr(*nodes), maxnode);
1461
1462 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1463}
1464
1465/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1466static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1467{
1468 *flags = *mode & MPOL_MODE_FLAGS;
1469 *mode &= ~MPOL_MODE_FLAGS;
1470
1471 if ((unsigned int)(*mode) >= MPOL_MAX)
1472 return -EINVAL;
1473 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1474 return -EINVAL;
1475 if (*flags & MPOL_F_NUMA_BALANCING) {
1476 if (*mode != MPOL_BIND)
1477 return -EINVAL;
1478 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1479 }
1480 return 0;
1481}
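/*
 * Illustrative sketch of what sanitize_mpol_flags() unpacks (an assumption
 * about typical userspace usage, not part of this file): the mode argument
 * of set_mempolicy(2)/mbind(2) carries the optional mode flags in its
 * upper bits, e.g.
 *
 *	set_mempolicy(MPOL_BIND | MPOL_F_STATIC_NODES, &mask, maxnode);
 *	set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, &mask, maxnode);
 *
 * The first keeps the nodemask fixed across cpuset rebinds; the second is
 * only accepted for MPOL_BIND and enables NUMA-balancing driven migration
 * (MPOL_F_MOF | MPOL_F_MORON) within the bound nodes.
 */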
1482
1483static long kernel_mbind(unsigned long start, unsigned long len,
1484 unsigned long mode, const unsigned long __user *nmask,
1485 unsigned long maxnode, unsigned int flags)
1486{
1487 unsigned short mode_flags;
1488 nodemask_t nodes;
1489 int lmode = mode;
1490 int err;
1491
1492 start = untagged_addr(start);
1493 err = sanitize_mpol_flags(&lmode, &mode_flags);
1494 if (err)
1495 return err;
1496
1497 err = get_nodes(&nodes, nmask, maxnode);
1498 if (err)
1499 return err;
1500
1501 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1502}
1503
1504SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1505 unsigned long, home_node, unsigned long, flags)
1506{
1507 struct mm_struct *mm = current->mm;
1508 struct vm_area_struct *vma, *prev;
1509 struct mempolicy *new, *old;
1510 unsigned long end;
1511 int err = -ENOENT;
1512 VMA_ITERATOR(vmi, mm, start);
1513
1514 start = untagged_addr(start);
1515 if (start & ~PAGE_MASK)
1516 return -EINVAL;
1517 /*
1518 * flags is used for future extension if any.
1519 */
1520 if (flags != 0)
1521 return -EINVAL;
1522
1523 /*
1524 * Check home_node is online to avoid accessing uninitialized
1525 * NODE_DATA.
1526 */
1527 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1528 return -EINVAL;
1529
1530 len = PAGE_ALIGN(len);
1531 end = start + len;
1532
1533 if (end < start)
1534 return -EINVAL;
1535 if (end == start)
1536 return 0;
1537 mmap_write_lock(mm);
1538 prev = vma_prev(&vmi);
1539 for_each_vma_range(vmi, vma, end) {
1540 /*
 1541 * If any vma in the range has a policy other than MPOL_BIND
 1542 * or MPOL_PREFERRED_MANY we return an error. We don't reset
 1543 * the home node for vmas we already updated before.
1544 */
1545 old = vma_policy(vma);
1546 if (!old)
1547 continue;
1548 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1549 err = -EOPNOTSUPP;
1550 break;
1551 }
1552 new = mpol_dup(old);
1553 if (IS_ERR(new)) {
1554 err = PTR_ERR(new);
1555 break;
1556 }
1557
1558 vma_start_write(vma);
1559 new->home_node = home_node;
1560 err = mbind_range(&vmi, vma, &prev, start, end, new);
1561 mpol_put(new);
1562 if (err)
1563 break;
1564 }
1565 mmap_write_unlock(mm);
1566 return err;
1567}
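/*
 * Illustrative userspace sketch (an assumption, not part of this file):
 * glibc currently provides no wrapper for this syscall, so it is usually
 * invoked via syscall(2), e.g. to prefer node 1 for a range that already
 * has an MPOL_BIND or MPOL_PREFERRED_MANY policy:
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *
 *	long err = syscall(__NR_set_mempolicy_home_node,
 *			   (unsigned long)addr, len, 1UL, 0UL);
 */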
1568
1569SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1570 unsigned long, mode, const unsigned long __user *, nmask,
1571 unsigned long, maxnode, unsigned int, flags)
1572{
1573 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1574}
1575
1576/* Set the process memory policy */
1577static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1578 unsigned long maxnode)
1579{
1580 unsigned short mode_flags;
1581 nodemask_t nodes;
1582 int lmode = mode;
1583 int err;
1584
1585 err = sanitize_mpol_flags(&lmode, &mode_flags);
1586 if (err)
1587 return err;
1588
1589 err = get_nodes(&nodes, nmask, maxnode);
1590 if (err)
1591 return err;
1592
1593 return do_set_mempolicy(lmode, mode_flags, &nodes);
1594}
1595
1596SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1597 unsigned long, maxnode)
1598{
1599 return kernel_set_mempolicy(mode, nmask, maxnode);
1600}
1601
1602static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1603 const unsigned long __user *old_nodes,
1604 const unsigned long __user *new_nodes)
1605{
1606 struct mm_struct *mm = NULL;
1607 struct task_struct *task;
1608 nodemask_t task_nodes;
1609 int err;
1610 nodemask_t *old;
1611 nodemask_t *new;
1612 NODEMASK_SCRATCH(scratch);
1613
1614 if (!scratch)
1615 return -ENOMEM;
1616
1617 old = &scratch->mask1;
1618 new = &scratch->mask2;
1619
1620 err = get_nodes(old, old_nodes, maxnode);
1621 if (err)
1622 goto out;
1623
1624 err = get_nodes(new, new_nodes, maxnode);
1625 if (err)
1626 goto out;
1627
1628 /* Find the mm_struct */
1629 rcu_read_lock();
1630 task = pid ? find_task_by_vpid(pid) : current;
1631 if (!task) {
1632 rcu_read_unlock();
1633 err = -ESRCH;
1634 goto out;
1635 }
1636 get_task_struct(task);
1637
1638 err = -EINVAL;
1639
1640 /*
1641 * Check if this process has the right to modify the specified process.
1642 * Use the regular "ptrace_may_access()" checks.
1643 */
1644 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1645 rcu_read_unlock();
1646 err = -EPERM;
1647 goto out_put;
1648 }
1649 rcu_read_unlock();
1650
1651 task_nodes = cpuset_mems_allowed(task);
1652 /* Is the user allowed to access the target nodes? */
1653 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1654 err = -EPERM;
1655 goto out_put;
1656 }
1657
1658 task_nodes = cpuset_mems_allowed(current);
1659 nodes_and(*new, *new, task_nodes);
1660 if (nodes_empty(*new))
1661 goto out_put;
1662
1663 err = security_task_movememory(task);
1664 if (err)
1665 goto out_put;
1666
1667 mm = get_task_mm(task);
1668 put_task_struct(task);
1669
1670 if (!mm) {
1671 err = -EINVAL;
1672 goto out;
1673 }
1674
1675 err = do_migrate_pages(mm, old, new,
1676 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1677
1678 mmput(mm);
1679out:
1680 NODEMASK_SCRATCH_FREE(scratch);
1681
1682 return err;
1683
1684out_put:
1685 put_task_struct(task);
1686 goto out;
1687
1688}
1689
1690SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1691 const unsigned long __user *, old_nodes,
1692 const unsigned long __user *, new_nodes)
1693{
1694 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1695}
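/*
 * Illustrative userspace sketch (an assumption, not part of this file):
 * the migrate_pages(2) wrapper from libnuma's <numaif.h> moving another
 * task's pages from node 0 to node 1, subject to the cpuset and
 * CAP_SYS_NICE checks implemented above:
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long not_moved = migrate_pages(pid, 8 * sizeof(unsigned long),
 *				       &from, &to);
 */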
1696
1697
1698/* Retrieve NUMA policy */
1699static int kernel_get_mempolicy(int __user *policy,
1700 unsigned long __user *nmask,
1701 unsigned long maxnode,
1702 unsigned long addr,
1703 unsigned long flags)
1704{
1705 int err;
1706 int pval;
1707 nodemask_t nodes;
1708
1709 if (nmask != NULL && maxnode < nr_node_ids)
1710 return -EINVAL;
1711
1712 addr = untagged_addr(addr);
1713
1714 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1715
1716 if (err)
1717 return err;
1718
1719 if (policy && put_user(pval, policy))
1720 return -EFAULT;
1721
1722 if (nmask)
1723 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1724
1725 return err;
1726}
1727
1728SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1729 unsigned long __user *, nmask, unsigned long, maxnode,
1730 unsigned long, addr, unsigned long, flags)
1731{
1732 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1733}
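/*
 * Illustrative userspace sketch (an assumption about typical usage, not
 * part of this file): MPOL_F_ADDR | MPOL_F_NODE turns get_mempolicy(2)
 * into a "which node is this page on" query, served by the lookup_node()
 * path above.
 *
 *	#include <numaif.h>
 *
 *	int node_of(void *addr)
 *	{
 *		int node = -1;
 *
 *		if (get_mempolicy(&node, NULL, 0, addr,
 *				  MPOL_F_NODE | MPOL_F_ADDR))
 *			return -1;
 *		return node;
 *	}
 */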
1734
1735bool vma_migratable(struct vm_area_struct *vma)
1736{
1737 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1738 return false;
1739
1740 /*
1741 * DAX device mappings require predictable access latency, so avoid
1742 * incurring periodic faults.
1743 */
1744 if (vma_is_dax(vma))
1745 return false;
1746
1747 if (is_vm_hugetlb_page(vma) &&
1748 !hugepage_migration_supported(hstate_vma(vma)))
1749 return false;
1750
1751 /*
1752 * Migration allocates pages in the highest zone. If we cannot
1753 * do so then migration (at least from node to node) is not
1754 * possible.
1755 */
1756 if (vma->vm_file &&
1757 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1758 < policy_zone)
1759 return false;
1760 return true;
1761}
1762
1763struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1764 unsigned long addr)
1765{
1766 struct mempolicy *pol = NULL;
1767
1768 if (vma) {
1769 if (vma->vm_ops && vma->vm_ops->get_policy) {
1770 pol = vma->vm_ops->get_policy(vma, addr);
1771 } else if (vma->vm_policy) {
1772 pol = vma->vm_policy;
1773
1774 /*
1775 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1776 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1777 * count on these policies which will be dropped by
1778 * mpol_cond_put() later
1779 */
1780 if (mpol_needs_cond_ref(pol))
1781 mpol_get(pol);
1782 }
1783 }
1784
1785 return pol;
1786}
1787
1788/*
1789 * get_vma_policy(@vma, @addr)
1790 * @vma: virtual memory area whose policy is sought
1791 * @addr: address in @vma for shared policy lookup
1792 *
1793 * Returns effective policy for a VMA at specified address.
1794 * Falls back to current->mempolicy or system default policy, as necessary.
1795 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1796 * count--added by the get_policy() vm_op, as appropriate--to protect against
1797 * freeing by another task. It is the caller's responsibility to free the
1798 * extra reference for shared policies.
1799 */
1800static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1801 unsigned long addr)
1802{
1803 struct mempolicy *pol = __get_vma_policy(vma, addr);
1804
1805 if (!pol)
1806 pol = get_task_policy(current);
1807
1808 return pol;
1809}
1810
1811bool vma_policy_mof(struct vm_area_struct *vma)
1812{
1813 struct mempolicy *pol;
1814
1815 if (vma->vm_ops && vma->vm_ops->get_policy) {
1816 bool ret = false;
1817
1818 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1819 if (pol && (pol->flags & MPOL_F_MOF))
1820 ret = true;
1821 mpol_cond_put(pol);
1822
1823 return ret;
1824 }
1825
1826 pol = vma->vm_policy;
1827 if (!pol)
1828 pol = get_task_policy(current);
1829
1830 return pol->flags & MPOL_F_MOF;
1831}
1832
1833bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1834{
1835 enum zone_type dynamic_policy_zone = policy_zone;
1836
1837 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1838
1839 /*
1840 * if policy->nodes has movable memory only,
1841 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1842 *
 1843 * policy->nodes is intersected with node_states[N_MEMORY],
1844 * so if the following test fails, it implies
1845 * policy->nodes has movable memory only.
1846 */
1847 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1848 dynamic_policy_zone = ZONE_MOVABLE;
1849
1850 return zone >= dynamic_policy_zone;
1851}
1852
1853/*
1854 * Return a nodemask representing a mempolicy for filtering nodes for
1855 * page allocation
1856 */
1857nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1858{
1859 int mode = policy->mode;
1860
1861 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1862 if (unlikely(mode == MPOL_BIND) &&
1863 apply_policy_zone(policy, gfp_zone(gfp)) &&
1864 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1865 return &policy->nodes;
1866
1867 if (mode == MPOL_PREFERRED_MANY)
1868 return &policy->nodes;
1869
1870 return NULL;
1871}
1872
1873/*
1874 * Return the preferred node id for 'prefer' mempolicy, and return
1875 * the given id for all other policies.
1876 *
1877 * policy_node() is always coupled with policy_nodemask(), which
1878 * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1879 */
1880static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1881{
1882 if (policy->mode == MPOL_PREFERRED) {
1883 nd = first_node(policy->nodes);
1884 } else {
1885 /*
1886 * __GFP_THISNODE shouldn't even be used with the bind policy
1887 * because we might easily break the expectation to stay on the
1888 * requested node and not break the policy.
1889 */
1890 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1891 }
1892
1893 if ((policy->mode == MPOL_BIND ||
1894 policy->mode == MPOL_PREFERRED_MANY) &&
1895 policy->home_node != NUMA_NO_NODE)
1896 return policy->home_node;
1897
1898 return nd;
1899}
1900
1901/* Do dynamic interleaving for a process */
1902static unsigned interleave_nodes(struct mempolicy *policy)
1903{
1904 unsigned next;
1905 struct task_struct *me = current;
1906
1907 next = next_node_in(me->il_prev, policy->nodes);
1908 if (next < MAX_NUMNODES)
1909 me->il_prev = next;
1910 return next;
1911}
1912
1913/*
1914 * Depending on the memory policy provide a node from which to allocate the
1915 * next slab entry.
1916 */
1917unsigned int mempolicy_slab_node(void)
1918{
1919 struct mempolicy *policy;
1920 int node = numa_mem_id();
1921
1922 if (!in_task())
1923 return node;
1924
1925 policy = current->mempolicy;
1926 if (!policy)
1927 return node;
1928
1929 switch (policy->mode) {
1930 case MPOL_PREFERRED:
1931 return first_node(policy->nodes);
1932
1933 case MPOL_INTERLEAVE:
1934 return interleave_nodes(policy);
1935
1936 case MPOL_BIND:
1937 case MPOL_PREFERRED_MANY:
1938 {
1939 struct zoneref *z;
1940
1941 /*
1942 * Follow bind policy behavior and start allocation at the
1943 * first node.
1944 */
1945 struct zonelist *zonelist;
1946 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1947 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1948 z = first_zones_zonelist(zonelist, highest_zoneidx,
1949 &policy->nodes);
1950 return z->zone ? zone_to_nid(z->zone) : node;
1951 }
1952 case MPOL_LOCAL:
1953 return node;
1954
1955 default:
1956 BUG();
1957 }
1958}
1959
1960/*
1961 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1962 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1963 * number of present nodes.
1964 */
1965static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1966{
1967 nodemask_t nodemask = pol->nodes;
1968 unsigned int target, nnodes;
1969 int i;
1970 int nid;
1971 /*
1972 * The barrier will stabilize the nodemask in a register or on
1973 * the stack so that it will stop changing under the code.
1974 *
1975 * Between first_node() and next_node(), pol->nodes could be changed
1976 * by other threads. So we put pol->nodes in a local stack.
1977 */
1978 barrier();
1979
1980 nnodes = nodes_weight(nodemask);
1981 if (!nnodes)
1982 return numa_node_id();
1983 target = (unsigned int)n % nnodes;
1984 nid = first_node(nodemask);
1985 for (i = 0; i < target; i++)
1986 nid = next_node(nid, nodemask);
1987 return nid;
1988}
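/*
 * Worked example (a sketch, not extra code): with pol->nodes = {1,3,5}
 * and n = 7, nnodes is 3 and target = 7 % 3 = 1, so the walk starts at
 * node 1 and advances once, returning node 3. A given @n always maps to
 * the same node for a given nodemask, which keeps VMA interleaving stable
 * across repeated faults.
 */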
1989
1990/* Determine a node number for interleave */
1991static inline unsigned interleave_nid(struct mempolicy *pol,
1992 struct vm_area_struct *vma, unsigned long addr, int shift)
1993{
1994 if (vma) {
1995 unsigned long off;
1996
1997 /*
1998 * for small pages, there is no difference between
1999 * shift and PAGE_SHIFT, so the bit-shift is safe.
2000 * for huge pages, since vm_pgoff is in units of small
2001 * pages, we need to shift off the always 0 bits to get
2002 * a useful offset.
2003 */
2004 BUG_ON(shift < PAGE_SHIFT);
2005 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
2006 off += (addr - vma->vm_start) >> shift;
2007 return offset_il_node(pol, off);
2008 } else
2009 return interleave_nodes(pol);
2010}
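/*
 * Worked example (editor's note, assuming a 4KB base page and a 2MB huge
 * page, i.e. PAGE_SHIFT = 12 and shift = 21): vm_pgoff is kept in 4KB units,
 * so vm_pgoff >> 9 converts it to huge-page units, and (addr - vm_start) >> 21
 * adds the huge-page index of @addr within the VMA; the sum is the interleave
 * offset handed to offset_il_node().
 */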
2011
2012#ifdef CONFIG_HUGETLBFS
2013/*
2014 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
2015 * @vma: virtual memory area whose policy is sought
2016 * @addr: address in @vma for shared policy lookup and interleave policy
2017 * @gfp_flags: for requested zone
2018 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2019 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2020 *
2021 * Returns a nid suitable for a huge page allocation and a pointer
2022 * to the struct mempolicy for conditional unref after allocation.
2023 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2024 * to the mempolicy's @nodemask for filtering the zonelist.
2025 *
2026 * Must be protected by read_mems_allowed_begin()
2027 */
2028int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2029 struct mempolicy **mpol, nodemask_t **nodemask)
2030{
2031 int nid;
2032 int mode;
2033
2034 *mpol = get_vma_policy(vma, addr);
2035 *nodemask = NULL;
2036 mode = (*mpol)->mode;
2037
2038 if (unlikely(mode == MPOL_INTERLEAVE)) {
2039 nid = interleave_nid(*mpol, vma, addr,
2040 huge_page_shift(hstate_vma(vma)));
2041 } else {
2042 nid = policy_node(gfp_flags, *mpol, numa_node_id());
2043 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
2044 *nodemask = &(*mpol)->nodes;
2045 }
2046 return nid;
2047}
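/*
 * Minimal usage sketch (editorial illustration, not from this file): a
 * hugetlb-style caller would typically wrap the lookup in the cpuset retry
 * protocol required above.  dequeue_from_node() is a hypothetical allocation
 * helper; huge_node(), mpol_cond_put() and the read_mems_allowed_*() helpers
 * are real.
 *
 *	do {
 *		unsigned int cookie = read_mems_allowed_begin();
 *		struct mempolicy *mpol;
 *		nodemask_t *nodemask;
 *		int nid;
 *
 *		nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *		folio = dequeue_from_node(nid, nodemask);
 *		mpol_cond_put(mpol);
 *		if (folio || !read_mems_allowed_retry(cookie))
 *			break;
 *	} while (1);
 */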
2048
2049/*
2050 * init_nodemask_of_mempolicy
2051 *
2052 * If the current task's mempolicy is "default" [NULL], return 'false'
2053 * to indicate default policy. Otherwise, extract the policy nodemask
2054 * for 'bind' or 'interleave' policy into the argument nodemask, or
2055 * initialize the argument nodemask to contain the single node for
2056 * 'preferred' or 'local' policy and return 'true' to indicate presence
2057 * of non-default mempolicy.
2058 *
2059 * We don't bother with reference counting the mempolicy [mpol_get/put]
2060 * because the current task is examining its own mempolicy and a task's
2061 * mempolicy is only ever changed by the task itself.
2062 *
2063 * N.B., it is the caller's responsibility to free a returned nodemask.
2064 */
2065bool init_nodemask_of_mempolicy(nodemask_t *mask)
2066{
2067 struct mempolicy *mempolicy;
2068
2069 if (!(mask && current->mempolicy))
2070 return false;
2071
2072 task_lock(current);
2073 mempolicy = current->mempolicy;
2074 switch (mempolicy->mode) {
2075 case MPOL_PREFERRED:
2076 case MPOL_PREFERRED_MANY:
2077 case MPOL_BIND:
2078 case MPOL_INTERLEAVE:
2079 *mask = mempolicy->nodes;
2080 break;
2081
2082 case MPOL_LOCAL:
2083 init_nodemask_of_node(mask, numa_node_id());
2084 break;
2085
2086 default:
2087 BUG();
2088 }
2089 task_unlock(current);
2090
2091 return true;
2092}
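/*
 * Usage sketch (editorial illustration): hugetlb-style sysfs/sysctl code can
 * use this to size per-node adjustments against the caller's own policy.
 * NODEMASK_ALLOC/NODEMASK_FREE and nodes_weight() are the real nodemask
 * helpers; the surrounding logic is only a sketch.
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);
 *	unsigned int nr_nodes;
 *
 *	if (nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed))
 *		nr_nodes = nodes_weight(*nodes_allowed);
 *	else
 *		nr_nodes = num_online_nodes();
 *	NODEMASK_FREE(nodes_allowed);
 */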
2093#endif
2094
2095/*
2096 * mempolicy_in_oom_domain
2097 *
2098 * If tsk's mempolicy is "bind", check for intersection between mask and
2099 * the policy nodemask. Otherwise, return true for all other policies
2100 * including "interleave", as a tsk with "interleave" policy may have
2101 * memory allocated from all nodes in the system.
2102 *
2103 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2104 */
2105bool mempolicy_in_oom_domain(struct task_struct *tsk,
2106 const nodemask_t *mask)
2107{
2108 struct mempolicy *mempolicy;
2109 bool ret = true;
2110
2111 if (!mask)
2112 return ret;
2113
2114 task_lock(tsk);
2115 mempolicy = tsk->mempolicy;
2116 if (mempolicy && mempolicy->mode == MPOL_BIND)
2117 ret = nodes_intersects(mempolicy->nodes, *mask);
2118 task_unlock(tsk);
2119
2120 return ret;
2121}
2122
2123 /* Allocate a page under the interleave policy.
2124 Own path because it needs to do special accounting. */
2125static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2126 unsigned nid)
2127{
2128 struct page *page;
2129
2130 page = __alloc_pages(gfp, order, nid, NULL);
2131 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2132 if (!static_branch_likely(&vm_numa_stat_key))
2133 return page;
2134 if (page && page_to_nid(page) == nid) {
2135 preempt_disable();
2136 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2137 preempt_enable();
2138 }
2139 return page;
2140}
2141
2142static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2143 int nid, struct mempolicy *pol)
2144{
2145 struct page *page;
2146 gfp_t preferred_gfp;
2147
2148 /*
2149 * This is a two pass approach. The first pass will only try the
2150 * preferred nodes but skip the direct reclaim and allow the
2151 * allocation to fail, while the second pass will try all the
2152 * nodes in the system.
2153 */
2154 preferred_gfp = gfp | __GFP_NOWARN;
2155 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2156 page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2157 if (!page)
2158 page = __alloc_pages(gfp, order, nid, NULL);
2159
2160 return page;
2161}
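/*
 * For illustration (editor's note): with gfp == GFP_KERNEL the first pass
 * uses (GFP_KERNEL | __GFP_NOWARN) & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL),
 * i.e. a quiet attempt that may wake kswapd but never direct-reclaims and is
 * restricted to pol->nodes; only if that returns NULL does the second pass
 * retry with the caller's original flags and no nodemask, so it may reclaim
 * and may land on any node.
 */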
2162
2163/**
2164 * vma_alloc_folio - Allocate a folio for a VMA.
2165 * @gfp: GFP flags.
2166 * @order: Order of the folio.
2167 * @vma: Pointer to VMA or NULL if not available.
2168 * @addr: Virtual address of the allocation. Must be inside @vma.
2169 * @hugepage: For hugepages try only the preferred node if possible.
2170 *
2171 * Allocate a folio for a specific address in @vma, using the appropriate
2172 * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2173 * of the mm_struct of the VMA to prevent it from going away. Should be
2174 * used for all allocations for folios that will be mapped into user space.
2175 *
2176 * Return: The folio on success or NULL if allocation fails.
2177 */
2178struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2179 unsigned long addr, bool hugepage)
2180{
2181 struct mempolicy *pol;
2182 int node = numa_node_id();
2183 struct folio *folio;
2184 int preferred_nid;
2185 nodemask_t *nmask;
2186
2187 pol = get_vma_policy(vma, addr);
2188
2189 if (pol->mode == MPOL_INTERLEAVE) {
2190 struct page *page;
2191 unsigned nid;
2192
2193 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2194 mpol_cond_put(pol);
2195 gfp |= __GFP_COMP;
2196 page = alloc_page_interleave(gfp, order, nid);
2197 folio = (struct folio *)page;
2198 if (folio && order > 1)
2199 folio_prep_large_rmappable(folio);
2200 goto out;
2201 }
2202
2203 if (pol->mode == MPOL_PREFERRED_MANY) {
2204 struct page *page;
2205
2206 node = policy_node(gfp, pol, node);
2207 gfp |= __GFP_COMP;
2208 page = alloc_pages_preferred_many(gfp, order, node, pol);
2209 mpol_cond_put(pol);
2210 folio = (struct folio *)page;
2211 if (folio && order > 1)
2212 folio_prep_large_rmappable(folio);
2213 goto out;
2214 }
2215
2216 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2217 int hpage_node = node;
2218
2219 /*
2220 * For hugepage allocation and non-interleave policy which
2221 * allows the current node (or other explicitly preferred
2222 * node) we only try to allocate from the current/preferred
2223 * node and don't fall back to other nodes, as the cost of
2224 * remote accesses would likely offset THP benefits.
2225 *
2226 * If the policy is interleave or does not allow the current
2227 * node in its nodemask, we allocate the standard way.
2228 */
2229 if (pol->mode == MPOL_PREFERRED)
2230 hpage_node = first_node(pol->nodes);
2231
2232 nmask = policy_nodemask(gfp, pol);
2233 if (!nmask || node_isset(hpage_node, *nmask)) {
2234 mpol_cond_put(pol);
2235 /*
2236 * First, try to allocate THP only on local node, but
2237 * don't reclaim unnecessarily, just compact.
2238 */
2239 folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2240 __GFP_NORETRY, order, hpage_node);
2241
2242 /*
2243 * If hugepage allocations are configured to always use
2244 * synchronous compaction or the vma has been madvised
2245 * to prefer hugepage backing, retry allowing remote
2246 * memory with both reclaim and compact as well.
2247 */
2248 if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2249 folio = __folio_alloc(gfp, order, hpage_node,
2250 nmask);
2251
2252 goto out;
2253 }
2254 }
2255
2256 nmask = policy_nodemask(gfp, pol);
2257 preferred_nid = policy_node(gfp, pol, node);
2258 folio = __folio_alloc(gfp, order, preferred_nid, nmask);
2259 mpol_cond_put(pol);
2260out:
2261 return folio;
2262}
2263EXPORT_SYMBOL(vma_alloc_folio);
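/*
 * Minimal usage sketch (editorial illustration): a fault handler that already
 * holds mmap_lock might allocate one user folio for the faulting address as
 * below; GFP_HIGHUSER_MOVABLE and VM_FAULT_OOM are the usual choices, but the
 * surrounding code is only a sketch.
 *
 *	struct folio *folio;
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address,
 *				false);
 *	if (!folio)
 *		return VM_FAULT_OOM;
 */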
2264
2265/**
2266 * alloc_pages - Allocate pages.
2267 * @gfp: GFP flags.
2268 * @order: Power of two of number of pages to allocate.
2269 *
2270 * Allocate 1 << @order contiguous pages. The physical address of the
2271 * first page is naturally aligned (e.g. an order-3 allocation will be aligned
2272 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2273 * process is honoured when in process context.
2274 *
2275 * Context: Can be called from any context, providing the appropriate GFP
2276 * flags are used.
2277 * Return: The page on success or NULL if allocation fails.
2278 */
2279struct page *alloc_pages(gfp_t gfp, unsigned order)
2280{
2281 struct mempolicy *pol = &default_policy;
2282 struct page *page;
2283
2284 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2285 pol = get_task_policy(current);
2286
2287 /*
2288 * No reference counting needed for current->mempolicy
2289 * or the system default_policy
2290 */
2291 if (pol->mode == MPOL_INTERLEAVE)
2292 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2293 else if (pol->mode == MPOL_PREFERRED_MANY)
2294 page = alloc_pages_preferred_many(gfp, order,
2295 policy_node(gfp, pol, numa_node_id()), pol);
2296 else
2297 page = __alloc_pages(gfp, order,
2298 policy_node(gfp, pol, numa_node_id()),
2299 policy_nodemask(gfp, pol));
2300
2301 return page;
2302}
2303EXPORT_SYMBOL(alloc_pages);
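/*
 * Usage sketch (editorial illustration): an order-2 allocation that honours
 * the current task's mempolicy, freed again with the matching order.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	...
 *	__free_pages(page, 2);
 */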
2304
2305struct folio *folio_alloc(gfp_t gfp, unsigned order)
2306{
2307 struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2308 struct folio *folio = (struct folio *)page;
2309
2310 if (folio && order > 1)
2311 folio_prep_large_rmappable(folio);
2312 return folio;
2313}
2314EXPORT_SYMBOL(folio_alloc);
2315
2316static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2317 struct mempolicy *pol, unsigned long nr_pages,
2318 struct page **page_array)
2319{
2320 int nodes;
2321 unsigned long nr_pages_per_node;
2322 int delta;
2323 int i;
2324 unsigned long nr_allocated;
2325 unsigned long total_allocated = 0;
2326
2327 nodes = nodes_weight(pol->nodes);
2328 nr_pages_per_node = nr_pages / nodes;
2329 delta = nr_pages - nodes * nr_pages_per_node;
2330
2331 for (i = 0; i < nodes; i++) {
2332 if (delta) {
2333 nr_allocated = __alloc_pages_bulk(gfp,
2334 interleave_nodes(pol), NULL,
2335 nr_pages_per_node + 1, NULL,
2336 page_array);
2337 delta--;
2338 } else {
2339 nr_allocated = __alloc_pages_bulk(gfp,
2340 interleave_nodes(pol), NULL,
2341 nr_pages_per_node, NULL, page_array);
2342 }
2343
2344 page_array += nr_allocated;
2345 total_allocated += nr_allocated;
2346 }
2347
2348 return total_allocated;
2349}
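/*
 * Worked example (editor's note): nr_pages = 10 across a 3-node interleave
 * gives nr_pages_per_node = 3 and delta = 1, so the first node visited is
 * asked for 4 pages and the other two for 3 each; if a bulk call returns
 * fewer pages than requested, total_allocated simply ends up short rather
 * than being retried.
 */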
2350
2351static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2352 struct mempolicy *pol, unsigned long nr_pages,
2353 struct page **page_array)
2354{
2355 gfp_t preferred_gfp;
2356 unsigned long nr_allocated = 0;
2357
2358 preferred_gfp = gfp | __GFP_NOWARN;
2359 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2360
2361 nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2362 nr_pages, NULL, page_array);
2363
2364 if (nr_allocated < nr_pages)
2365 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2366 nr_pages - nr_allocated, NULL,
2367 page_array + nr_allocated);
2368 return nr_allocated;
2369}
2370
2371/* Bulk page allocation and the mempolicy need to be considered at the
2372 * same time in some situations, such as vmalloc.
2373 *
2374 * Doing so can accelerate memory allocation, especially for the
2375 * interleave policy.
2376 */
2377unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2378 unsigned long nr_pages, struct page **page_array)
2379{
2380 struct mempolicy *pol = &default_policy;
2381
2382 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2383 pol = get_task_policy(current);
2384
2385 if (pol->mode == MPOL_INTERLEAVE)
2386 return alloc_pages_bulk_array_interleave(gfp, pol,
2387 nr_pages, page_array);
2388
2389 if (pol->mode == MPOL_PREFERRED_MANY)
2390 return alloc_pages_bulk_array_preferred_many(gfp,
2391 numa_node_id(), pol, nr_pages, page_array);
2392
2393 return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2394 policy_nodemask(gfp, pol), nr_pages, NULL,
2395 page_array);
2396}
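/*
 * Usage sketch (editorial illustration, loosely mirroring a vmalloc-style
 * caller; the single-page fallback is only hinted at):
 *
 *	struct page **pages = kvcalloc(nr, sizeof(*pages), GFP_KERNEL);
 *	unsigned long got;
 *
 *	if (!pages)
 *		return -ENOMEM;
 *	got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr, pages);
 *	if (got < nr)
 *		... allocate the remaining pages one at a time ...
 */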
2397
2398int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2399{
2400 struct mempolicy *pol = mpol_dup(vma_policy(src));
2401
2402 if (IS_ERR(pol))
2403 return PTR_ERR(pol);
2404 dst->vm_policy = pol;
2405 return 0;
2406}
2407
2408/*
2409 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2410 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2411 * with the mems_allowed returned by cpuset_mems_allowed(). This
2412 * keeps mempolicies cpuset-relative after its cpuset moves. See
2413 * further kernel/cpuset.c update_nodemask().
2414 *
2415 * current's mempolicy may be rebound by another task (the task that changes
2416 * cpuset's mems), so we needn't do the rebind work for the current task.
2417 */
2418
2419/* Slow path of a mempolicy duplicate */
2420struct mempolicy *__mpol_dup(struct mempolicy *old)
2421{
2422 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2423
2424 if (!new)
2425 return ERR_PTR(-ENOMEM);
2426
2427 /* task's mempolicy is protected by alloc_lock */
2428 if (old == current->mempolicy) {
2429 task_lock(current);
2430 *new = *old;
2431 task_unlock(current);
2432 } else
2433 *new = *old;
2434
2435 if (current_cpuset_is_being_rebound()) {
2436 nodemask_t mems = cpuset_mems_allowed(current);
2437 mpol_rebind_policy(new, &mems);
2438 }
2439 atomic_set(&new->refcnt, 1);
2440 return new;
2441}
2442
2443/* Slow path of a mempolicy comparison */
2444bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2445{
2446 if (!a || !b)
2447 return false;
2448 if (a->mode != b->mode)
2449 return false;
2450 if (a->flags != b->flags)
2451 return false;
2452 if (a->home_node != b->home_node)
2453 return false;
2454 if (mpol_store_user_nodemask(a))
2455 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2456 return false;
2457
2458 switch (a->mode) {
2459 case MPOL_BIND:
2460 case MPOL_INTERLEAVE:
2461 case MPOL_PREFERRED:
2462 case MPOL_PREFERRED_MANY:
2463 return !!nodes_equal(a->nodes, b->nodes);
2464 case MPOL_LOCAL:
2465 return true;
2466 default:
2467 BUG();
2468 return false;
2469 }
2470}
2471
2472/*
2473 * Shared memory backing store policy support.
2474 *
2475 * Remember policies even when nobody has shared memory mapped.
2476 * The policies are kept in Red-Black tree linked from the inode.
2477 * They are protected by the sp->lock rwlock, which should be held
2478 * for any accesses to the tree.
2479 */
2480
2481/*
2482 * Look up the first element intersecting start-end. Caller holds sp->lock
2483 * for reading or for writing.
2484 */
2485static struct sp_node *
2486sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2487{
2488 struct rb_node *n = sp->root.rb_node;
2489
2490 while (n) {
2491 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2492
2493 if (start >= p->end)
2494 n = n->rb_right;
2495 else if (end <= p->start)
2496 n = n->rb_left;
2497 else
2498 break;
2499 }
2500 if (!n)
2501 return NULL;
2502 for (;;) {
2503 struct sp_node *w = NULL;
2504 struct rb_node *prev = rb_prev(n);
2505 if (!prev)
2506 break;
2507 w = rb_entry(prev, struct sp_node, nd);
2508 if (w->end <= start)
2509 break;
2510 n = prev;
2511 }
2512 return rb_entry(n, struct sp_node, nd);
2513}
2514
2515/*
2516 * Insert a new shared policy into the list. Caller holds sp->lock for
2517 * writing.
2518 */
2519static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2520{
2521 struct rb_node **p = &sp->root.rb_node;
2522 struct rb_node *parent = NULL;
2523 struct sp_node *nd;
2524
2525 while (*p) {
2526 parent = *p;
2527 nd = rb_entry(parent, struct sp_node, nd);
2528 if (new->start < nd->start)
2529 p = &(*p)->rb_left;
2530 else if (new->end > nd->end)
2531 p = &(*p)->rb_right;
2532 else
2533 BUG();
2534 }
2535 rb_link_node(&new->nd, parent, p);
2536 rb_insert_color(&new->nd, &sp->root);
2537 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2538 new->policy ? new->policy->mode : 0);
2539}
2540
2541/* Find shared policy intersecting idx */
2542struct mempolicy *
2543mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2544{
2545 struct mempolicy *pol = NULL;
2546 struct sp_node *sn;
2547
2548 if (!sp->root.rb_node)
2549 return NULL;
2550 read_lock(&sp->lock);
2551 sn = sp_lookup(sp, idx, idx+1);
2552 if (sn) {
2553 mpol_get(sn->policy);
2554 pol = sn->policy;
2555 }
2556 read_unlock(&sp->lock);
2557 return pol;
2558}
2559
2560static void sp_free(struct sp_node *n)
2561{
2562 mpol_put(n->policy);
2563 kmem_cache_free(sn_cache, n);
2564}
2565
2566/**
2567 * mpol_misplaced - check whether current page node is valid in policy
2568 *
2569 * @page: page to be checked
2570 * @vma: vm area where page mapped
2571 * @addr: virtual address where page mapped
2572 *
2573 * Look up the current policy node id for vma,addr and "compare to" the page's
2574 * node id. Policy determination "mimics" alloc_page_vma().
2575 * Called from fault path where we know the vma and faulting address.
2576 *
2577 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2578 * policy, or a suitable node ID to allocate a replacement page from.
2579 */
2580int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2581{
2582 struct mempolicy *pol;
2583 struct zoneref *z;
2584 int curnid = page_to_nid(page);
2585 unsigned long pgoff;
2586 int thiscpu = raw_smp_processor_id();
2587 int thisnid = cpu_to_node(thiscpu);
2588 int polnid = NUMA_NO_NODE;
2589 int ret = NUMA_NO_NODE;
2590
2591 pol = get_vma_policy(vma, addr);
2592 if (!(pol->flags & MPOL_F_MOF))
2593 goto out;
2594
2595 switch (pol->mode) {
2596 case MPOL_INTERLEAVE:
2597 pgoff = vma->vm_pgoff;
2598 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2599 polnid = offset_il_node(pol, pgoff);
2600 break;
2601
2602 case MPOL_PREFERRED:
2603 if (node_isset(curnid, pol->nodes))
2604 goto out;
2605 polnid = first_node(pol->nodes);
2606 break;
2607
2608 case MPOL_LOCAL:
2609 polnid = numa_node_id();
2610 break;
2611
2612 case MPOL_BIND:
2613 /* Optimize placement among multiple nodes via NUMA balancing */
2614 if (pol->flags & MPOL_F_MORON) {
2615 if (node_isset(thisnid, pol->nodes))
2616 break;
2617 goto out;
2618 }
2619 fallthrough;
2620
2621 case MPOL_PREFERRED_MANY:
2622 /*
2623 * use current page if in policy nodemask,
2624 * else select nearest allowed node, if any.
2625 * If no allowed nodes, use current [!misplaced].
2626 */
2627 if (node_isset(curnid, pol->nodes))
2628 goto out;
2629 z = first_zones_zonelist(
2630 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2631 gfp_zone(GFP_HIGHUSER),
2632 &pol->nodes);
2633 polnid = zone_to_nid(z->zone);
2634 break;
2635
2636 default:
2637 BUG();
2638 }
2639
2640 /* Migrate the page towards the node whose CPU is referencing it */
2641 if (pol->flags & MPOL_F_MORON) {
2642 polnid = thisnid;
2643
2644 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2645 goto out;
2646 }
2647
2648 if (curnid != polnid)
2649 ret = polnid;
2650out:
2651 mpol_cond_put(pol);
2652
2653 return ret;
2654}
2655
2656/*
2657 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2658 * dropped after task->mempolicy is set to NULL so that any allocation done as
2659 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2660 * policy.
2661 */
2662void mpol_put_task_policy(struct task_struct *task)
2663{
2664 struct mempolicy *pol;
2665
2666 task_lock(task);
2667 pol = task->mempolicy;
2668 task->mempolicy = NULL;
2669 task_unlock(task);
2670 mpol_put(pol);
2671}
2672
2673static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2674{
2675	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2676 rb_erase(&n->nd, &sp->root);
2677 sp_free(n);
2678}
2679
2680static void sp_node_init(struct sp_node *node, unsigned long start,
2681 unsigned long end, struct mempolicy *pol)
2682{
2683 node->start = start;
2684 node->end = end;
2685 node->policy = pol;
2686}
2687
2688static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2689 struct mempolicy *pol)
2690{
2691 struct sp_node *n;
2692 struct mempolicy *newpol;
2693
2694 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2695 if (!n)
2696 return NULL;
2697
2698 newpol = mpol_dup(pol);
2699 if (IS_ERR(newpol)) {
2700 kmem_cache_free(sn_cache, n);
2701 return NULL;
2702 }
2703 newpol->flags |= MPOL_F_SHARED;
2704 sp_node_init(n, start, end, newpol);
2705
2706 return n;
2707}
2708
2709/* Replace a policy range. */
2710static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2711 unsigned long end, struct sp_node *new)
2712{
2713 struct sp_node *n;
2714 struct sp_node *n_new = NULL;
2715 struct mempolicy *mpol_new = NULL;
2716 int ret = 0;
2717
2718restart:
2719 write_lock(&sp->lock);
2720 n = sp_lookup(sp, start, end);
2721 /* Take care of old policies in the same range. */
2722 while (n && n->start < end) {
2723 struct rb_node *next = rb_next(&n->nd);
2724 if (n->start >= start) {
2725 if (n->end <= end)
2726 sp_delete(sp, n);
2727 else
2728 n->start = end;
2729 } else {
2730 /* Old policy spanning whole new range. */
2731 if (n->end > end) {
2732 if (!n_new)
2733 goto alloc_new;
2734
2735 *mpol_new = *n->policy;
2736 atomic_set(&mpol_new->refcnt, 1);
2737 sp_node_init(n_new, end, n->end, mpol_new);
2738 n->end = start;
2739 sp_insert(sp, n_new);
2740 n_new = NULL;
2741 mpol_new = NULL;
2742 break;
2743 } else
2744 n->end = start;
2745 }
2746 if (!next)
2747 break;
2748 n = rb_entry(next, struct sp_node, nd);
2749 }
2750 if (new)
2751 sp_insert(sp, new);
2752 write_unlock(&sp->lock);
2753 ret = 0;
2754
2755err_out:
2756 if (mpol_new)
2757 mpol_put(mpol_new);
2758 if (n_new)
2759 kmem_cache_free(sn_cache, n_new);
2760
2761 return ret;
2762
2763alloc_new:
2764 write_unlock(&sp->lock);
2765 ret = -ENOMEM;
2766 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2767 if (!n_new)
2768 goto err_out;
2769 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2770 if (!mpol_new)
2771 goto err_out;
2772 atomic_set(&mpol_new->refcnt, 1);
2773 goto restart;
2774}
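/*
 * Informal example (editor's note): if the tree holds a single node covering
 * pages [0, 10) with policy A and the caller replaces [3, 7) with policy B,
 * the old node is trimmed to [0, 3), a duplicate of A is inserted for
 * [7, 10) using the preallocated n_new/mpol_new pair, and the new node
 * carrying B is inserted for [3, 7), leaving A-B-A coverage of the range.
 */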
2775
2776/**
2777 * mpol_shared_policy_init - initialize shared policy for inode
2778 * @sp: pointer to inode shared policy
2779 * @mpol: struct mempolicy to install
2780 *
2781 * Install non-NULL @mpol in inode's shared policy rb-tree.
2782 * On entry, the current task has a reference on a non-NULL @mpol.
2783 * This must be released on exit.
2784 * This is called at get_inode() time, so we can use GFP_KERNEL.
2785 */
2786void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2787{
2788 int ret;
2789
2790 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2791 rwlock_init(&sp->lock);
2792
2793 if (mpol) {
2794 struct vm_area_struct pvma;
2795 struct mempolicy *new;
2796 NODEMASK_SCRATCH(scratch);
2797
2798 if (!scratch)
2799 goto put_mpol;
2800 /* contextualize the tmpfs mount point mempolicy */
2801 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2802 if (IS_ERR(new))
2803 goto free_scratch; /* no valid nodemask intersection */
2804
2805 task_lock(current);
2806 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2807 task_unlock(current);
2808 if (ret)
2809 goto put_new;
2810
2811 /* Create pseudo-vma that contains just the policy */
2812 vma_init(&pvma, NULL);
2813 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2814 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2815
2816put_new:
2817 mpol_put(new); /* drop initial ref */
2818free_scratch:
2819 NODEMASK_SCRATCH_FREE(scratch);
2820put_mpol:
2821 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2822 }
2823}
2824
2825int mpol_set_shared_policy(struct shared_policy *info,
2826 struct vm_area_struct *vma, struct mempolicy *npol)
2827{
2828 int err;
2829 struct sp_node *new = NULL;
2830 unsigned long sz = vma_pages(vma);
2831
2832 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2833 vma->vm_pgoff,
2834 sz, npol ? npol->mode : -1,
2835 npol ? npol->flags : -1,
2836 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2837
2838 if (npol) {
2839 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2840 if (!new)
2841 return -ENOMEM;
2842 }
2843 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2844 if (err && new)
2845 sp_free(new);
2846 return err;
2847}
2848
2849/* Free a backing policy store on inode delete. */
2850void mpol_free_shared_policy(struct shared_policy *p)
2851{
2852 struct sp_node *n;
2853 struct rb_node *next;
2854
2855 if (!p->root.rb_node)
2856 return;
2857 write_lock(&p->lock);
2858 next = rb_first(&p->root);
2859 while (next) {
2860 n = rb_entry(next, struct sp_node, nd);
2861 next = rb_next(&n->nd);
2862 sp_delete(p, n);
2863 }
2864 write_unlock(&p->lock);
2865}
2866
2867#ifdef CONFIG_NUMA_BALANCING
2868static int __initdata numabalancing_override;
2869
2870static void __init check_numabalancing_enable(void)
2871{
2872 bool numabalancing_default = false;
2873
2874 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2875 numabalancing_default = true;
2876
2877 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2878 if (numabalancing_override)
2879 set_numabalancing_state(numabalancing_override == 1);
2880
2881 if (num_online_nodes() > 1 && !numabalancing_override) {
2882 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2883 numabalancing_default ? "Enabling" : "Disabling");
2884 set_numabalancing_state(numabalancing_default);
2885 }
2886}
2887
2888static int __init setup_numabalancing(char *str)
2889{
2890 int ret = 0;
2891 if (!str)
2892 goto out;
2893
2894 if (!strcmp(str, "enable")) {
2895 numabalancing_override = 1;
2896 ret = 1;
2897 } else if (!strcmp(str, "disable")) {
2898 numabalancing_override = -1;
2899 ret = 1;
2900 }
2901out:
2902 if (!ret)
2903 pr_warn("Unable to parse numa_balancing=\n");
2904
2905 return ret;
2906}
2907__setup("numa_balancing=", setup_numabalancing);
2908#else
2909static inline void __init check_numabalancing_enable(void)
2910{
2911}
2912#endif /* CONFIG_NUMA_BALANCING */
2913
2914/* assumes fs == KERNEL_DS */
2915void __init numa_policy_init(void)
2916{
2917 nodemask_t interleave_nodes;
2918 unsigned long largest = 0;
2919 int nid, prefer = 0;
2920
2921 policy_cache = kmem_cache_create("numa_policy",
2922 sizeof(struct mempolicy),
2923 0, SLAB_PANIC, NULL);
2924
2925 sn_cache = kmem_cache_create("shared_policy_node",
2926 sizeof(struct sp_node),
2927 0, SLAB_PANIC, NULL);
2928
2929 for_each_node(nid) {
2930 preferred_node_policy[nid] = (struct mempolicy) {
2931 .refcnt = ATOMIC_INIT(1),
2932 .mode = MPOL_PREFERRED,
2933 .flags = MPOL_F_MOF | MPOL_F_MORON,
2934 .nodes = nodemask_of_node(nid),
2935 };
2936 }
2937
2938 /*
2939 * Set interleaving policy for system init. Interleaving is only
2940 * enabled across suitably sized nodes (default is >= 16MB), falling
2941 * back to the largest node if they're all smaller.
2942 */
2943 nodes_clear(interleave_nodes);
2944 for_each_node_state(nid, N_MEMORY) {
2945 unsigned long total_pages = node_present_pages(nid);
2946
2947 /* Preserve the largest node */
2948 if (largest < total_pages) {
2949 largest = total_pages;
2950 prefer = nid;
2951 }
2952
2953 /* Interleave this node? */
2954 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2955 node_set(nid, interleave_nodes);
2956 }
2957
2958 /* All too small, use the largest */
2959 if (unlikely(nodes_empty(interleave_nodes)))
2960 node_set(prefer, interleave_nodes);
2961
2962 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2963 pr_err("%s: interleaving failed\n", __func__);
2964
2965 check_numabalancing_enable();
2966}
2967
2968/* Reset policy of current process to default */
2969void numa_default_policy(void)
2970{
2971 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2972}
2973
2974/*
2975 * Parse and format mempolicy from/to strings
2976 */
2977
2978static const char * const policy_modes[] =
2979{
2980 [MPOL_DEFAULT] = "default",
2981 [MPOL_PREFERRED] = "prefer",
2982 [MPOL_BIND] = "bind",
2983 [MPOL_INTERLEAVE] = "interleave",
2984 [MPOL_LOCAL] = "local",
2985 [MPOL_PREFERRED_MANY] = "prefer (many)",
2986};
2987
2988
2989#ifdef CONFIG_TMPFS
2990/**
2991 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2992 * @str: string containing mempolicy to parse
2993 * @mpol: pointer to struct mempolicy pointer, returned on success.
2994 *
2995 * Format of input:
2996 * <mode>[=<flags>][:<nodelist>]
2997 *
2998 * Return: %0 on success, else %1
2999 */
3000int mpol_parse_str(char *str, struct mempolicy **mpol)
3001{
3002 struct mempolicy *new = NULL;
3003 unsigned short mode_flags;
3004 nodemask_t nodes;
3005 char *nodelist = strchr(str, ':');
3006 char *flags = strchr(str, '=');
3007 int err = 1, mode;
3008
3009 if (flags)
3010 *flags++ = '\0'; /* terminate mode string */
3011
3012 if (nodelist) {
3013 /* NUL-terminate mode or flags string */
3014 *nodelist++ = '\0';
3015 if (nodelist_parse(nodelist, nodes))
3016 goto out;
3017 if (!nodes_subset(nodes, node_states[N_MEMORY]))
3018 goto out;
3019 } else
3020 nodes_clear(nodes);
3021
3022 mode = match_string(policy_modes, MPOL_MAX, str);
3023 if (mode < 0)
3024 goto out;
3025
3026 switch (mode) {
3027 case MPOL_PREFERRED:
3028 /*
3029 * Insist on a nodelist of one node only, although later
3030 * we use first_node(nodes) to grab a single node, so here
3031 * nodelist (or nodes) cannot be empty.
3032 */
3033 if (nodelist) {
3034 char *rest = nodelist;
3035 while (isdigit(*rest))
3036 rest++;
3037 if (*rest)
3038 goto out;
3039 if (nodes_empty(nodes))
3040 goto out;
3041 }
3042 break;
3043 case MPOL_INTERLEAVE:
3044 /*
3045 * Default to online nodes with memory if no nodelist
3046 */
3047 if (!nodelist)
3048 nodes = node_states[N_MEMORY];
3049 break;
3050 case MPOL_LOCAL:
3051 /*
3052 * Don't allow a nodelist; mpol_new() checks flags
3053 */
3054 if (nodelist)
3055 goto out;
3056 break;
3057 case MPOL_DEFAULT:
3058 /*
3059	 * Insist on an empty nodelist
3060 */
3061 if (!nodelist)
3062 err = 0;
3063 goto out;
3064 case MPOL_PREFERRED_MANY:
3065 case MPOL_BIND:
3066 /*
3067 * Insist on a nodelist
3068 */
3069 if (!nodelist)
3070 goto out;
3071 }
3072
3073 mode_flags = 0;
3074 if (flags) {
3075 /*
3076 * Currently, we only support two mutually exclusive
3077 * mode flags.
3078 */
3079 if (!strcmp(flags, "static"))
3080 mode_flags |= MPOL_F_STATIC_NODES;
3081 else if (!strcmp(flags, "relative"))
3082 mode_flags |= MPOL_F_RELATIVE_NODES;
3083 else
3084 goto out;
3085 }
3086
3087 new = mpol_new(mode, mode_flags, &nodes);
3088 if (IS_ERR(new))
3089 goto out;
3090
3091 /*
3092 * Save nodes for mpol_to_str() to show the tmpfs mount options
3093 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3094 */
3095 if (mode != MPOL_PREFERRED) {
3096 new->nodes = nodes;
3097 } else if (nodelist) {
3098 nodes_clear(new->nodes);
3099 node_set(first_node(nodes), new->nodes);
3100 } else {
3101 new->mode = MPOL_LOCAL;
3102 }
3103
3104 /*
3105 * Save nodes for contextualization: this will be used to "clone"
3106 * the mempolicy in a specific context [cpuset] at a later time.
3107 */
3108 new->w.user_nodemask = nodes;
3109
3110 err = 0;
3111
3112out:
3113 /* Restore string for error message */
3114 if (nodelist)
3115 *--nodelist = ':';
3116 if (flags)
3117 *--flags = '=';
3118 if (!err)
3119 *mpol = new;
3120 return err;
3121}
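/*
 * Illustrative inputs (editor's examples): tmpfs passes the value of its
 * "mpol=" mount option to this function, so typical strings are:
 *
 *	interleave:0-3		interleave across nodes 0,1,2,3
 *	bind=static:0,2		bind to nodes 0 and 2 with MPOL_F_STATIC_NODES
 *	prefer:1		prefer node 1, with normal fallback
 *	local			allocate on the node of the faulting CPU
 */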
3122#endif /* CONFIG_TMPFS */
3123
3124/**
3125 * mpol_to_str - format a mempolicy structure for printing
3126 * @buffer: to contain formatted mempolicy string
3127 * @maxlen: length of @buffer
3128 * @pol: pointer to mempolicy to be formatted
3129 *
3130 * Convert @pol into a string. If @buffer is too short, truncate the string.
3131 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3132 * longest flag, "relative", and to display at least a few node ids.
3133 */
3134void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3135{
3136 char *p = buffer;
3137 nodemask_t nodes = NODE_MASK_NONE;
3138 unsigned short mode = MPOL_DEFAULT;
3139 unsigned short flags = 0;
3140
3141 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3142 mode = pol->mode;
3143 flags = pol->flags;
3144 }
3145
3146 switch (mode) {
3147 case MPOL_DEFAULT:
3148 case MPOL_LOCAL:
3149 break;
3150 case MPOL_PREFERRED:
3151 case MPOL_PREFERRED_MANY:
3152 case MPOL_BIND:
3153 case MPOL_INTERLEAVE:
3154 nodes = pol->nodes;
3155 break;
3156 default:
3157 WARN_ON_ONCE(1);
3158 snprintf(p, maxlen, "unknown");
3159 return;
3160 }
3161
3162 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3163
3164 if (flags & MPOL_MODE_FLAGS) {
3165 p += snprintf(p, buffer + maxlen - p, "=");
3166
3167 /*
3168 * Currently, the only defined flags are mutually exclusive
3169 */
3170 if (flags & MPOL_F_STATIC_NODES)
3171 p += snprintf(p, buffer + maxlen - p, "static");
3172 else if (flags & MPOL_F_RELATIVE_NODES)
3173 p += snprintf(p, buffer + maxlen - p, "relative");
3174 }
3175
3176 if (!nodes_empty(nodes))
3177 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3178 nodemask_pr_args(&nodes));
3179}
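/*
 * Example output (editor's note): an MPOL_INTERLEAVE policy over nodes 0-3
 * with MPOL_F_STATIC_NODES is formatted as "interleave=static:0-3", while a
 * NULL policy, the system default_policy or an MPOL_F_MORON auto-NUMA policy
 * is formatted simply as "default".
 */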