mm/mempolicy.c at v5.4-rc7 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / mm / mempolicy.c
at v5.4-rc7 2968 lines 76 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Simple NUMA memory policy for the Linux kernel.
   4 *
   5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case NUMA_NO_NODE here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *		  in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66*/
  67
  68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70#include <linux/mempolicy.h>
  71#include <linux/pagewalk.h>
  72#include <linux/highmem.h>
  73#include <linux/hugetlb.h>
  74#include <linux/kernel.h>
  75#include <linux/sched.h>
  76#include <linux/sched/mm.h>
  77#include <linux/sched/numa_balancing.h>
  78#include <linux/sched/task.h>
  79#include <linux/nodemask.h>
  80#include <linux/cpuset.h>
  81#include <linux/slab.h>
  82#include <linux/string.h>
  83#include <linux/export.h>
  84#include <linux/nsproxy.h>
  85#include <linux/interrupt.h>
  86#include <linux/init.h>
  87#include <linux/compat.h>
  88#include <linux/ptrace.h>
  89#include <linux/swap.h>
  90#include <linux/seq_file.h>
  91#include <linux/proc_fs.h>
  92#include <linux/migrate.h>
  93#include <linux/ksm.h>
  94#include <linux/rmap.h>
  95#include <linux/security.h>
  96#include <linux/syscalls.h>
  97#include <linux/ctype.h>
  98#include <linux/mm_inline.h>
  99#include <linux/mmu_notifier.h>
 100#include <linux/printk.h>
 101#include <linux/swapops.h>
 102
 103#include <asm/tlbflush.h>
 104#include <linux/uaccess.h>
 105
 106#include "internal.h"
 107
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

/* Slab cache from which mpol_new() allocates and __mpol_put() frees policies. */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

/*
 * Per-node policies, indexed by node id.  get_task_policy() hands one of
 * these out for tasks without a policy of their own; an entry whose mode is
 * still zero is treated as not yet initialised.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 129
 130struct mempolicy *get_task_policy(struct task_struct *p)
 131{
 132	struct mempolicy *pol = p->mempolicy;
 133	int node;
 134
 135	if (pol)
 136		return pol;
 137
 138	node = numa_node_id();
 139	if (node != NUMA_NO_NODE) {
 140		pol = &preferred_node_policy[node];
 141		/* preferred_node_policy is not initialised early in boot */
 142		if (pol->mode)
 143			return pol;
 144	}
 145
 146	return &default_policy;
 147}
 148
/*
 * Per-mode callbacks, indexed by policy mode.  This is only the forward
 * declaration; the table itself is defined further down in this file.
 *
 * create: install the nodemask @nodes into a freshly allocated @pol,
 *         returning 0 or a negative errno.
 * rebind: adapt @pol to a new set of allowed nodes.
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
 153
 154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 155{
 156	return pol->flags & MPOL_MODE_FLAGS;
 157}
 158
 159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 160				   const nodemask_t *rel)
 161{
 162	nodemask_t tmp;
 163	nodes_fold(tmp, *orig, nodes_weight(*rel));
 164	nodes_onto(*ret, tmp, *rel);
 165}
 166
 167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 168{
 169	if (nodes_empty(*nodes))
 170		return -EINVAL;
 171	pol->v.nodes = *nodes;
 172	return 0;
 173}
 174
 175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177	if (!nodes)
 178		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
 179	else if (nodes_empty(*nodes))
 180		return -EINVAL;			/*  no allowed nodes */
 181	else
 182		pol->v.preferred_node = first_node(*nodes);
 183	return 0;
 184}
 185
 186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 187{
 188	if (nodes_empty(*nodes))
 189		return -EINVAL;
 190	pol->v.nodes = *nodes;
 191	return 0;
 192}
 193
 194/*
 195 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 196 * any, for the new policy.  mpol_new() has already validated the nodes
 197 * parameter with respect to the policy mode and flags.  But, we need to
 198 * handle an empty nodemask with MPOL_PREFERRED here.
 199 *
 200 * Must be called holding task's alloc_lock to protect task's mems_allowed
 201 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 202 */
 203static int mpol_set_nodemask(struct mempolicy *pol,
 204		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 205{
 206	int ret;
 207
 208	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 209	if (pol == NULL)
 210		return 0;
 211	/* Check N_MEMORY */
 212	nodes_and(nsc->mask1,
 213		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 214
 215	VM_BUG_ON(!nodes);
 216	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 217		nodes = NULL;	/* explicit local allocation */
 218	else {
 219		if (pol->flags & MPOL_F_RELATIVE_NODES)
 220			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 221		else
 222			nodes_and(nsc->mask2, *nodes, nsc->mask1);
 223
 224		if (mpol_store_user_nodemask(pol))
 225			pol->w.user_nodemask = *nodes;
 226		else
 227			pol->w.cpuset_mems_allowed =
 228						cpuset_current_mems_allowed;
 229	}
 230
 231	if (nodes)
 232		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 233	else
 234		ret = mpol_ops[pol->mode].create(pol, NULL);
 235	return ret;
 236}
 237
 238/*
 239 * This function just creates a new policy, does some check and simple
 240 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 241 */
 242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 243				  nodemask_t *nodes)
 244{
 245	struct mempolicy *policy;
 246
 247	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 248		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 249
 250	if (mode == MPOL_DEFAULT) {
 251		if (nodes && !nodes_empty(*nodes))
 252			return ERR_PTR(-EINVAL);
 253		return NULL;
 254	}
 255	VM_BUG_ON(!nodes);
 256
 257	/*
 258	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 259	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 260	 * All other modes require a valid pointer to a non-empty nodemask.
 261	 */
 262	if (mode == MPOL_PREFERRED) {
 263		if (nodes_empty(*nodes)) {
 264			if (((flags & MPOL_F_STATIC_NODES) ||
 265			     (flags & MPOL_F_RELATIVE_NODES)))
 266				return ERR_PTR(-EINVAL);
 267		}
 268	} else if (mode == MPOL_LOCAL) {
 269		if (!nodes_empty(*nodes) ||
 270		    (flags & MPOL_F_STATIC_NODES) ||
 271		    (flags & MPOL_F_RELATIVE_NODES))
 272			return ERR_PTR(-EINVAL);
 273		mode = MPOL_PREFERRED;
 274	} else if (nodes_empty(*nodes))
 275		return ERR_PTR(-EINVAL);
 276	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 277	if (!policy)
 278		return ERR_PTR(-ENOMEM);
 279	atomic_set(&policy->refcnt, 1);
 280	policy->mode = mode;
 281	policy->flags = flags;
 282
 283	return policy;
 284}
 285
 286/* Slow path of a mpol destructor. */
 287void __mpol_put(struct mempolicy *p)
 288{
 289	if (!atomic_dec_and_test(&p->refcnt))
 290		return;
 291	kmem_cache_free(policy_cache, p);
 292}
 293
/* ->rebind() for MPOL_DEFAULT: intentionally does nothing. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
 297
 298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 299{
 300	nodemask_t tmp;
 301
 302	if (pol->flags & MPOL_F_STATIC_NODES)
 303		nodes_and(tmp, pol->w.user_nodemask, *nodes);
 304	else if (pol->flags & MPOL_F_RELATIVE_NODES)
 305		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 306	else {
 307		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
 308								*nodes);
 309		pol->w.cpuset_mems_allowed = *nodes;
 310	}
 311
 312	if (nodes_empty(tmp))
 313		tmp = *nodes;
 314
 315	pol->v.nodes = tmp;
 316}
 317
 318static void mpol_rebind_preferred(struct mempolicy *pol,
 319						const nodemask_t *nodes)
 320{
 321	nodemask_t tmp;
 322
 323	if (pol->flags & MPOL_F_STATIC_NODES) {
 324		int node = first_node(pol->w.user_nodemask);
 325
 326		if (node_isset(node, *nodes)) {
 327			pol->v.preferred_node = node;
 328			pol->flags &= ~MPOL_F_LOCAL;
 329		} else
 330			pol->flags |= MPOL_F_LOCAL;
 331	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 332		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 333		pol->v.preferred_node = first_node(tmp);
 334	} else if (!(pol->flags & MPOL_F_LOCAL)) {
 335		pol->v.preferred_node = node_remap(pol->v.preferred_node,
 336						   pol->w.cpuset_mems_allowed,
 337						   *nodes);
 338		pol->w.cpuset_mems_allowed = *nodes;
 339	}
 340}
 341
 342/*
 343 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 344 *
 345 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 346 * policies are protected by task->mems_allowed_seq to prevent a premature
 347 * OOM/allocation failure due to parallel nodemask modification.
 348 */
 349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 350{
 351	if (!pol)
 352		return;
 353	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 354	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 355		return;
 356
 357	mpol_ops[pol->mode].rebind(pol, newmask);
 358}
 359
 360/*
 361 * Wrapper for mpol_rebind_policy() that just requires task
 362 * pointer, and updates task mempolicy.
 363 *
 364 * Called with task's alloc_lock held.
 365 */
 366
 367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 368{
 369	mpol_rebind_policy(tsk->mempolicy, new);
 370}
 371
 372/*
 373 * Rebind each vma in mm to new nodemask.
 374 *
 375 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 376 */
 377
 378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 379{
 380	struct vm_area_struct *vma;
 381
 382	down_write(&mm->mmap_sem);
 383	for (vma = mm->mmap; vma; vma = vma->vm_next)
 384		mpol_rebind_policy(vma->vm_policy, new);
 385	up_write(&mm->mmap_sem);
 386}
 387
 388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 389	[MPOL_DEFAULT] = {
 390		.rebind = mpol_rebind_default,
 391	},
 392	[MPOL_INTERLEAVE] = {
 393		.create = mpol_new_interleave,
 394		.rebind = mpol_rebind_nodemask,
 395	},
 396	[MPOL_PREFERRED] = {
 397		.create = mpol_new_preferred,
 398		.rebind = mpol_rebind_preferred,
 399	},
 400	[MPOL_BIND] = {
 401		.create = mpol_new_bind,
 402		.rebind = mpol_rebind_nodemask,
 403	},
 404};
 405
/* Defined later in this file; needed by the page table walkers below. */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* State handed to the queue_pages_* page walk callbacks via walk->private. */
struct queue_pages {
	struct list_head *pagelist;	/* list that isolated pages are added to */
	unsigned long flags;		/* MPOL_MF_* flags controlling the walk */
	nodemask_t *nmask;		/* nodes tested by queue_pages_required() */
	struct vm_area_struct *prev;	/* vma seen by the previous test_walk call */
};
 415
 416/*
 417 * Check if the page's nid is in qp->nmask.
 418 *
 419 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 420 * in the invert of qp->nmask.
 421 */
 422static inline bool queue_pages_required(struct page *page,
 423					struct queue_pages *qp)
 424{
 425	int nid = page_to_nid(page);
 426	unsigned long flags = qp->flags;
 427
 428	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 429}
 430
/*
 * Handle a transparent huge pmd on behalf of queue_pages_pte_range().
 * Entered with @ptl held; the lock is always dropped before returning.
 *
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		/* The lock is released before the split, hence "goto out". */
		spin_unlock(ptl);
		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
		ret = 2;
		goto out;
	}
	/* Page not selected by nmask/MPOL_MF_INVERT: nothing to do, ret == 0. */
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		/* No move requested, so a selected page is reported as -EIO. */
		ret = -EIO;
unlock:
	spin_unlock(ptl);
out:
	return ret;
}
 478
/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * Installed as the ->pmd_entry callback of queue_pages_walk_ops.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte;
	spinlock_t *ptl;

	/* A non-NULL ptl means a huge pmd: queue_pages_pmd() handles it. */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range.  Still
			 * need migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			/*
			 * No move requested: stop at the first selected page.
			 * Leaving the loop with addr != end is what turns
			 * into -EIO below.
			 */
			break;
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	/* addr == end only when the loop ran to completion. */
	return addr != end ? -EIO : 0;
}
 553
/*
 * ->hugetlb_entry callback of queue_pages_walk_ops.
 *
 * Under the huge pte lock, isolate the huge page mapped by @pte onto
 * qp->pagelist if it is present, selected by queue_pages_required(), and
 * the move flags allow it.  Always returns 0; the result of
 * isolate_huge_page() is not checked.  Without CONFIG_HUGETLB_PAGE reaching
 * this function is a BUG().
 */
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;
	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, qp->pagelist);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return 0;
}
 583
 584#ifdef CONFIG_NUMA_BALANCING
 585/*
 586 * This is used to mark a range of virtual addresses to be inaccessible.
 587 * These are later cleared by a NUMA hinting fault. Depending on these
 588 * faults, pages may be migrated for better NUMA placement.
 589 *
 590 * This is assuming that NUMA faults are handled using PROT_NONE. If
 591 * an architecture makes a different choice, it will need further
 592 * changes to the core.
 593 */
 594unsigned long change_prot_numa(struct vm_area_struct *vma,
 595			unsigned long addr, unsigned long end)
 596{
 597	int nr_updated;
 598
 599	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 600	if (nr_updated)
 601		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 602
 603	return nr_updated;
 604}
 605#else
 606static unsigned long change_prot_numa(struct vm_area_struct *vma,
 607			unsigned long addr, unsigned long end)
 608{
 609	return 0;
 610}
 611#endif /* CONFIG_NUMA_BALANCING */
 612
 613static int queue_pages_test_walk(unsigned long start, unsigned long end,
 614				struct mm_walk *walk)
 615{
 616	struct vm_area_struct *vma = walk->vma;
 617	struct queue_pages *qp = walk->private;
 618	unsigned long endvma = vma->vm_end;
 619	unsigned long flags = qp->flags;
 620
 621	/*
 622	 * Need check MPOL_MF_STRICT to return -EIO if possible
 623	 * regardless of vma_migratable
 624	 */
 625	if (!vma_migratable(vma) &&
 626	    !(flags & MPOL_MF_STRICT))
 627		return 1;
 628
 629	if (endvma > end)
 630		endvma = end;
 631	if (vma->vm_start > start)
 632		start = vma->vm_start;
 633
 634	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 635		if (!vma->vm_next && vma->vm_end < end)
 636			return -EFAULT;
 637		if (qp->prev && qp->prev->vm_end < vma->vm_start)
 638			return -EFAULT;
 639	}
 640
 641	qp->prev = vma;
 642
 643	if (flags & MPOL_MF_LAZY) {
 644		/* Similar to task_numa_work, skip inaccessible VMAs */
 645		if (!is_vm_hugetlb_page(vma) &&
 646			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
 647			!(vma->vm_flags & VM_MIXEDMAP))
 648			change_prot_numa(vma, start, endvma);
 649		return 1;
 650	}
 651
 652	/* queue pages from current vma */
 653	if (flags & MPOL_MF_VALID)
 654		return 0;
 655	return 1;
 656}
 657
 658static const struct mm_walk_ops queue_pages_walk_ops = {
 659	.hugetlb_entry		= queue_pages_hugetlb,
 660	.pmd_entry		= queue_pages_pte_range,
 661	.test_walk		= queue_pages_test_walk,
 662};
 663
 664/*
 665 * Walk through page tables and collect pages to be migrated.
 666 *
 667 * If pages found in a given range are on a set of nodes (determined by
 668 * @nodes and @flags,) it's isolated and queued to the pagelist which is
 669 * passed via @private.
 670 *
 671 * queue_pages_range() has three possible return values:
 672 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 673 *     specified.
 674 * 0 - queue pages successfully or no misplaced page.
 675 * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified.
 676 */
 677static int
 678queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 679		nodemask_t *nodes, unsigned long flags,
 680		struct list_head *pagelist)
 681{
 682	struct queue_pages qp = {
 683		.pagelist = pagelist,
 684		.flags = flags,
 685		.nmask = nodes,
 686		.prev = NULL,
 687	};
 688
 689	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 690}
 691
 692/*
 693 * Apply policy to a single VMA
 694 * This must be called with the mmap_sem held for writing.
 695 */
 696static int vma_replace_policy(struct vm_area_struct *vma,
 697						struct mempolicy *pol)
 698{
 699	int err;
 700	struct mempolicy *old;
 701	struct mempolicy *new;
 702
 703	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 704		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 705		 vma->vm_ops, vma->vm_file,
 706		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 707
 708	new = mpol_dup(pol);
 709	if (IS_ERR(new))
 710		return PTR_ERR(new);
 711
 712	if (vma->vm_ops && vma->vm_ops->set_policy) {
 713		err = vma->vm_ops->set_policy(vma, new);
 714		if (err)
 715			goto err_out;
 716	}
 717
 718	old = vma->vm_policy;
 719	vma->vm_policy = new; /* protected by mmap_sem */
 720	mpol_put(old);
 721
 722	return 0;
 723 err_out:
 724	mpol_put(new);
 725	return err;
 726}
 727
/*
 * Step 2: apply policy to a range and do splits.
 *
 * For every vma overlapping [@start, @end) whose policy differs from
 * @new_pol, first try to merge the affected part with its neighbours via
 * vma_merge(); if that is not possible, split the vma at the range
 * boundaries and then replace its policy.
 *
 * Returns 0 on success, -EFAULT if @start is not inside a vma, or the error
 * from split_vma()/vma_replace_policy().
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	/*
	 * When the range begins inside the first vma, that vma itself is
	 * passed as the predecessor to vma_merge().
	 */
	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		/* Saved up front: merging/splitting below may change vma. */
		next = vma->vm_next;
		/* Portion of this vma that lies inside the requested range. */
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		/* Page offset corresponding to vmstart within this vma. */
		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
		if (prev) {
			/* Merge succeeded: continue from the merged vma. */
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		/* No merge possible: carve [vmstart, vmend) out of the vma. */
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}
 788
 789/* Set the process memory policy */
 790static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 791			     nodemask_t *nodes)
 792{
 793	struct mempolicy *new, *old;
 794	NODEMASK_SCRATCH(scratch);
 795	int ret;
 796
 797	if (!scratch)
 798		return -ENOMEM;
 799
 800	new = mpol_new(mode, flags, nodes);
 801	if (IS_ERR(new)) {
 802		ret = PTR_ERR(new);
 803		goto out;
 804	}
 805
 806	task_lock(current);
 807	ret = mpol_set_nodemask(new, nodes, scratch);
 808	if (ret) {
 809		task_unlock(current);
 810		mpol_put(new);
 811		goto out;
 812	}
 813	old = current->mempolicy;
 814	current->mempolicy = new;
 815	if (new && new->mode == MPOL_INTERLEAVE)
 816		current->il_prev = MAX_NUMNODES-1;
 817	task_unlock(current);
 818	mpol_put(old);
 819	ret = 0;
 820out:
 821	NODEMASK_SCRATCH_FREE(scratch);
 822	return ret;
 823}
 824
 825/*
 826 * Return nodemask for policy for get_mempolicy() query
 827 *
 828 * Called with task's alloc_lock held
 829 */
 830static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 831{
 832	nodes_clear(*nodes);
 833	if (p == &default_policy)
 834		return;
 835
 836	switch (p->mode) {
 837	case MPOL_BIND:
 838		/* Fall through */
 839	case MPOL_INTERLEAVE:
 840		*nodes = p->v.nodes;
 841		break;
 842	case MPOL_PREFERRED:
 843		if (!(p->flags & MPOL_F_LOCAL))
 844			node_set(p->v.preferred_node, *nodes);
 845		/* else return empty node mask for local allocation */
 846		break;
 847	default:
 848		BUG();
 849	}
 850}
 851
 852static int lookup_node(struct mm_struct *mm, unsigned long addr)
 853{
 854	struct page *p;
 855	int err;
 856
 857	int locked = 1;
 858	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
 859	if (err >= 0) {
 860		err = page_to_nid(p);
 861		put_page(p);
 862	}
 863	if (locked)
 864		up_read(&mm->mmap_sem);
 865	return err;
 866}
 867
/*
 * Retrieve NUMA policy
 *
 * @policy: receives the mode (ORed with the MPOL_MODE_FLAGS bits), or a
 *          node id when MPOL_F_NODE is given
 * @nmask:  if non-NULL, receives the nodemask of the policy (or the allowed
 *          mems for MPOL_F_MEMS_ALLOWED; in that case it is written
 *          unconditionally)
 * @addr:   address to look up with MPOL_F_ADDR; must be 0 otherwise
 * @flags:  combination of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for bad flag/argument combinations,
 * -EFAULT if no vma covers @addr, or the error from lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	/* MPOL_F_MEMS_ALLOWED is exclusive with the other two flags. */
	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_sem, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
			pol_refcount = pol;
			/* A NULL vma also tells "out:" not to up_read() again. */
			vma = NULL;
			mpol_get(pol);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			/* Report the next node after the interleave cursor. */
			*policy = next_node_in(current->il_prev, pol->v.nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	/* vma is non-NULL only while mmap_sem is still held by us. */
	if (vma)
		up_read(&mm->mmap_sem);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
 964
 965#ifdef CONFIG_MIGRATION
 966/*
 967 * page migration, thp tail pages can be passed.
 968 */
 969static int migrate_page_add(struct page *page, struct list_head *pagelist,
 970				unsigned long flags)
 971{
 972	struct page *head = compound_head(page);
 973	/*
 974	 * Avoid migrating a page that is shared with others.
 975	 */
 976	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
 977		if (!isolate_lru_page(head)) {
 978			list_add_tail(&head->lru, pagelist);
 979			mod_node_page_state(page_pgdat(head),
 980				NR_ISOLATED_ANON + page_is_file_cache(head),
 981				hpage_nr_pages(head));
 982		} else if (flags & MPOL_MF_STRICT) {
 983			/*
 984			 * Non-movable page may reach here.  And, there may be
 985			 * temporary off LRU pages or non-LRU movable pages.
 986			 * Treat them as unmovable pages since they can't be
 987			 * isolated, so they can't be moved at the moment.  It
 988			 * should return -EIO for this case too.
 989			 */
 990			return -EIO;
 991		}
 992	}
 993
 994	return 0;
 995}
 996
 997/* page allocation callback for NUMA node migration */
 998struct page *alloc_new_node_page(struct page *page, unsigned long node)
 999{
1000	if (PageHuge(page))
1001		return alloc_huge_page_node(page_hstate(compound_head(page)),
1002					node);
1003	else if (PageTransHuge(page)) {
1004		struct page *thp;
1005
1006		thp = alloc_pages_node(node,
1007			(GFP_TRANSHUGE | __GFP_THISNODE),
1008			HPAGE_PMD_ORDER);
1009		if (!thp)
1010			return NULL;
1011		prep_transhuge_page(thp);
1012		return thp;
1013	} else
1014		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1015						    __GFP_THISNODE, 0);
1016}
1017
1018/*
1019 * Migrate pages from one node to a target node.
1020 * Returns error or the number of pages not migrated.
1021 */
1022static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1023			   int flags)
1024{
1025	nodemask_t nmask;
1026	LIST_HEAD(pagelist);
1027	int err = 0;
1028
1029	nodes_clear(nmask);
1030	node_set(source, nmask);
1031
1032	/*
1033	 * This does not "check" the range but isolates all pages that
1034	 * need migration.  Between passing in the full user address
1035	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1036	 */
1037	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1038	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1039			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1040
1041	if (!list_empty(&pagelist)) {
1042		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1043					MIGRATE_SYNC, MR_SYSCALL);
1044		if (err)
1045			putback_movable_pages(&pagelist);
1046	}
1047
1048	return err;
1049}
1050
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * @mm:    address space to operate on; mmap_sem is taken for reading here
 * @from:  set of source nodes
 * @to:    set of destination nodes
 * @flags: passed through to migrate_to_node()
 *
 * Returns a negative error code if migrate_prep() or migrate_to_node()
 * failed, otherwise the accumulated number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		/* No candidate found in this scan: nothing left to migrate. */
		if (source == NUMA_NO_NODE)
			break;

		/* Retire the chosen source so the outer loop makes progress. */
		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		/* Positive values count pages left behind; negative ones abort. */
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}
1155
1156/*
1157 * Allocate a new page for page migration based on vma policy.
1158 * Start by assuming the page is mapped by the same vma as contains @start.
1159 * Search forward from there, if not.  N.B., this assumes that the
1160 * list of pages handed to migrate_pages()--which is how we get here--
1161 * is in virtual address order.
1162 */
1163static struct page *new_page(struct page *page, unsigned long start)
1164{
1165	struct vm_area_struct *vma;
1166	unsigned long uninitialized_var(address);
1167
1168	vma = find_vma(current->mm, start);
1169	while (vma) {
1170		address = page_address_in_vma(page, vma);
1171		if (address != -EFAULT)
1172			break;
1173		vma = vma->vm_next;
1174	}
1175
1176	if (PageHuge(page)) {
1177		return alloc_huge_page_vma(page_hstate(compound_head(page)),
1178				vma, address);
1179	} else if (PageTransHuge(page)) {
1180		struct page *thp;
1181
1182		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1183					 HPAGE_PMD_ORDER);
1184		if (!thp)
1185			return NULL;
1186		prep_transhuge_page(thp);
1187		return thp;
1188	}
1189	/*
1190	 * if !vma, alloc_page_vma() will use task or system default policy
1191	 */
1192	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1193			vma, address);
1194}
1195#else
1196
/*
 * Stub for the #else branch of a conditional whose #if lies outside this
 * view (presumably the no-migration configuration): pages can never be
 * queued, so always report -EIO.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return -EIO;
}
1202
/*
 * Stub counterpart of do_migrate_pages() for the #else configuration:
 * the operation is not available, so always return -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1208
/*
 * Stub counterpart of new_page() for the #else configuration: no
 * replacement page is ever provided.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
1213#endif
1214
1215static long do_mbind(unsigned long start, unsigned long len,
1216		     unsigned short mode, unsigned short mode_flags,
1217		     nodemask_t *nmask, unsigned long flags)
1218{
1219	struct mm_struct *mm = current->mm;
1220	struct mempolicy *new;
1221	unsigned long end;
1222	int err;
1223	int ret;
1224	LIST_HEAD(pagelist);
1225
1226	if (flags & ~(unsigned long)MPOL_MF_VALID)
1227		return -EINVAL;
1228	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1229		return -EPERM;
1230
1231	if (start & ~PAGE_MASK)
1232		return -EINVAL;
1233
1234	if (mode == MPOL_DEFAULT)
1235		flags &= ~MPOL_MF_STRICT;
1236
1237	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1238	end = start + len;
1239
1240	if (end < start)
1241		return -EINVAL;
1242	if (end == start)
1243		return 0;
1244
1245	new = mpol_new(mode, mode_flags, nmask);
1246	if (IS_ERR(new))
1247		return PTR_ERR(new);
1248
1249	if (flags & MPOL_MF_LAZY)
1250		new->flags |= MPOL_F_MOF;
1251
1252	/*
1253	 * If we are using the default policy then operation
1254	 * on discontinuous address spaces is okay after all
1255	 */
1256	if (!new)
1257		flags |= MPOL_MF_DISCONTIG_OK;
1258
1259	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1260		 start, start + len, mode, mode_flags,
1261		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1262
1263	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1264
1265		err = migrate_prep();
1266		if (err)
1267			goto mpol_out;
1268	}
1269	{
1270		NODEMASK_SCRATCH(scratch);
1271		if (scratch) {
1272			down_write(&mm->mmap_sem);
1273			task_lock(current);
1274			err = mpol_set_nodemask(new, nmask, scratch);
1275			task_unlock(current);
1276			if (err)
1277				up_write(&mm->mmap_sem);
1278		} else
1279			err = -ENOMEM;
1280		NODEMASK_SCRATCH_FREE(scratch);
1281	}
1282	if (err)
1283		goto mpol_out;
1284
1285	ret = queue_pages_range(mm, start, end, nmask,
1286			  flags | MPOL_MF_INVERT, &pagelist);
1287
1288	if (ret < 0) {
1289		err = -EIO;
1290		goto up_out;
1291	}
1292
1293	err = mbind_range(mm, start, end, new);
1294
1295	if (!err) {
1296		int nr_failed = 0;
1297
1298		if (!list_empty(&pagelist)) {
1299			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1300			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1301				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1302			if (nr_failed)
1303				putback_movable_pages(&pagelist);
1304		}
1305
1306		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1307			err = -EIO;
1308	} else
1309		putback_movable_pages(&pagelist);
1310
1311up_out:
1312	up_write(&mm->mmap_sem);
1313mpol_out:
1314	mpol_put(new);
1315	return err;
1316}
1317
1318/*
1319 * User space interface with variable sized bitmaps for nodelists.
1320 */
1321
/*
 * Copy a node mask from user space.
 *
 * @nodes:   kernel nodemask to fill; always cleared first
 * @nmask:   user bitmap, may be NULL
 * @maxnode: user supplied bit count; it is decremented once before use
 *
 * Returns 0 on success (including the "no mask" cases, which leave @nodes
 * empty), -EINVAL if the mask is too large or has bits set at or beyond
 * MAX_NUMNODES, and -EFAULT if user memory cannot be read.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long t;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	/* Also catches maxnode == 0 on entry, which wrapped around above. */
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/* endmask selects the valid bits of the last user supplied long. */
	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero.
	 *
	 * If maxnode have more longs than MAX_NUMNODES, check
	 * the bits in that area first. And then go through to
	 * check the rest bits which equal or bigger than MAX_NUMNODES.
	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
	 */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		/* From here on only the longs that fit a nodemask_t matter. */
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	/*
	 * MAX_NUMNODES is not a multiple of BITS_PER_LONG: the last kept long
	 * also holds unsupported bits, which must be zero as well.
	 */
	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
		unsigned long valid_mask = endmask;

		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		if (get_user(t, nmask + nlongs - 1))
			return -EFAULT;
		if (t & valid_mask)
			return -EINVAL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	/* Drop any bits beyond maxnode that came along in the last long. */
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
1382
1383/* Copy a kernel node mask to user space */
1384static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1385			      nodemask_t *nodes)
1386{
1387	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1388	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1389
1390	if (copy > nbytes) {
1391		if (copy > PAGE_SIZE)
1392			return -EINVAL;
1393		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1394			return -EFAULT;
1395		copy = nbytes;
1396	}
1397	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1398}
1399
1400static long kernel_mbind(unsigned long start, unsigned long len,
1401			 unsigned long mode, const unsigned long __user *nmask,
1402			 unsigned long maxnode, unsigned int flags)
1403{
1404	nodemask_t nodes;
1405	int err;
1406	unsigned short mode_flags;
1407
1408	start = untagged_addr(start);
1409	mode_flags = mode & MPOL_MODE_FLAGS;
1410	mode &= ~MPOL_MODE_FLAGS;
1411	if (mode >= MPOL_MAX)
1412		return -EINVAL;
1413	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1414	    (mode_flags & MPOL_F_RELATIVE_NODES))
1415		return -EINVAL;
1416	err = get_nodes(&nodes, nmask, maxnode);
1417	if (err)
1418		return err;
1419	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1420}
1421
/* mbind() system call entry point; all work is done by kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1428
1429/* Set the process memory policy */
1430static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1431				 unsigned long maxnode)
1432{
1433	int err;
1434	nodemask_t nodes;
1435	unsigned short flags;
1436
1437	flags = mode & MPOL_MODE_FLAGS;
1438	mode &= ~MPOL_MODE_FLAGS;
1439	if ((unsigned int)mode >= MPOL_MAX)
1440		return -EINVAL;
1441	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1442		return -EINVAL;
1443	err = get_nodes(&nodes, nmask, maxnode);
1444	if (err)
1445		return err;
1446	return do_set_mempolicy(mode, flags, &nodes);
1447}
1448
/* set_mempolicy() system call entry point; see kernel_set_mempolicy(). */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1454
/*
 * Common back end of the migrate_pages() system call.
 *
 * @pid:       target process, 0 meaning the caller itself
 * @maxnode:   bit count of both user nodemasks
 * @old_nodes: user mask of nodes to migrate from
 * @new_nodes: user mask of nodes to migrate to
 *
 * Returns the result of do_migrate_pages() or a negative error code
 * (-ENOMEM, -ESRCH, -EPERM, -EINVAL, or whatever get_nodes() and
 * security_task_movememory() report).
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	/* Both nodemasks live in the scratch area rather than on the stack. */
	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	/* Pin the task before leaving the RCU read side section below. */
	get_task_struct(task);

	/* Default error for the paths below that do not set one explicitly. */
	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	/*
	 * Restrict the destination to nodes the caller itself may use; an
	 * empty result leaves with the -EINVAL set above.
	 */
	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	/* From here on only the mm reference is needed, not the task. */
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

	/* Error exit for paths that still hold the task reference. */
out_put:
	put_task_struct(task);
	goto out;

}
1542
/* migrate_pages() system call entry point; see kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1549
1550
1551/* Retrieve NUMA policy */
1552static int kernel_get_mempolicy(int __user *policy,
1553				unsigned long __user *nmask,
1554				unsigned long maxnode,
1555				unsigned long addr,
1556				unsigned long flags)
1557{
1558	int err;
1559	int uninitialized_var(pval);
1560	nodemask_t nodes;
1561
1562	addr = untagged_addr(addr);
1563
1564	if (nmask != NULL && maxnode < nr_node_ids)
1565		return -EINVAL;
1566
1567	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1568
1569	if (err)
1570		return err;
1571
1572	if (policy && put_user(pval, policy))
1573		return -EFAULT;
1574
1575	if (nmask)
1576		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1577
1578	return err;
1579}
1580
/* get_mempolicy() system call entry point; see kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1587
1588#ifdef CONFIG_COMPAT
1589
/*
 * Compat entry point for get_mempolicy().
 *
 * The native implementation is pointed at a temporary user-space buffer of
 * native longs; on success that buffer is read back into a kernel bitmap and
 * written to the caller's @nmask with compat_put_bitmap().
 */
COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		       compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode,
		       compat_ulong_t, addr, compat_ulong_t, flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	/* Never ask the native code for more than nr_node_ids bits. */
	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		unsigned long copy_size;
		/* Do not read more than the on-stack bitmap can hold. */
		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
		err = copy_from_user(bm, nm, copy_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}
1619
1620COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1621		       compat_ulong_t, maxnode)
1622{
1623	unsigned long __user *nm = NULL;
1624	unsigned long nr_bits, alloc_size;
1625	DECLARE_BITMAP(bm, MAX_NUMNODES);
1626
1627	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1628	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1629
1630	if (nmask) {
1631		if (compat_get_bitmap(bm, nmask, nr_bits))
1632			return -EFAULT;
1633		nm = compat_alloc_user_space(alloc_size);
1634		if (copy_to_user(nm, bm, alloc_size))
1635			return -EFAULT;
1636	}
1637
1638	return kernel_set_mempolicy(mode, nm, nr_bits+1);
1639}
1640
1641COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1642		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1643		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1644{
1645	unsigned long __user *nm = NULL;
1646	unsigned long nr_bits, alloc_size;
1647	nodemask_t bm;
1648
1649	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1650	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1651
1652	if (nmask) {
1653		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1654			return -EFAULT;
1655		nm = compat_alloc_user_space(alloc_size);
1656		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1657			return -EFAULT;
1658	}
1659
1660	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1661}
1662
/*
 * Compat entry point for migrate_pages().
 *
 * Each supplied compat bitmap is converted through tmp_mask into native
 * longs in user space before kernel_migrate_pages() is called.  When both
 * masks are given they share one allocation, with the new mask placed
 * directly behind the old one.
 */
COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
		       compat_ulong_t, maxnode,
		       const compat_ulong_t __user *, old_nodes,
		       const compat_ulong_t __user *, new_nodes)
{
	unsigned long __user *old = NULL;
	unsigned long __user *new = NULL;
	nodemask_t tmp_mask;
	unsigned long nr_bits;
	unsigned long size;

	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
	if (old_nodes) {
		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
			return -EFAULT;
		/* Reserve room for both masks at once if a new mask follows. */
		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
		if (new_nodes)
			new = old + size / sizeof(unsigned long);
		if (copy_to_user(old, nodes_addr(tmp_mask), size))
			return -EFAULT;
	}
	if (new_nodes) {
		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
			return -EFAULT;
		/* No old mask was given, so nothing has been allocated yet. */
		if (new == NULL)
			new = compat_alloc_user_space(size);
		if (copy_to_user(new, nodes_addr(tmp_mask), size))
			return -EFAULT;
	}
	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
}
1695
1696#endif /* CONFIG_COMPAT */
1697
1698struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1699						unsigned long addr)
1700{
1701	struct mempolicy *pol = NULL;
1702
1703	if (vma) {
1704		if (vma->vm_ops && vma->vm_ops->get_policy) {
1705			pol = vma->vm_ops->get_policy(vma, addr);
1706		} else if (vma->vm_policy) {
1707			pol = vma->vm_policy;
1708
1709			/*
1710			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1711			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1712			 * count on these policies which will be dropped by
1713			 * mpol_cond_put() later
1714			 */
1715			if (mpol_needs_cond_ref(pol))
1716				mpol_get(pol);
1717		}
1718	}
1719
1720	return pol;
1721}
1722
1723/*
1724 * get_vma_policy(@vma, @addr)
1725 * @vma: virtual memory area whose policy is sought
1726 * @addr: address in @vma for shared policy lookup
1727 *
1728 * Returns effective policy for a VMA at specified address.
1729 * Falls back to current->mempolicy or system default policy, as necessary.
1730 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1731 * count--added by the get_policy() vm_op, as appropriate--to protect against
1732 * freeing by another task.  It is the caller's responsibility to free the
1733 * extra reference for shared policies.
1734 */
1735static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1736						unsigned long addr)
1737{
1738	struct mempolicy *pol = __get_vma_policy(vma, addr);
1739
1740	if (!pol)
1741		pol = get_task_policy(current);
1742
1743	return pol;
1744}
1745
1746bool vma_policy_mof(struct vm_area_struct *vma)
1747{
1748	struct mempolicy *pol;
1749
1750	if (vma->vm_ops && vma->vm_ops->get_policy) {
1751		bool ret = false;
1752
1753		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1754		if (pol && (pol->flags & MPOL_F_MOF))
1755			ret = true;
1756		mpol_cond_put(pol);
1757
1758		return ret;
1759	}
1760
1761	pol = vma->vm_policy;
1762	if (!pol)
1763		pol = get_task_policy(current);
1764
1765	return pol->flags & MPOL_F_MOF;
1766}
1767
1768static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1769{
1770	enum zone_type dynamic_policy_zone = policy_zone;
1771
1772	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1773
1774	/*
1775	 * if policy->v.nodes has movable memory only,
1776	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1777	 *
1778	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1779	 * so if the following test faile, it implies
1780	 * policy->v.nodes has movable memory only.
1781	 */
1782	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1783		dynamic_policy_zone = ZONE_MOVABLE;
1784
1785	return zone >= dynamic_policy_zone;
1786}
1787
1788/*
1789 * Return a nodemask representing a mempolicy for filtering nodes for
1790 * page allocation
1791 */
1792static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1793{
1794	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1795	if (unlikely(policy->mode == MPOL_BIND) &&
1796			apply_policy_zone(policy, gfp_zone(gfp)) &&
1797			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1798		return &policy->v.nodes;
1799
1800	return NULL;
1801}
1802
1803/* Return the node id preferred by the given mempolicy, or the given id */
1804static int policy_node(gfp_t gfp, struct mempolicy *policy,
1805								int nd)
1806{
1807	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1808		nd = policy->v.preferred_node;
1809	else {
1810		/*
1811		 * __GFP_THISNODE shouldn't even be used with the bind policy
1812		 * because we might easily break the expectation to stay on the
1813		 * requested node and not break the policy.
1814		 */
1815		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1816	}
1817
1818	return nd;
1819}
1820
1821/* Do dynamic interleaving for a process */
1822static unsigned interleave_nodes(struct mempolicy *policy)
1823{
1824	unsigned next;
1825	struct task_struct *me = current;
1826
1827	next = next_node_in(me->il_prev, policy->v.nodes);
1828	if (next < MAX_NUMNODES)
1829		me->il_prev = next;
1830	return next;
1831}
1832
1833/*
1834 * Depending on the memory policy provide a node from which to allocate the
1835 * next slab entry.
1836 */
1837unsigned int mempolicy_slab_node(void)
1838{
1839	struct mempolicy *policy;
1840	int node = numa_mem_id();
1841
1842	if (in_interrupt())
1843		return node;
1844
1845	policy = current->mempolicy;
1846	if (!policy || policy->flags & MPOL_F_LOCAL)
1847		return node;
1848
1849	switch (policy->mode) {
1850	case MPOL_PREFERRED:
1851		/*
1852		 * handled MPOL_F_LOCAL above
1853		 */
1854		return policy->v.preferred_node;
1855
1856	case MPOL_INTERLEAVE:
1857		return interleave_nodes(policy);
1858
1859	case MPOL_BIND: {
1860		struct zoneref *z;
1861
1862		/*
1863		 * Follow bind policy behavior and start allocation at the
1864		 * first node.
1865		 */
1866		struct zonelist *zonelist;
1867		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1868		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1869		z = first_zones_zonelist(zonelist, highest_zoneidx,
1870							&policy->v.nodes);
1871		return z->zone ? zone_to_nid(z->zone) : node;
1872	}
1873
1874	default:
1875		BUG();
1876	}
1877}
1878
1879/*
1880 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1881 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1882 * number of present nodes.
1883 */
1884static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1885{
1886	unsigned nnodes = nodes_weight(pol->v.nodes);
1887	unsigned target;
1888	int i;
1889	int nid;
1890
1891	if (!nnodes)
1892		return numa_node_id();
1893	target = (unsigned int)n % nnodes;
1894	nid = first_node(pol->v.nodes);
1895	for (i = 0; i < target; i++)
1896		nid = next_node(nid, pol->v.nodes);
1897	return nid;
1898}
1899
1900/* Determine a node number for interleave */
1901static inline unsigned interleave_nid(struct mempolicy *pol,
1902		 struct vm_area_struct *vma, unsigned long addr, int shift)
1903{
1904	if (vma) {
1905		unsigned long off;
1906
1907		/*
1908		 * for small pages, there is no difference between
1909		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1910		 * for huge pages, since vm_pgoff is in units of small
1911		 * pages, we need to shift off the always 0 bits to get
1912		 * a useful offset.
1913		 */
1914		BUG_ON(shift < PAGE_SHIFT);
1915		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1916		off += (addr - vma->vm_start) >> shift;
1917		return offset_il_node(pol, off);
1918	} else
1919		return interleave_nodes(pol);
1920}
1921
1922#ifdef CONFIG_HUGETLBFS
1923/*
1924 * huge_node(@vma, @addr, @gfp_flags, @mpol)
1925 * @vma: virtual memory area whose policy is sought
1926 * @addr: address in @vma for shared policy lookup and interleave policy
1927 * @gfp_flags: for requested zone
1928 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1929 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1930 *
1931 * Returns a nid suitable for a huge page allocation and a pointer
1932 * to the struct mempolicy for conditional unref after allocation.
1933 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1934 * @nodemask for filtering the zonelist.
1935 *
1936 * Must be protected by read_mems_allowed_begin()
1937 */
1938int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1939				struct mempolicy **mpol, nodemask_t **nodemask)
1940{
1941	int nid;
1942
1943	*mpol = get_vma_policy(vma, addr);
1944	*nodemask = NULL;	/* assume !MPOL_BIND */
1945
1946	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1947		nid = interleave_nid(*mpol, vma, addr,
1948					huge_page_shift(hstate_vma(vma)));
1949	} else {
1950		nid = policy_node(gfp_flags, *mpol, numa_node_id());
1951		if ((*mpol)->mode == MPOL_BIND)
1952			*nodemask = &(*mpol)->v.nodes;
1953	}
1954	return nid;
1955}
1956
1957/*
1958 * init_nodemask_of_mempolicy
1959 *
1960 * If the current task's mempolicy is "default" [NULL], return 'false'
1961 * to indicate default policy.  Otherwise, extract the policy nodemask
1962 * for 'bind' or 'interleave' policy into the argument nodemask, or
1963 * initialize the argument nodemask to contain the single node for
1964 * 'preferred' or 'local' policy and return 'true' to indicate presence
1965 * of non-default mempolicy.
1966 *
1967 * We don't bother with reference counting the mempolicy [mpol_get/put]
1968 * because the current task is examining it's own mempolicy and a task's
1969 * mempolicy is only ever changed by the task itself.
1970 *
1971 * N.B., it is the caller's responsibility to free a returned nodemask.
1972 */
1973bool init_nodemask_of_mempolicy(nodemask_t *mask)
1974{
1975	struct mempolicy *mempolicy;
1976	int nid;
1977
1978	if (!(mask && current->mempolicy))
1979		return false;
1980
1981	task_lock(current);
1982	mempolicy = current->mempolicy;
1983	switch (mempolicy->mode) {
1984	case MPOL_PREFERRED:
1985		if (mempolicy->flags & MPOL_F_LOCAL)
1986			nid = numa_node_id();
1987		else
1988			nid = mempolicy->v.preferred_node;
1989		init_nodemask_of_node(mask, nid);
1990		break;
1991
1992	case MPOL_BIND:
1993		/* Fall through */
1994	case MPOL_INTERLEAVE:
1995		*mask =  mempolicy->v.nodes;
1996		break;
1997
1998	default:
1999		BUG();
2000	}
2001	task_unlock(current);
2002
2003	return true;
2004}
2005#endif
2006
2007/*
2008 * mempolicy_nodemask_intersects
2009 *
2010 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2011 * policy.  Otherwise, check for intersection between mask and the policy
2012 * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
2013 * policy, always return true since it may allocate elsewhere on fallback.
2014 *
2015 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2016 */
2017bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2018					const nodemask_t *mask)
2019{
2020	struct mempolicy *mempolicy;
2021	bool ret = true;
2022
2023	if (!mask)
2024		return ret;
2025	task_lock(tsk);
2026	mempolicy = tsk->mempolicy;
2027	if (!mempolicy)
2028		goto out;
2029
2030	switch (mempolicy->mode) {
2031	case MPOL_PREFERRED:
2032		/*
2033		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2034		 * allocate from, they may fallback to other nodes when oom.
2035		 * Thus, it's possible for tsk to have allocated memory from
2036		 * nodes in mask.
2037		 */
2038		break;
2039	case MPOL_BIND:
2040	case MPOL_INTERLEAVE:
2041		ret = nodes_intersects(mempolicy->v.nodes, *mask);
2042		break;
2043	default:
2044		BUG();
2045	}
2046out:
2047	task_unlock(tsk);
2048	return ret;
2049}
2050
2051/* Allocate a page in interleaved policy.
2052   Own path because it needs to do special accounting. */
2053static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2054					unsigned nid)
2055{
2056	struct page *page;
2057
2058	page = __alloc_pages(gfp, order, nid);
2059	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2060	if (!static_branch_likely(&vm_numa_stat_key))
2061		return page;
2062	if (page && page_to_nid(page) == nid) {
2063		preempt_disable();
2064		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2065		preempt_enable();
2066	}
2067	return page;
2068}
2069
2070/**
2071 * 	alloc_pages_vma	- Allocate a page for a VMA.
2072 *
2073 * 	@gfp:
2074 *      %GFP_USER    user allocation.
2075 *      %GFP_KERNEL  kernel allocations,
2076 *      %GFP_HIGHMEM highmem/user allocations,
2077 *      %GFP_FS      allocation should not call back into a file system.
2078 *      %GFP_ATOMIC  don't sleep.
2079 *
2080 *	@order:Order of the GFP allocation.
2081 * 	@vma:  Pointer to VMA or NULL if not available.
2082 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
2083 *	@node: Which node to prefer for allocation (modulo policy).
2084 *	@hugepage: for hugepages try only the preferred node if possible
2085 *
2086 * 	This function allocates a page from the kernel page pool and applies
2087 *	a NUMA policy associated with the VMA or the current process.
2088 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
2089 *	mm_struct of the VMA to prevent it from going away. Should be used for
2090 *	all allocations for pages that will be mapped into user space. Returns
2091 *	NULL when no page can be allocated.
2092 */
2093struct page *
2094alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2095		unsigned long addr, int node, bool hugepage)
2096{
2097	struct mempolicy *pol;
2098	struct page *page;
2099	int preferred_nid;
2100	nodemask_t *nmask;
2101
2102	pol = get_vma_policy(vma, addr);
2103
2104	if (pol->mode == MPOL_INTERLEAVE) {
2105		unsigned nid;
2106
2107		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2108		mpol_cond_put(pol);
2109		page = alloc_page_interleave(gfp, order, nid);
2110		goto out;
2111	}
2112
2113	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2114		int hpage_node = node;
2115
2116		/*
2117		 * For hugepage allocation and non-interleave policy which
2118		 * allows the current node (or other explicitly preferred
2119		 * node) we only try to allocate from the current/preferred
2120		 * node and don't fall back to other nodes, as the cost of
2121		 * remote accesses would likely offset THP benefits.
2122		 *
2123		 * If the policy is interleave, or does not allow the current
2124		 * node in its nodemask, we allocate the standard way.
2125		 */
2126		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2127			hpage_node = pol->v.preferred_node;
2128
2129		nmask = policy_nodemask(gfp, pol);
2130		if (!nmask || node_isset(hpage_node, *nmask)) {
2131			mpol_cond_put(pol);
2132			page = __alloc_pages_node(hpage_node,
2133						gfp | __GFP_THISNODE, order);
2134
2135			/*
2136			 * If hugepage allocations are configured to always
2137			 * synchronous compact or the vma has been madvised
2138			 * to prefer hugepage backing, retry allowing remote
2139			 * memory as well.
2140			 */
2141			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2142				page = __alloc_pages_node(hpage_node,
2143						gfp | __GFP_NORETRY, order);
2144
2145			goto out;
2146		}
2147	}
2148
2149	nmask = policy_nodemask(gfp, pol);
2150	preferred_nid = policy_node(gfp, pol, node);
2151	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2152	mpol_cond_put(pol);
2153out:
2154	return page;
2155}
2156EXPORT_SYMBOL(alloc_pages_vma);
2157
2158/**
2159 * 	alloc_pages_current - Allocate pages.
2160 *
2161 *	@gfp:
2162 *		%GFP_USER   user allocation,
2163 *      	%GFP_KERNEL kernel allocation,
2164 *      	%GFP_HIGHMEM highmem allocation,
2165 *      	%GFP_FS     don't call back into a file system.
2166 *      	%GFP_ATOMIC don't sleep.
2167 *	@order: Power of two of allocation size in pages. 0 is a single page.
2168 *
2169 *	Allocate a page from the kernel page pool.  When not in
2170 *	interrupt context and apply the current process NUMA policy.
2171 *	Returns NULL when no page can be allocated.
2172 */
2173struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2174{
2175	struct mempolicy *pol = &default_policy;
2176	struct page *page;
2177
2178	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2179		pol = get_task_policy(current);
2180
2181	/*
2182	 * No reference counting needed for current->mempolicy
2183	 * nor system default_policy
2184	 */
2185	if (pol->mode == MPOL_INTERLEAVE)
2186		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2187	else
2188		page = __alloc_pages_nodemask(gfp, order,
2189				policy_node(gfp, pol, numa_node_id()),
2190				policy_nodemask(gfp, pol));
2191
2192	return page;
2193}
2194EXPORT_SYMBOL(alloc_pages_current);
2195
2196int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2197{
2198	struct mempolicy *pol = mpol_dup(vma_policy(src));
2199
2200	if (IS_ERR(pol))
2201		return PTR_ERR(pol);
2202	dst->vm_policy = pol;
2203	return 0;
2204}
2205
2206/*
2207 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2208 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2209 * with the mems_allowed returned by cpuset_mems_allowed().  This
2210 * keeps mempolicies cpuset relative after its cpuset moves.  See
2211 * further kernel/cpuset.c update_nodemask().
2212 *
2213 * current's mempolicy may be rebinded by the other task(the task that changes
2214 * cpuset's mems), so we needn't do rebind work for current task.
2215 */
2216
2217/* Slow path of a mempolicy duplicate */
2218struct mempolicy *__mpol_dup(struct mempolicy *old)
2219{
2220	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2221
2222	if (!new)
2223		return ERR_PTR(-ENOMEM);
2224
2225	/* task's mempolicy is protected by alloc_lock */
2226	if (old == current->mempolicy) {
2227		task_lock(current);
2228		*new = *old;
2229		task_unlock(current);
2230	} else
2231		*new = *old;
2232
2233	if (current_cpuset_is_being_rebound()) {
2234		nodemask_t mems = cpuset_mems_allowed(current);
2235		mpol_rebind_policy(new, &mems);
2236	}
2237	atomic_set(&new->refcnt, 1);
2238	return new;
2239}
2240
2241/* Slow path of a mempolicy comparison */
2242bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2243{
2244	if (!a || !b)
2245		return false;
2246	if (a->mode != b->mode)
2247		return false;
2248	if (a->flags != b->flags)
2249		return false;
2250	if (mpol_store_user_nodemask(a))
2251		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2252			return false;
2253
2254	switch (a->mode) {
2255	case MPOL_BIND:
2256		/* Fall through */
2257	case MPOL_INTERLEAVE:
2258		return !!nodes_equal(a->v.nodes, b->v.nodes);
2259	case MPOL_PREFERRED:
2260		/* a's ->flags is the same as b's */
2261		if (a->flags & MPOL_F_LOCAL)
2262			return true;
2263		return a->v.preferred_node == b->v.preferred_node;
2264	default:
2265		BUG();
2266		return false;
2267	}
2268}
2269
2270/*
2271 * Shared memory backing store policy support.
2272 *
2273 * Remember policies even when nobody has shared memory mapped.
2274 * The policies are kept in Red-Black tree linked from the inode.
2275 * They are protected by the sp->lock rwlock, which should be held
2276 * for any accesses to the tree.
2277 */
2278
2279/*
2280 * lookup first element intersecting start-end.  Caller holds sp->lock for
2281 * reading or for writing
2282 */
2283static struct sp_node *
2284sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2285{
2286	struct rb_node *n = sp->root.rb_node;
2287
2288	while (n) {
2289		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2290
2291		if (start >= p->end)
2292			n = n->rb_right;
2293		else if (end <= p->start)
2294			n = n->rb_left;
2295		else
2296			break;
2297	}
2298	if (!n)
2299		return NULL;
2300	for (;;) {
2301		struct sp_node *w = NULL;
2302		struct rb_node *prev = rb_prev(n);
2303		if (!prev)
2304			break;
2305		w = rb_entry(prev, struct sp_node, nd);
2306		if (w->end <= start)
2307			break;
2308		n = prev;
2309	}
2310	return rb_entry(n, struct sp_node, nd);
2311}
2312
2313/*
2314 * Insert a new shared policy into the list.  Caller holds sp->lock for
2315 * writing.
2316 */
2317static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2318{
2319	struct rb_node **p = &sp->root.rb_node;
2320	struct rb_node *parent = NULL;
2321	struct sp_node *nd;
2322
2323	while (*p) {
2324		parent = *p;
2325		nd = rb_entry(parent, struct sp_node, nd);
2326		if (new->start < nd->start)
2327			p = &(*p)->rb_left;
2328		else if (new->end > nd->end)
2329			p = &(*p)->rb_right;
2330		else
2331			BUG();
2332	}
2333	rb_link_node(&new->nd, parent, p);
2334	rb_insert_color(&new->nd, &sp->root);
2335	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2336		 new->policy ? new->policy->mode : 0);
2337}
2338
2339/* Find shared policy intersecting idx */
2340struct mempolicy *
2341mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2342{
2343	struct mempolicy *pol = NULL;
2344	struct sp_node *sn;
2345
2346	if (!sp->root.rb_node)
2347		return NULL;
2348	read_lock(&sp->lock);
2349	sn = sp_lookup(sp, idx, idx+1);
2350	if (sn) {
2351		mpol_get(sn->policy);
2352		pol = sn->policy;
2353	}
2354	read_unlock(&sp->lock);
2355	return pol;
2356}
2357
2358static void sp_free(struct sp_node *n)
2359{
2360	mpol_put(n->policy);
2361	kmem_cache_free(sn_cache, n);
2362}
2363
2364/**
2365 * mpol_misplaced - check whether current page node is valid in policy
2366 *
2367 * @page: page to be checked
2368 * @vma: vm area where page mapped
2369 * @addr: virtual address where page mapped
2370 *
2371 * Lookup current policy node id for vma,addr and "compare to" page's
2372 * node id.
2373 *
2374 * Returns:
2375 *	-1	- not misplaced, page is in the right node
2376 *	node	- node id where the page should be
2377 *
2378 * Policy determination "mimics" alloc_page_vma().
2379 * Called from fault path where we know the vma and faulting address.
2380 */
2381int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2382{
2383	struct mempolicy *pol;
2384	struct zoneref *z;
2385	int curnid = page_to_nid(page);
2386	unsigned long pgoff;
2387	int thiscpu = raw_smp_processor_id();
2388	int thisnid = cpu_to_node(thiscpu);
2389	int polnid = NUMA_NO_NODE;
2390	int ret = -1;
2391
2392	pol = get_vma_policy(vma, addr);
2393	if (!(pol->flags & MPOL_F_MOF))
2394		goto out;
2395
2396	switch (pol->mode) {
2397	case MPOL_INTERLEAVE:
2398		pgoff = vma->vm_pgoff;
2399		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2400		polnid = offset_il_node(pol, pgoff);
2401		break;
2402
2403	case MPOL_PREFERRED:
2404		if (pol->flags & MPOL_F_LOCAL)
2405			polnid = numa_node_id();
2406		else
2407			polnid = pol->v.preferred_node;
2408		break;
2409
2410	case MPOL_BIND:
2411
2412		/*
2413		 * allows binding to multiple nodes.
2414		 * use current page if in policy nodemask,
2415		 * else select nearest allowed node, if any.
2416		 * If no allowed nodes, use current [!misplaced].
2417		 */
2418		if (node_isset(curnid, pol->v.nodes))
2419			goto out;
2420		z = first_zones_zonelist(
2421				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2422				gfp_zone(GFP_HIGHUSER),
2423				&pol->v.nodes);
2424		polnid = zone_to_nid(z->zone);
2425		break;
2426
2427	default:
2428		BUG();
2429	}
2430
2431	/* Migrate the page towards the node whose CPU is referencing it */
2432	if (pol->flags & MPOL_F_MORON) {
2433		polnid = thisnid;
2434
2435		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2436			goto out;
2437	}
2438
2439	if (curnid != polnid)
2440		ret = polnid;
2441out:
2442	mpol_cond_put(pol);
2443
2444	return ret;
2445}
2446
2447/*
2448 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2449 * dropped after task->mempolicy is set to NULL so that any allocation done as
2450 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2451 * policy.
2452 */
2453void mpol_put_task_policy(struct task_struct *task)
2454{
2455	struct mempolicy *pol;
2456
2457	task_lock(task);
2458	pol = task->mempolicy;
2459	task->mempolicy = NULL;
2460	task_unlock(task);
2461	mpol_put(pol);
2462}
2463
2464static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2465{
2466	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2467	rb_erase(&n->nd, &sp->root);
2468	sp_free(n);
2469}
2470
2471static void sp_node_init(struct sp_node *node, unsigned long start,
2472			unsigned long end, struct mempolicy *pol)
2473{
2474	node->start = start;
2475	node->end = end;
2476	node->policy = pol;
2477}
2478
2479static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2480				struct mempolicy *pol)
2481{
2482	struct sp_node *n;
2483	struct mempolicy *newpol;
2484
2485	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2486	if (!n)
2487		return NULL;
2488
2489	newpol = mpol_dup(pol);
2490	if (IS_ERR(newpol)) {
2491		kmem_cache_free(sn_cache, n);
2492		return NULL;
2493	}
2494	newpol->flags |= MPOL_F_SHARED;
2495	sp_node_init(n, start, end, newpol);
2496
2497	return n;
2498}
2499
2500/* Replace a policy range. */
2501static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2502				 unsigned long end, struct sp_node *new)
2503{
2504	struct sp_node *n;
2505	struct sp_node *n_new = NULL;
2506	struct mempolicy *mpol_new = NULL;
2507	int ret = 0;
2508
2509restart:
2510	write_lock(&sp->lock);
2511	n = sp_lookup(sp, start, end);
2512	/* Take care of old policies in the same range. */
2513	while (n && n->start < end) {
2514		struct rb_node *next = rb_next(&n->nd);
2515		if (n->start >= start) {
2516			if (n->end <= end)
2517				sp_delete(sp, n);
2518			else
2519				n->start = end;
2520		} else {
2521			/* Old policy spanning whole new range. */
2522			if (n->end > end) {
2523				if (!n_new)
2524					goto alloc_new;
2525
2526				*mpol_new = *n->policy;
2527				atomic_set(&mpol_new->refcnt, 1);
2528				sp_node_init(n_new, end, n->end, mpol_new);
2529				n->end = start;
2530				sp_insert(sp, n_new);
2531				n_new = NULL;
2532				mpol_new = NULL;
2533				break;
2534			} else
2535				n->end = start;
2536		}
2537		if (!next)
2538			break;
2539		n = rb_entry(next, struct sp_node, nd);
2540	}
2541	if (new)
2542		sp_insert(sp, new);
2543	write_unlock(&sp->lock);
2544	ret = 0;
2545
2546err_out:
2547	if (mpol_new)
2548		mpol_put(mpol_new);
2549	if (n_new)
2550		kmem_cache_free(sn_cache, n_new);
2551
2552	return ret;
2553
2554alloc_new:
2555	write_unlock(&sp->lock);
2556	ret = -ENOMEM;
2557	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2558	if (!n_new)
2559		goto err_out;
2560	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2561	if (!mpol_new)
2562		goto err_out;
2563	goto restart;
2564}
2565
2566/**
2567 * mpol_shared_policy_init - initialize shared policy for inode
2568 * @sp: pointer to inode shared policy
2569 * @mpol:  struct mempolicy to install
2570 *
2571 * Install non-NULL @mpol in inode's shared policy rb-tree.
2572 * On entry, the current task has a reference on a non-NULL @mpol.
2573 * This must be released on exit.
2574 * This is called at get_inode() calls and we can use GFP_KERNEL.
2575 */
2576void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2577{
2578	int ret;
2579
2580	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2581	rwlock_init(&sp->lock);
2582
2583	if (mpol) {
2584		struct vm_area_struct pvma;
2585		struct mempolicy *new;
2586		NODEMASK_SCRATCH(scratch);
2587
2588		if (!scratch)
2589			goto put_mpol;
2590		/* contextualize the tmpfs mount point mempolicy */
2591		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2592		if (IS_ERR(new))
2593			goto free_scratch; /* no valid nodemask intersection */
2594
2595		task_lock(current);
2596		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2597		task_unlock(current);
2598		if (ret)
2599			goto put_new;
2600
2601		/* Create pseudo-vma that contains just the policy */
2602		vma_init(&pvma, NULL);
2603		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2604		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2605
2606put_new:
2607		mpol_put(new);			/* drop initial ref */
2608free_scratch:
2609		NODEMASK_SCRATCH_FREE(scratch);
2610put_mpol:
2611		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2612	}
2613}
2614
2615int mpol_set_shared_policy(struct shared_policy *info,
2616			struct vm_area_struct *vma, struct mempolicy *npol)
2617{
2618	int err;
2619	struct sp_node *new = NULL;
2620	unsigned long sz = vma_pages(vma);
2621
2622	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2623		 vma->vm_pgoff,
2624		 sz, npol ? npol->mode : -1,
2625		 npol ? npol->flags : -1,
2626		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2627
2628	if (npol) {
2629		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2630		if (!new)
2631			return -ENOMEM;
2632	}
2633	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2634	if (err && new)
2635		sp_free(new);
2636	return err;
2637}
2638
2639/* Free a backing policy store on inode delete. */
2640void mpol_free_shared_policy(struct shared_policy *p)
2641{
2642	struct sp_node *n;
2643	struct rb_node *next;
2644
2645	if (!p->root.rb_node)
2646		return;
2647	write_lock(&p->lock);
2648	next = rb_first(&p->root);
2649	while (next) {
2650		n = rb_entry(next, struct sp_node, nd);
2651		next = rb_next(&n->nd);
2652		sp_delete(p, n);
2653	}
2654	write_unlock(&p->lock);
2655}
2656
2657#ifdef CONFIG_NUMA_BALANCING
2658static int __initdata numabalancing_override;
2659
2660static void __init check_numabalancing_enable(void)
2661{
2662	bool numabalancing_default = false;
2663
2664	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2665		numabalancing_default = true;
2666
2667	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2668	if (numabalancing_override)
2669		set_numabalancing_state(numabalancing_override == 1);
2670
2671	if (num_online_nodes() > 1 && !numabalancing_override) {
2672		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2673			numabalancing_default ? "Enabling" : "Disabling");
2674		set_numabalancing_state(numabalancing_default);
2675	}
2676}
2677
2678static int __init setup_numabalancing(char *str)
2679{
2680	int ret = 0;
2681	if (!str)
2682		goto out;
2683
2684	if (!strcmp(str, "enable")) {
2685		numabalancing_override = 1;
2686		ret = 1;
2687	} else if (!strcmp(str, "disable")) {
2688		numabalancing_override = -1;
2689		ret = 1;
2690	}
2691out:
2692	if (!ret)
2693		pr_warn("Unable to parse numa_balancing=\n");
2694
2695	return ret;
2696}
2697__setup("numa_balancing=", setup_numabalancing);
2698#else
2699static inline void __init check_numabalancing_enable(void)
2700{
2701}
2702#endif /* CONFIG_NUMA_BALANCING */
2703
2704/* assumes fs == KERNEL_DS */
2705void __init numa_policy_init(void)
2706{
2707	nodemask_t interleave_nodes;
2708	unsigned long largest = 0;
2709	int nid, prefer = 0;
2710
2711	policy_cache = kmem_cache_create("numa_policy",
2712					 sizeof(struct mempolicy),
2713					 0, SLAB_PANIC, NULL);
2714
2715	sn_cache = kmem_cache_create("shared_policy_node",
2716				     sizeof(struct sp_node),
2717				     0, SLAB_PANIC, NULL);
2718
2719	for_each_node(nid) {
2720		preferred_node_policy[nid] = (struct mempolicy) {
2721			.refcnt = ATOMIC_INIT(1),
2722			.mode = MPOL_PREFERRED,
2723			.flags = MPOL_F_MOF | MPOL_F_MORON,
2724			.v = { .preferred_node = nid, },
2725		};
2726	}
2727
2728	/*
2729	 * Set interleaving policy for system init. Interleaving is only
2730	 * enabled across suitably sized nodes (default is >= 16MB), or
2731	 * fall back to the largest node if they're all smaller.
2732	 */
2733	nodes_clear(interleave_nodes);
2734	for_each_node_state(nid, N_MEMORY) {
2735		unsigned long total_pages = node_present_pages(nid);
2736
2737		/* Preserve the largest node */
2738		if (largest < total_pages) {
2739			largest = total_pages;
2740			prefer = nid;
2741		}
2742
2743		/* Interleave this node? */
2744		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2745			node_set(nid, interleave_nodes);
2746	}
2747
2748	/* All too small, use the largest */
2749	if (unlikely(nodes_empty(interleave_nodes)))
2750		node_set(prefer, interleave_nodes);
2751
2752	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2753		pr_err("%s: interleaving failed\n", __func__);
2754
2755	check_numabalancing_enable();
2756}
2757
2758/* Reset policy of current process to default */
2759void numa_default_policy(void)
2760{
2761	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2762}
2763
2764/*
2765 * Parse and format mempolicy from/to strings
2766 */
2767
2768/*
2769 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2770 */
2771static const char * const policy_modes[] =
2772{
2773	[MPOL_DEFAULT]    = "default",
2774	[MPOL_PREFERRED]  = "prefer",
2775	[MPOL_BIND]       = "bind",
2776	[MPOL_INTERLEAVE] = "interleave",
2777	[MPOL_LOCAL]      = "local",
2778};
2779
2780
2781#ifdef CONFIG_TMPFS
2782/**
2783 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2784 * @str:  string containing mempolicy to parse
2785 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2786 *
2787 * Format of input:
2788 *	<mode>[=<flags>][:<nodelist>]
2789 *
2790 * On success, returns 0, else 1
2791 */
2792int mpol_parse_str(char *str, struct mempolicy **mpol)
2793{
2794	struct mempolicy *new = NULL;
2795	unsigned short mode_flags;
2796	nodemask_t nodes;
2797	char *nodelist = strchr(str, ':');
2798	char *flags = strchr(str, '=');
2799	int err = 1, mode;
2800
2801	if (nodelist) {
2802		/* NUL-terminate mode or flags string */
2803		*nodelist++ = '\0';
2804		if (nodelist_parse(nodelist, nodes))
2805			goto out;
2806		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2807			goto out;
2808	} else
2809		nodes_clear(nodes);
2810
2811	if (flags)
2812		*flags++ = '\0';	/* terminate mode string */
2813
2814	mode = match_string(policy_modes, MPOL_MAX, str);
2815	if (mode < 0)
2816		goto out;
2817
2818	switch (mode) {
2819	case MPOL_PREFERRED:
2820		/*
2821		 * Insist on a nodelist of one node only
2822		 */
2823		if (nodelist) {
2824			char *rest = nodelist;
2825			while (isdigit(*rest))
2826				rest++;
2827			if (*rest)
2828				goto out;
2829		}
2830		break;
2831	case MPOL_INTERLEAVE:
2832		/*
2833		 * Default to online nodes with memory if no nodelist
2834		 */
2835		if (!nodelist)
2836			nodes = node_states[N_MEMORY];
2837		break;
2838	case MPOL_LOCAL:
2839		/*
2840		 * Don't allow a nodelist;  mpol_new() checks flags
2841		 */
2842		if (nodelist)
2843			goto out;
2844		mode = MPOL_PREFERRED;
2845		break;
2846	case MPOL_DEFAULT:
2847		/*
2848		 * Insist on a empty nodelist
2849		 */
2850		if (!nodelist)
2851			err = 0;
2852		goto out;
2853	case MPOL_BIND:
2854		/*
2855		 * Insist on a nodelist
2856		 */
2857		if (!nodelist)
2858			goto out;
2859	}
2860
2861	mode_flags = 0;
2862	if (flags) {
2863		/*
2864		 * Currently, we only support two mutually exclusive
2865		 * mode flags.
2866		 */
2867		if (!strcmp(flags, "static"))
2868			mode_flags |= MPOL_F_STATIC_NODES;
2869		else if (!strcmp(flags, "relative"))
2870			mode_flags |= MPOL_F_RELATIVE_NODES;
2871		else
2872			goto out;
2873	}
2874
2875	new = mpol_new(mode, mode_flags, &nodes);
2876	if (IS_ERR(new))
2877		goto out;
2878
2879	/*
2880	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2881	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2882	 */
2883	if (mode != MPOL_PREFERRED)
2884		new->v.nodes = nodes;
2885	else if (nodelist)
2886		new->v.preferred_node = first_node(nodes);
2887	else
2888		new->flags |= MPOL_F_LOCAL;
2889
2890	/*
2891	 * Save nodes for contextualization: this will be used to "clone"
2892	 * the mempolicy in a specific context [cpuset] at a later time.
2893	 */
2894	new->w.user_nodemask = nodes;
2895
2896	err = 0;
2897
2898out:
2899	/* Restore string for error message */
2900	if (nodelist)
2901		*--nodelist = ':';
2902	if (flags)
2903		*--flags = '=';
2904	if (!err)
2905		*mpol = new;
2906	return err;
2907}
2908#endif /* CONFIG_TMPFS */
2909
2910/**
2911 * mpol_to_str - format a mempolicy structure for printing
2912 * @buffer:  to contain formatted mempolicy string
2913 * @maxlen:  length of @buffer
2914 * @pol:  pointer to mempolicy to be formatted
2915 *
2916 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2917 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2918 * longest flag, "relative", and to display at least a few node ids.
2919 */
2920void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2921{
2922	char *p = buffer;
2923	nodemask_t nodes = NODE_MASK_NONE;
2924	unsigned short mode = MPOL_DEFAULT;
2925	unsigned short flags = 0;
2926
2927	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2928		mode = pol->mode;
2929		flags = pol->flags;
2930	}
2931
2932	switch (mode) {
2933	case MPOL_DEFAULT:
2934		break;
2935	case MPOL_PREFERRED:
2936		if (flags & MPOL_F_LOCAL)
2937			mode = MPOL_LOCAL;
2938		else
2939			node_set(pol->v.preferred_node, nodes);
2940		break;
2941	case MPOL_BIND:
2942	case MPOL_INTERLEAVE:
2943		nodes = pol->v.nodes;
2944		break;
2945	default:
2946		WARN_ON_ONCE(1);
2947		snprintf(p, maxlen, "unknown");
2948		return;
2949	}
2950
2951	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2952
2953	if (flags & MPOL_MODE_FLAGS) {
2954		p += snprintf(p, buffer + maxlen - p, "=");
2955
2956		/*
2957		 * Currently, the only defined flags are mutually exclusive
2958		 */
2959		if (flags & MPOL_F_STATIC_NODES)
2960			p += snprintf(p, buffer + maxlen - p, "static");
2961		else if (flags & MPOL_F_RELATIVE_NODES)
2962			p += snprintf(p, buffer + maxlen - p, "relative");
2963	}
2964
2965	if (!nodes_empty(nodes))
2966		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2967			       nodemask_pr_args(&nodes));
2968}
Configure Feed

Configure Feed