Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Generic hugetlb support.
4 * (C) Nadia Yvette Chambers, April 2004
5 */
6#include <linux/list.h>
7#include <linux/init.h>
8#include <linux/mm.h>
9#include <linux/seq_file.h>
10#include <linux/sysctl.h>
11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
13#include <linux/nodemask.h>
14#include <linux/pagemap.h>
15#include <linux/mempolicy.h>
16#include <linux/compiler.h>
17#include <linux/cpuset.h>
18#include <linux/mutex.h>
19#include <linux/memblock.h>
20#include <linux/sysfs.h>
21#include <linux/slab.h>
22#include <linux/sched/mm.h>
23#include <linux/mmdebug.h>
24#include <linux/sched/signal.h>
25#include <linux/rmap.h>
26#include <linux/string_helpers.h>
27#include <linux/swap.h>
28#include <linux/swapops.h>
29#include <linux/jhash.h>
30#include <linux/numa.h>
31#include <linux/llist.h>
32#include <linux/cma.h>
33#include <linux/migrate.h>
34#include <linux/nospec.h>
35#include <linux/delayacct.h>
36#include <linux/memory.h>
37#include <linux/mm_inline.h>
38#include <linux/padata.h>
39
40#include <asm/page.h>
41#include <asm/pgalloc.h>
42#include <asm/tlb.h>
43
44#include <linux/io.h>
45#include <linux/hugetlb.h>
46#include <linux/hugetlb_cgroup.h>
47#include <linux/node.h>
48#include <linux/page_owner.h>
49#include "internal.h"
50#include "hugetlb_vmemmap.h"
51#include <linux/page-isolation.h>
52
53int hugetlb_max_hstate __read_mostly;
54unsigned int default_hstate_idx;
55struct hstate hstates[HUGE_MAX_HSTATE];
56
57#ifdef CONFIG_CMA
58static struct cma *hugetlb_cma[MAX_NUMNODES];
59static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
60#endif
61static unsigned long hugetlb_cma_size __initdata;
62
63__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
64
65/* for command line parsing */
66static struct hstate * __initdata parsed_hstate;
67static unsigned long __initdata default_hstate_max_huge_pages;
68static bool __initdata parsed_valid_hugepagesz = true;
69static bool __initdata parsed_default_hugepagesz;
70static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
71
72/*
73 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
74 * free_huge_pages, and surplus_huge_pages.
75 */
76__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock);
77
78/*
79 * Serializes faults on the same logical page. This is used to
80 * prevent spurious OOMs when the hugepage pool is fully utilized.
81 */
82static int num_fault_mutexes __ro_after_init;
83struct mutex *hugetlb_fault_mutex_table __ro_after_init;
84
85/* Forward declaration */
86static int hugetlb_acct_memory(struct hstate *h, long delta);
87static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
88static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
89static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
90static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
91 unsigned long start, unsigned long end);
92static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
93
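/*
 * Free a hugetlb folio back to the CMA area it was allocated from when
 * possible, otherwise release it to the page allocator via folio_put().
 */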
94static void hugetlb_free_folio(struct folio *folio)
95{
96#ifdef CONFIG_CMA
97 int nid = folio_nid(folio);
98
99 if (cma_free_folio(hugetlb_cma[nid], folio))
100 return;
101#endif
102 folio_put(folio);
103}
104
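/*
 * A subpool can be released once it has no remaining users and, depending on
 * its accounting mode, either has no pages in use (maximum size accounting)
 * or has had its full minimum reservation returned (minimum size accounting).
 */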
105static inline bool subpool_is_free(struct hugepage_subpool *spool)
106{
107 if (spool->count)
108 return false;
109 if (spool->max_hpages != -1)
110 return spool->used_hpages == 0;
111 if (spool->min_hpages != -1)
112 return spool->rsv_hpages == spool->min_hpages;
113
114 return true;
115}
116
117static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
118 unsigned long irq_flags)
119{
120 spin_unlock_irqrestore(&spool->lock, irq_flags);
121
122 /* If no pages are used, and no other handles to the subpool
123 * remain, give up any reservations based on minimum size and
124 * free the subpool */
125 if (subpool_is_free(spool)) {
126 if (spool->min_hpages != -1)
127 hugetlb_acct_memory(spool->hstate,
128 -spool->min_hpages);
129 kfree(spool);
130 }
131}
132
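/*
 * Create a subpool with the given maximum and minimum sizes (-1 means no
 * limit / no minimum). Any minimum size is charged against the global
 * reservation pool up front; if that charge fails, the subpool is not created.
 */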
133struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
134 long min_hpages)
135{
136 struct hugepage_subpool *spool;
137
138 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
139 if (!spool)
140 return NULL;
141
142 spin_lock_init(&spool->lock);
143 spool->count = 1;
144 spool->max_hpages = max_hpages;
145 spool->hstate = h;
146 spool->min_hpages = min_hpages;
147
148 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
149 kfree(spool);
150 return NULL;
151 }
152 spool->rsv_hpages = min_hpages;
153
154 return spool;
155}
156
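/*
 * Drop a reference on the subpool. Once unused, the subpool (and any minimum
 * size reservation it holds) is released.
 */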
157void hugepage_put_subpool(struct hugepage_subpool *spool)
158{
159 unsigned long flags;
160
161 spin_lock_irqsave(&spool->lock, flags);
162 BUG_ON(!spool->count);
163 spool->count--;
164 unlock_or_release_subpool(spool, flags);
165}
166
167/*
168 * Subpool accounting for allocating and reserving pages.
169 * Return -ENOMEM if there are not enough resources to satisfy the
170 * request. Otherwise, return the number of pages by which the
171 * global pools must be adjusted (upward). The returned value may
172 * only be different than the passed value (delta) in the case where
173 * a subpool minimum size must be maintained.
174 */
175static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
176 long delta)
177{
178 long ret = delta;
179
180 if (!spool)
181 return ret;
182
183 spin_lock_irq(&spool->lock);
184
185 if (spool->max_hpages != -1) { /* maximum size accounting */
186 if ((spool->used_hpages + delta) <= spool->max_hpages)
187 spool->used_hpages += delta;
188 else {
189 ret = -ENOMEM;
190 goto unlock_ret;
191 }
192 }
193
194 /* minimum size accounting */
195 if (spool->min_hpages != -1 && spool->rsv_hpages) {
196 if (delta > spool->rsv_hpages) {
197 /*
198 * Asking for more reserves than those already taken on
199 * behalf of subpool. Return difference.
200 */
201 ret = delta - spool->rsv_hpages;
202 spool->rsv_hpages = 0;
203 } else {
204 ret = 0; /* reserves already accounted for */
205 spool->rsv_hpages -= delta;
206 }
207 }
208
209unlock_ret:
210 spin_unlock_irq(&spool->lock);
211 return ret;
212}
213
214/*
215 * Subpool accounting for freeing and unreserving pages.
216 * Return the number of global page reservations that must be dropped.
217 * The return value may only be different than the passed value (delta)
218 * in the case where a subpool minimum size must be maintained.
219 */
220static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
221 long delta)
222{
223 long ret = delta;
224 unsigned long flags;
225
226 if (!spool)
227 return delta;
228
229 spin_lock_irqsave(&spool->lock, flags);
230
231 if (spool->max_hpages != -1) /* maximum size accounting */
232 spool->used_hpages -= delta;
233
234 /* minimum size accounting */
235 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
236 if (spool->rsv_hpages + delta <= spool->min_hpages)
237 ret = 0;
238 else
239 ret = spool->rsv_hpages + delta - spool->min_hpages;
240
241 spool->rsv_hpages += delta;
242 if (spool->rsv_hpages > spool->min_hpages)
243 spool->rsv_hpages = spool->min_hpages;
244 }
245
246 /*
247 * If hugetlbfs_put_super couldn't free spool due to an outstanding
248 * quota reference, free it now.
249 */
250 unlock_or_release_subpool(spool, flags);
251
252 return ret;
253}
254
255static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
256{
257 return HUGETLBFS_SB(inode->i_sb)->spool;
258}
259
260static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
261{
262 return subpool_inode(file_inode(vma->vm_file));
263}
264
265/*
266 * hugetlb vma_lock helper routines
267 */
268void hugetlb_vma_lock_read(struct vm_area_struct *vma)
269{
270 if (__vma_shareable_lock(vma)) {
271 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
272
273 down_read(&vma_lock->rw_sema);
274 } else if (__vma_private_lock(vma)) {
275 struct resv_map *resv_map = vma_resv_map(vma);
276
277 down_read(&resv_map->rw_sema);
278 }
279}
280
281void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
282{
283 if (__vma_shareable_lock(vma)) {
284 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
285
286 up_read(&vma_lock->rw_sema);
287 } else if (__vma_private_lock(vma)) {
288 struct resv_map *resv_map = vma_resv_map(vma);
289
290 up_read(&resv_map->rw_sema);
291 }
292}
293
294void hugetlb_vma_lock_write(struct vm_area_struct *vma)
295{
296 if (__vma_shareable_lock(vma)) {
297 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
298
299 down_write(&vma_lock->rw_sema);
300 } else if (__vma_private_lock(vma)) {
301 struct resv_map *resv_map = vma_resv_map(vma);
302
303 down_write(&resv_map->rw_sema);
304 }
305}
306
307void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
308{
309 if (__vma_shareable_lock(vma)) {
310 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
311
312 up_write(&vma_lock->rw_sema);
313 } else if (__vma_private_lock(vma)) {
314 struct resv_map *resv_map = vma_resv_map(vma);
315
316 up_write(&resv_map->rw_sema);
317 }
318}
319
320int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
321{
322
323 if (__vma_shareable_lock(vma)) {
324 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
325
326 return down_write_trylock(&vma_lock->rw_sema);
327 } else if (__vma_private_lock(vma)) {
328 struct resv_map *resv_map = vma_resv_map(vma);
329
330 return down_write_trylock(&resv_map->rw_sema);
331 }
332
333 return 1;
334}
335
336void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
337{
338 if (__vma_shareable_lock(vma)) {
339 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
340
341 lockdep_assert_held(&vma_lock->rw_sema);
342 } else if (__vma_private_lock(vma)) {
343 struct resv_map *resv_map = vma_resv_map(vma);
344
345 lockdep_assert_held(&resv_map->rw_sema);
346 }
347}
348
349void hugetlb_vma_lock_release(struct kref *kref)
350{
351 struct hugetlb_vma_lock *vma_lock = container_of(kref,
352 struct hugetlb_vma_lock, refs);
353
354 kfree(vma_lock);
355}
356
357static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
358{
359 struct vm_area_struct *vma = vma_lock->vma;
360
361 /*
 362	 * The vma_lock structure may or may not be released as a result of the put;
 363	 * it certainly will no longer be attached to the vma, so clear the pointer.
364 * Semaphore synchronizes access to vma_lock->vma field.
365 */
366 vma_lock->vma = NULL;
367 vma->vm_private_data = NULL;
368 up_write(&vma_lock->rw_sema);
369 kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
370}
371
372static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
373{
374 if (__vma_shareable_lock(vma)) {
375 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
376
377 __hugetlb_vma_unlock_write_put(vma_lock);
378 } else if (__vma_private_lock(vma)) {
379 struct resv_map *resv_map = vma_resv_map(vma);
380
381 /* no free for anon vmas, but still need to unlock */
382 up_write(&resv_map->rw_sema);
383 }
384}
385
386static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
387{
388 /*
389 * Only present in sharable vmas.
390 */
391 if (!vma || !__vma_shareable_lock(vma))
392 return;
393
394 if (vma->vm_private_data) {
395 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
396
397 down_write(&vma_lock->rw_sema);
398 __hugetlb_vma_unlock_write_put(vma_lock);
399 }
400}
401
402static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
403{
404 struct hugetlb_vma_lock *vma_lock;
405
406 /* Only establish in (flags) sharable vmas */
407 if (!vma || !(vma->vm_flags & VM_MAYSHARE))
408 return;
409
410 /* Should never get here with non-NULL vm_private_data */
411 if (vma->vm_private_data)
412 return;
413
414 vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
415 if (!vma_lock) {
416 /*
 417	 * If we cannot allocate the structure, then the vma cannot
 418	 * participate in pmd sharing. This costs only a possible
 419	 * performance enhancement and some memory savings.
420 * However, the lock is also used to synchronize page
421 * faults with truncation. If the lock is not present,
422 * unlikely races could leave pages in a file past i_size
423 * until the file is removed. Warn in the unlikely case of
424 * allocation failure.
425 */
426 pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
427 return;
428 }
429
430 kref_init(&vma_lock->refs);
431 init_rwsem(&vma_lock->rw_sema);
432 vma_lock->vma = vma;
433 vma->vm_private_data = vma_lock;
434}
435
436/* Helper that removes a struct file_region from the resv_map cache and returns
437 * it for use.
438 */
439static struct file_region *
440get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
441{
442 struct file_region *nrg;
443
444 VM_BUG_ON(resv->region_cache_count <= 0);
445
446 resv->region_cache_count--;
447 nrg = list_first_entry(&resv->region_cache, struct file_region, link);
448 list_del(&nrg->link);
449
450 nrg->from = from;
451 nrg->to = to;
452
453 return nrg;
454}
455
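/*
 * Copy the hugetlb cgroup uncharge information from one file_region to
 * another, taking an additional css reference for the new entry.
 */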
456static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
457 struct file_region *rg)
458{
459#ifdef CONFIG_CGROUP_HUGETLB
460 nrg->reservation_counter = rg->reservation_counter;
461 nrg->css = rg->css;
462 if (rg->css)
463 css_get(rg->css);
464#endif
465}
466
467/* Helper that records hugetlb_cgroup uncharge info. */
468static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
469 struct hstate *h,
470 struct resv_map *resv,
471 struct file_region *nrg)
472{
473#ifdef CONFIG_CGROUP_HUGETLB
474 if (h_cg) {
475 nrg->reservation_counter =
476 &h_cg->rsvd_hugepage[hstate_index(h)];
477 nrg->css = &h_cg->css;
478 /*
479 * The caller will hold exactly one h_cg->css reference for the
480 * whole contiguous reservation region. But this area might be
 481	 * scattered when there are already some file_regions residing in
482 * it. As a result, many file_regions may share only one css
483 * reference. In order to ensure that one file_region must hold
484 * exactly one h_cg->css reference, we should do css_get for
485 * each file_region and leave the reference held by caller
486 * untouched.
487 */
488 css_get(&h_cg->css);
489 if (!resv->pages_per_hpage)
490 resv->pages_per_hpage = pages_per_huge_page(h);
491 /* pages_per_hpage should be the same for all entries in
492 * a resv_map.
493 */
494 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
495 } else {
496 nrg->reservation_counter = NULL;
497 nrg->css = NULL;
498 }
499#endif
500}
501
502static void put_uncharge_info(struct file_region *rg)
503{
504#ifdef CONFIG_CGROUP_HUGETLB
505 if (rg->css)
506 css_put(rg->css);
507#endif
508}
509
510static bool has_same_uncharge_info(struct file_region *rg,
511 struct file_region *org)
512{
513#ifdef CONFIG_CGROUP_HUGETLB
514 return rg->reservation_counter == org->reservation_counter &&
515 rg->css == org->css;
516
517#else
518 return true;
519#endif
520}
521
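/*
 * Merge a file_region with its neighbours when their ranges are contiguous
 * and their cgroup uncharge information matches.
 */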
522static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
523{
524 struct file_region *nrg, *prg;
525
526 prg = list_prev_entry(rg, link);
527 if (&prg->link != &resv->regions && prg->to == rg->from &&
528 has_same_uncharge_info(prg, rg)) {
529 prg->to = rg->to;
530
531 list_del(&rg->link);
532 put_uncharge_info(rg);
533 kfree(rg);
534
535 rg = prg;
536 }
537
538 nrg = list_next_entry(rg, link);
539 if (&nrg->link != &resv->regions && nrg->from == rg->to &&
540 has_same_uncharge_info(nrg, rg)) {
541 nrg->from = rg->from;
542
543 list_del(&rg->link);
544 put_uncharge_info(rg);
545 kfree(rg);
546 }
547}
548
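/*
 * Add the range [from, to) to the reserve map using a cached file_region
 * entry, or, when regions_needed is non-NULL, only count the entry that would
 * be required. Returns the number of pages covered by the range.
 */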
549static inline long
550hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
551 long to, struct hstate *h, struct hugetlb_cgroup *cg,
552 long *regions_needed)
553{
554 struct file_region *nrg;
555
556 if (!regions_needed) {
557 nrg = get_file_region_entry_from_cache(map, from, to);
558 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
559 list_add(&nrg->link, rg);
560 coalesce_file_region(map, nrg);
561 } else
562 *regions_needed += 1;
563
564 return to - from;
565}
566
567/*
568 * Must be called with resv->lock held.
569 *
570 * Calling this with regions_needed != NULL will count the number of pages
 571	 * to be added but will not modify the linked list. In that case, regions_needed
 572	 * will indicate the number of file_regions needed in the cache to carry out
 573	 * the addition of the regions for this range.
574 */
575static long add_reservation_in_range(struct resv_map *resv, long f, long t,
576 struct hugetlb_cgroup *h_cg,
577 struct hstate *h, long *regions_needed)
578{
579 long add = 0;
580 struct list_head *head = &resv->regions;
581 long last_accounted_offset = f;
582 struct file_region *iter, *trg = NULL;
583 struct list_head *rg = NULL;
584
585 if (regions_needed)
586 *regions_needed = 0;
587
588 /* In this loop, we essentially handle an entry for the range
589 * [last_accounted_offset, iter->from), at every iteration, with some
590 * bounds checking.
591 */
592 list_for_each_entry_safe(iter, trg, head, link) {
593 /* Skip irrelevant regions that start before our range. */
594 if (iter->from < f) {
595 /* If this region ends after the last accounted offset,
596 * then we need to update last_accounted_offset.
597 */
598 if (iter->to > last_accounted_offset)
599 last_accounted_offset = iter->to;
600 continue;
601 }
602
603 /* When we find a region that starts beyond our range, we've
604 * finished.
605 */
606 if (iter->from >= t) {
607 rg = iter->link.prev;
608 break;
609 }
610
611 /* Add an entry for last_accounted_offset -> iter->from, and
612 * update last_accounted_offset.
613 */
614 if (iter->from > last_accounted_offset)
615 add += hugetlb_resv_map_add(resv, iter->link.prev,
616 last_accounted_offset,
617 iter->from, h, h_cg,
618 regions_needed);
619
620 last_accounted_offset = iter->to;
621 }
622
623 /* Handle the case where our range extends beyond
624 * last_accounted_offset.
625 */
626 if (!rg)
627 rg = head->prev;
628 if (last_accounted_offset < t)
629 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
630 t, h, h_cg, regions_needed);
631
632 return add;
633}
634
635/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
636 */
637static int allocate_file_region_entries(struct resv_map *resv,
638 int regions_needed)
639 __must_hold(&resv->lock)
640{
641 LIST_HEAD(allocated_regions);
642 int to_allocate = 0, i = 0;
643 struct file_region *trg = NULL, *rg = NULL;
644
645 VM_BUG_ON(regions_needed < 0);
646
647 /*
648 * Check for sufficient descriptors in the cache to accommodate
649 * the number of in progress add operations plus regions_needed.
650 *
651 * This is a while loop because when we drop the lock, some other call
652 * to region_add or region_del may have consumed some region_entries,
653 * so we keep looping here until we finally have enough entries for
654 * (adds_in_progress + regions_needed).
655 */
656 while (resv->region_cache_count <
657 (resv->adds_in_progress + regions_needed)) {
658 to_allocate = resv->adds_in_progress + regions_needed -
659 resv->region_cache_count;
660
661 /* At this point, we should have enough entries in the cache
662 * for all the existing adds_in_progress. We should only be
663 * needing to allocate for regions_needed.
664 */
665 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
666
667 spin_unlock(&resv->lock);
668 for (i = 0; i < to_allocate; i++) {
669 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
670 if (!trg)
671 goto out_of_memory;
672 list_add(&trg->link, &allocated_regions);
673 }
674
675 spin_lock(&resv->lock);
676
677 list_splice(&allocated_regions, &resv->region_cache);
678 resv->region_cache_count += to_allocate;
679 }
680
681 return 0;
682
683out_of_memory:
684 list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
685 list_del(&rg->link);
686 kfree(rg);
687 }
688 return -ENOMEM;
689}
690
691/*
692 * Add the huge page range represented by [f, t) to the reserve
693 * map. Regions will be taken from the cache to fill in this range.
694 * Sufficient regions should exist in the cache due to the previous
695 * call to region_chg with the same range, but in some cases the cache will not
696 * have sufficient entries due to races with other code doing region_add or
697 * region_del. The extra needed entries will be allocated.
698 *
699 * regions_needed is the out value provided by a previous call to region_chg.
700 *
701 * Return the number of new huge pages added to the map. This number is greater
702 * than or equal to zero. If file_region entries needed to be allocated for
703 * this operation and we were not able to allocate, it returns -ENOMEM.
704 * region_add of regions of length 1 never allocate file_regions and cannot
705 * fail; region_chg will always allocate at least 1 entry and a region_add for
706 * 1 page will only require at most 1 entry.
707 */
708static long region_add(struct resv_map *resv, long f, long t,
709 long in_regions_needed, struct hstate *h,
710 struct hugetlb_cgroup *h_cg)
711{
712 long add = 0, actual_regions_needed = 0;
713
714 spin_lock(&resv->lock);
715retry:
716
717 /* Count how many regions are actually needed to execute this add. */
718 add_reservation_in_range(resv, f, t, NULL, NULL,
719 &actual_regions_needed);
720
721 /*
722 * Check for sufficient descriptors in the cache to accommodate
723 * this add operation. Note that actual_regions_needed may be greater
724 * than in_regions_needed, as the resv_map may have been modified since
725 * the region_chg call. In this case, we need to make sure that we
726 * allocate extra entries, such that we have enough for all the
727 * existing adds_in_progress, plus the excess needed for this
728 * operation.
729 */
730 if (actual_regions_needed > in_regions_needed &&
731 resv->region_cache_count <
732 resv->adds_in_progress +
733 (actual_regions_needed - in_regions_needed)) {
734 /* region_add operation of range 1 should never need to
735 * allocate file_region entries.
736 */
737 VM_BUG_ON(t - f <= 1);
738
739 if (allocate_file_region_entries(
740 resv, actual_regions_needed - in_regions_needed)) {
741 return -ENOMEM;
742 }
743
744 goto retry;
745 }
746
747 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
748
749 resv->adds_in_progress -= in_regions_needed;
750
751 spin_unlock(&resv->lock);
752 return add;
753}
754
755/*
756 * Examine the existing reserve map and determine how many
757 * huge pages in the specified range [f, t) are NOT currently
758 * represented. This routine is called before a subsequent
759 * call to region_add that will actually modify the reserve
760 * map to add the specified range [f, t). region_chg does
761 * not change the number of huge pages represented by the
 762	 * map. A number of new file_region structures are added to the cache as
 763	 * placeholders for the subsequent region_add call to use. At least 1
764 * file_region structure is added.
765 *
766 * out_regions_needed is the number of regions added to the
767 * resv->adds_in_progress. This value needs to be provided to a follow up call
768 * to region_add or region_abort for proper accounting.
769 *
770 * Returns the number of huge pages that need to be added to the existing
771 * reservation map for the range [f, t). This number is greater or equal to
772 * zero. -ENOMEM is returned if a new file_region structure or cache entry
773 * is needed and can not be allocated.
774 */
775static long region_chg(struct resv_map *resv, long f, long t,
776 long *out_regions_needed)
777{
778 long chg = 0;
779
780 spin_lock(&resv->lock);
781
782 /* Count how many hugepages in this range are NOT represented. */
783 chg = add_reservation_in_range(resv, f, t, NULL, NULL,
784 out_regions_needed);
785
786 if (*out_regions_needed == 0)
787 *out_regions_needed = 1;
788
789 if (allocate_file_region_entries(resv, *out_regions_needed))
790 return -ENOMEM;
791
792 resv->adds_in_progress += *out_regions_needed;
793
794 spin_unlock(&resv->lock);
795 return chg;
796}
797
798/*
799 * Abort the in progress add operation. The adds_in_progress field
800 * of the resv_map keeps track of the operations in progress between
801 * calls to region_chg and region_add. Operations are sometimes
802 * aborted after the call to region_chg. In such cases, region_abort
803 * is called to decrement the adds_in_progress counter. regions_needed
804 * is the value returned by the region_chg call, it is used to decrement
805 * the adds_in_progress counter.
806 *
807 * NOTE: The range arguments [f, t) are not needed or used in this
808 * routine. They are kept to make reading the calling code easier as
809 * arguments will match the associated region_chg call.
810 */
811static void region_abort(struct resv_map *resv, long f, long t,
812 long regions_needed)
813{
814 spin_lock(&resv->lock);
815 VM_BUG_ON(!resv->region_cache_count);
816 resv->adds_in_progress -= regions_needed;
817 spin_unlock(&resv->lock);
818}
819
820/*
821 * Delete the specified range [f, t) from the reserve map. If the
822 * t parameter is LONG_MAX, this indicates that ALL regions after f
823 * should be deleted. Locate the regions which intersect [f, t)
824 * and either trim, delete or split the existing regions.
825 *
826 * Returns the number of huge pages deleted from the reserve map.
827 * In the normal case, the return value is zero or more. In the
828 * case where a region must be split, a new region descriptor must
829 * be allocated. If the allocation fails, -ENOMEM will be returned.
830 * NOTE: If the parameter t == LONG_MAX, then we will never split
831 * a region and possibly return -ENOMEM. Callers specifying
832 * t == LONG_MAX do not need to check for -ENOMEM error.
833 */
834static long region_del(struct resv_map *resv, long f, long t)
835{
836 struct list_head *head = &resv->regions;
837 struct file_region *rg, *trg;
838 struct file_region *nrg = NULL;
839 long del = 0;
840
841retry:
842 spin_lock(&resv->lock);
843 list_for_each_entry_safe(rg, trg, head, link) {
844 /*
845 * Skip regions before the range to be deleted. file_region
846 * ranges are normally of the form [from, to). However, there
847 * may be a "placeholder" entry in the map which is of the form
848 * (from, to) with from == to. Check for placeholder entries
849 * at the beginning of the range to be deleted.
850 */
851 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
852 continue;
853
854 if (rg->from >= t)
855 break;
856
857 if (f > rg->from && t < rg->to) { /* Must split region */
858 /*
859 * Check for an entry in the cache before dropping
860 * lock and attempting allocation.
861 */
862 if (!nrg &&
863 resv->region_cache_count > resv->adds_in_progress) {
864 nrg = list_first_entry(&resv->region_cache,
865 struct file_region,
866 link);
867 list_del(&nrg->link);
868 resv->region_cache_count--;
869 }
870
871 if (!nrg) {
872 spin_unlock(&resv->lock);
873 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
874 if (!nrg)
875 return -ENOMEM;
876 goto retry;
877 }
878
879 del += t - f;
880 hugetlb_cgroup_uncharge_file_region(
881 resv, rg, t - f, false);
882
883 /* New entry for end of split region */
884 nrg->from = t;
885 nrg->to = rg->to;
886
887 copy_hugetlb_cgroup_uncharge_info(nrg, rg);
888
889 INIT_LIST_HEAD(&nrg->link);
890
891 /* Original entry is trimmed */
892 rg->to = f;
893
894 list_add(&nrg->link, &rg->link);
895 nrg = NULL;
896 break;
897 }
898
899 if (f <= rg->from && t >= rg->to) { /* Remove entire region */
900 del += rg->to - rg->from;
901 hugetlb_cgroup_uncharge_file_region(resv, rg,
902 rg->to - rg->from, true);
903 list_del(&rg->link);
904 kfree(rg);
905 continue;
906 }
907
908 if (f <= rg->from) { /* Trim beginning of region */
909 hugetlb_cgroup_uncharge_file_region(resv, rg,
910 t - rg->from, false);
911
912 del += t - rg->from;
913 rg->from = t;
914 } else { /* Trim end of region */
915 hugetlb_cgroup_uncharge_file_region(resv, rg,
916 rg->to - f, false);
917
918 del += rg->to - f;
919 rg->to = f;
920 }
921 }
922
923 spin_unlock(&resv->lock);
924 kfree(nrg);
925 return del;
926}
927
928/*
929 * A rare out of memory error was encountered which prevented removal of
 930	 * the reserve map region for a page. The huge page itself was freed
931 * and removed from the page cache. This routine will adjust the subpool
932 * usage count, and the global reserve count if needed. By incrementing
933 * these counts, the reserve map entry which could not be deleted will
934 * appear as a "reserved" entry instead of simply dangling with incorrect
935 * counts.
936 */
937void hugetlb_fix_reserve_counts(struct inode *inode)
938{
939 struct hugepage_subpool *spool = subpool_inode(inode);
940 long rsv_adjust;
941 bool reserved = false;
942
943 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
944 if (rsv_adjust > 0) {
945 struct hstate *h = hstate_inode(inode);
946
947 if (!hugetlb_acct_memory(h, 1))
948 reserved = true;
949 } else if (!rsv_adjust) {
950 reserved = true;
951 }
952
953 if (!reserved)
954 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
955}
956
957/*
958 * Count and return the number of huge pages in the reserve map
959 * that intersect with the range [f, t).
960 */
961static long region_count(struct resv_map *resv, long f, long t)
962{
963 struct list_head *head = &resv->regions;
964 struct file_region *rg;
965 long chg = 0;
966
967 spin_lock(&resv->lock);
968 /* Locate each segment we overlap with, and count that overlap. */
969 list_for_each_entry(rg, head, link) {
970 long seg_from;
971 long seg_to;
972
973 if (rg->to <= f)
974 continue;
975 if (rg->from >= t)
976 break;
977
978 seg_from = max(rg->from, f);
979 seg_to = min(rg->to, t);
980
981 chg += seg_to - seg_from;
982 }
983 spin_unlock(&resv->lock);
984
985 return chg;
986}
987
988/*
989 * Convert the address within this vma to the page offset within
990 * the mapping, huge page units here.
991 */
992static pgoff_t vma_hugecache_offset(struct hstate *h,
993 struct vm_area_struct *vma, unsigned long address)
994{
995 return ((address - vma->vm_start) >> huge_page_shift(h)) +
996 (vma->vm_pgoff >> huge_page_order(h));
997}
998
999/**
1000 * vma_kernel_pagesize - Page size granularity for this VMA.
1001 * @vma: The user mapping.
1002 *
 1003	 * Folios in this VMA will be aligned to, and at least as large as, the
 1004	 * number of bytes returned by this function.
1005 *
1006 * Return: The default size of the folios allocated when backing a VMA.
1007 */
1008unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
1009{
1010 if (vma->vm_ops && vma->vm_ops->pagesize)
1011 return vma->vm_ops->pagesize(vma);
1012 return PAGE_SIZE;
1013}
1014EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
1015
1016/*
1017 * Return the page size being used by the MMU to back a VMA. In the majority
1018 * of cases, the page size used by the kernel matches the MMU size. On
1019 * architectures where it differs, an architecture-specific 'strong'
1020 * version of this symbol is required.
1021 */
1022__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
1023{
1024 return vma_kernel_pagesize(vma);
1025}
1026
1027/*
1028 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
1029 * bits of the reservation map pointer, which are always clear due to
1030 * alignment.
1031 */
1032#define HPAGE_RESV_OWNER (1UL << 0)
1033#define HPAGE_RESV_UNMAPPED (1UL << 1)
1034#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
1035
1036/*
1037 * These helpers are used to track how many pages are reserved for
1038 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
1039 * is guaranteed to have their future faults succeed.
1040 *
1041 * With the exception of hugetlb_dup_vma_private() which is called at fork(),
1042 * the reserve counters are updated with the hugetlb_lock held. It is safe
1043 * to reset the VMA at fork() time as it is not in use yet and there is no
1044 * chance of the global counters getting corrupted as a result of the values.
1045 *
1046 * The private mapping reservation is represented in a subtly different
1047 * manner to a shared mapping. A shared mapping has a region map associated
 1048	 * with the underlying file; this region map represents the backing file
 1049	 * pages which have ever had a reservation assigned, and it persists even
 1050	 * after the page is instantiated. A private mapping has a region map
 1051	 * associated with the original mmap which is attached to all VMAs which
 1052	 * reference it; this region map represents those offsets which have consumed
 1053	 * a reservation, i.e. where pages have been instantiated.
1054 */
1055static unsigned long get_vma_private_data(struct vm_area_struct *vma)
1056{
1057 return (unsigned long)vma->vm_private_data;
1058}
1059
1060static void set_vma_private_data(struct vm_area_struct *vma,
1061 unsigned long value)
1062{
1063 vma->vm_private_data = (void *)value;
1064}
1065
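/*
 * Record (or clear, when h_cg/h are NULL) the hugetlb cgroup counter from
 * which reservations tracked by this resv_map should be uncharged.
 */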
1066static void
1067resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
1068 struct hugetlb_cgroup *h_cg,
1069 struct hstate *h)
1070{
1071#ifdef CONFIG_CGROUP_HUGETLB
1072 if (!h_cg || !h) {
1073 resv_map->reservation_counter = NULL;
1074 resv_map->pages_per_hpage = 0;
1075 resv_map->css = NULL;
1076 } else {
1077 resv_map->reservation_counter =
1078 &h_cg->rsvd_hugepage[hstate_index(h)];
1079 resv_map->pages_per_hpage = pages_per_huge_page(h);
1080 resv_map->css = &h_cg->css;
1081 }
1082#endif
1083}
1084
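/*
 * Allocate and initialize a reserve map, seeding its region cache with one
 * file_region entry.
 */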
1085struct resv_map *resv_map_alloc(void)
1086{
1087 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
1088 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
1089
1090 if (!resv_map || !rg) {
1091 kfree(resv_map);
1092 kfree(rg);
1093 return NULL;
1094 }
1095
1096 kref_init(&resv_map->refs);
1097 spin_lock_init(&resv_map->lock);
1098 INIT_LIST_HEAD(&resv_map->regions);
1099 init_rwsem(&resv_map->rw_sema);
1100
1101 resv_map->adds_in_progress = 0;
1102 /*
1103 * Initialize these to 0. On shared mappings, 0's here indicate these
1104 * fields don't do cgroup accounting. On private mappings, these will be
1105 * re-initialized to the proper values, to indicate that hugetlb cgroup
1106 * reservations are to be un-charged from here.
1107 */
1108 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
1109
1110 INIT_LIST_HEAD(&resv_map->region_cache);
1111 list_add(&rg->link, &resv_map->region_cache);
1112 resv_map->region_cache_count = 1;
1113
1114 return resv_map;
1115}
1116
1117void resv_map_release(struct kref *ref)
1118{
1119 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
1120 struct list_head *head = &resv_map->region_cache;
1121 struct file_region *rg, *trg;
1122
1123 /* Clear out any active regions before we release the map. */
1124 region_del(resv_map, 0, LONG_MAX);
1125
1126 /* ... and any entries left in the cache */
1127 list_for_each_entry_safe(rg, trg, head, link) {
1128 list_del(&rg->link);
1129 kfree(rg);
1130 }
1131
1132 VM_BUG_ON(resv_map->adds_in_progress);
1133
1134 kfree(resv_map);
1135}
1136
1137static inline struct resv_map *inode_resv_map(struct inode *inode)
1138{
1139 /*
1140 * At inode evict time, i_mapping may not point to the original
1141 * address space within the inode. This original address space
1142 * contains the pointer to the resv_map. So, always use the
1143 * address space embedded within the inode.
1144 * The VERY common case is inode->mapping == &inode->i_data but,
1145 * this may not be true for device special inodes.
1146 */
1147 return (struct resv_map *)(&inode->i_data)->i_private_data;
1148}
1149
1150static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
1151{
1152 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1153 if (vma->vm_flags & VM_MAYSHARE) {
1154 struct address_space *mapping = vma->vm_file->f_mapping;
1155 struct inode *inode = mapping->host;
1156
1157 return inode_resv_map(inode);
1158
1159 } else {
1160 return (struct resv_map *)(get_vma_private_data(vma) &
1161 ~HPAGE_RESV_MASK);
1162 }
1163}
1164
1165static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
1166{
1167 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1168 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1169
1170 set_vma_private_data(vma, (unsigned long)map);
1171}
1172
1173static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
1174{
1175 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1176 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1177
1178 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
1179}
1180
1181static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1182{
1183 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1184
1185 return (get_vma_private_data(vma) & flag) != 0;
1186}
1187
1188bool __vma_private_lock(struct vm_area_struct *vma)
1189{
1190 return !(vma->vm_flags & VM_MAYSHARE) &&
1191 get_vma_private_data(vma) & ~HPAGE_RESV_MASK &&
1192 is_vma_resv_set(vma, HPAGE_RESV_OWNER);
1193}
1194
1195void hugetlb_dup_vma_private(struct vm_area_struct *vma)
1196{
1197 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1198 /*
1199 * Clear vm_private_data
1200 * - For shared mappings this is a per-vma semaphore that may be
1201 * allocated in a subsequent call to hugetlb_vm_op_open.
1202 * Before clearing, make sure pointer is not associated with vma
1203 * as this will leak the structure. This is the case when called
1204 * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
1205 * been called to allocate a new structure.
1206 * - For MAP_PRIVATE mappings, this is the reserve map which does
1207 * not apply to children. Faults generated by the children are
1208 * not guaranteed to succeed, even if read-only.
1209 */
1210 if (vma->vm_flags & VM_MAYSHARE) {
1211 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
1212
1213 if (vma_lock && vma_lock->vma != vma)
1214 vma->vm_private_data = NULL;
1215 } else
1216 vma->vm_private_data = NULL;
1217}
1218
1219/*
1220 * Reset and decrement one ref on hugepage private reservation.
1221 * Called with mm->mmap_lock writer semaphore held.
1222 * This function should be only used by move_vma() and operate on
1223 * same sized vma. It should never come here with last ref on the
1224 * reservation.
1225 */
1226void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1227{
1228 /*
1229 * Clear the old hugetlb private page reservation.
1230 * It has already been transferred to new_vma.
1231 *
1232 * During a mremap() operation of a hugetlb vma we call move_vma()
1233 * which copies vma into new_vma and unmaps vma. After the copy
1234 * operation both new_vma and vma share a reference to the resv_map
1235 * struct, and at that point vma is about to be unmapped. We don't
1236 * want to return the reservation to the pool at unmap of vma because
1237 * the reservation still lives on in new_vma, so simply decrement the
1238 * ref here and remove the resv_map reference from this vma.
1239 */
1240 struct resv_map *reservations = vma_resv_map(vma);
1241
1242 if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1243 resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
1244 kref_put(&reservations->refs, resv_map_release);
1245 }
1246
1247 hugetlb_dup_vma_private(vma);
1248}
1249
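/*
 * Return an unused hugetlb folio to its node's free list.
 * Caller must hold hugetlb_lock.
 */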
1250static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
1251{
1252 int nid = folio_nid(folio);
1253
1254 lockdep_assert_held(&hugetlb_lock);
1255 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1256
1257 list_move(&folio->lru, &h->hugepage_freelists[nid]);
1258 h->free_huge_pages++;
1259 h->free_huge_pages_node[nid]++;
1260 folio_set_hugetlb_freed(folio);
1261}
1262
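/*
 * Take a folio from the given node's free list, skipping folios that are
 * hwpoisoned, isolated, or (for pinning callers) not long-term pinnable.
 * Caller must hold hugetlb_lock.
 */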
1263static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
1264 int nid)
1265{
1266 struct folio *folio;
1267 bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1268
1269 lockdep_assert_held(&hugetlb_lock);
1270 list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
1271 if (pin && !folio_is_longterm_pinnable(folio))
1272 continue;
1273
1274 if (folio_test_hwpoison(folio))
1275 continue;
1276
1277 if (is_migrate_isolate_page(&folio->page))
1278 continue;
1279
1280 list_move(&folio->lru, &h->hugepage_activelist);
1281 folio_ref_unfreeze(folio, 1);
1282 folio_clear_hugetlb_freed(folio);
1283 h->free_huge_pages--;
1284 h->free_huge_pages_node[nid]--;
1285 return folio;
1286 }
1287
1288 return NULL;
1289}
1290
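/*
 * Walk the zonelist for @nid, restricted by @nmask and the current cpuset,
 * and dequeue a free hugetlb folio from the first node that has one.
 */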
1291static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
1292 int nid, nodemask_t *nmask)
1293{
1294 unsigned int cpuset_mems_cookie;
1295 struct zonelist *zonelist;
1296 struct zone *zone;
1297 struct zoneref *z;
1298 int node = NUMA_NO_NODE;
1299
 1300	/* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectify. */
1301 if (nid == NUMA_NO_NODE)
1302 nid = numa_node_id();
1303
1304 zonelist = node_zonelist(nid, gfp_mask);
1305
1306retry_cpuset:
1307 cpuset_mems_cookie = read_mems_allowed_begin();
1308 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1309 struct folio *folio;
1310
1311 if (!cpuset_zone_allowed(zone, gfp_mask))
1312 continue;
1313 /*
1314 * no need to ask again on the same node. Pool is node rather than
1315 * zone aware
1316 */
1317 if (zone_to_nid(zone) == node)
1318 continue;
1319 node = zone_to_nid(zone);
1320
1321 folio = dequeue_hugetlb_folio_node_exact(h, node);
1322 if (folio)
1323 return folio;
1324 }
1325 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1326 goto retry_cpuset;
1327
1328 return NULL;
1329}
1330
1331static unsigned long available_huge_pages(struct hstate *h)
1332{
1333 return h->free_huge_pages - h->resv_huge_pages;
1334}
1335
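/*
 * Dequeue a free hugetlb folio for a fault at @address, honouring the vma's
 * memory policy; gbl_chg indicates whether a page not covered by an existing
 * reservation is required.
 */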
1336static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
1337 struct vm_area_struct *vma,
1338 unsigned long address, long gbl_chg)
1339{
1340 struct folio *folio = NULL;
1341 struct mempolicy *mpol;
1342 gfp_t gfp_mask;
1343 nodemask_t *nodemask;
1344 int nid;
1345
1346 /*
1347 * gbl_chg==1 means the allocation requires a new page that was not
 1348	 * reserved before, so make sure there is at least one free page.
1349 */
1350 if (gbl_chg && !available_huge_pages(h))
1351 goto err;
1352
1353 gfp_mask = htlb_alloc_mask(h);
1354 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1355
1356 if (mpol_is_preferred_many(mpol)) {
1357 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1358 nid, nodemask);
1359
1360 /* Fallback to all nodes if page==NULL */
1361 nodemask = NULL;
1362 }
1363
1364 if (!folio)
1365 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1366 nid, nodemask);
1367
1368 mpol_cond_put(mpol);
1369 return folio;
1370
1371err:
1372 return NULL;
1373}
1374
1375/*
1376 * common helper functions for hstate_next_node_to_{alloc|free}.
1377 * We may have allocated or freed a huge page based on a different
1378 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
1379 * be outside of *nodes_allowed. Ensure that we use an allowed
1380 * node for alloc or free.
1381 */
1382static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1383{
1384 nid = next_node_in(nid, *nodes_allowed);
1385 VM_BUG_ON(nid >= MAX_NUMNODES);
1386
1387 return nid;
1388}
1389
1390static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1391{
1392 if (!node_isset(nid, *nodes_allowed))
1393 nid = next_node_allowed(nid, nodes_allowed);
1394 return nid;
1395}
1396
1397/*
1398 * returns the previously saved node ["this node"] from which to
1399 * allocate a persistent huge page for the pool and advance the
1400 * next node from which to allocate, handling wrap at end of node
1401 * mask.
1402 */
1403static int hstate_next_node_to_alloc(int *next_node,
1404 nodemask_t *nodes_allowed)
1405{
1406 int nid;
1407
1408 VM_BUG_ON(!nodes_allowed);
1409
1410 nid = get_valid_node_allowed(*next_node, nodes_allowed);
1411 *next_node = next_node_allowed(nid, nodes_allowed);
1412
1413 return nid;
1414}
1415
1416/*
1417 * helper for remove_pool_hugetlb_folio() - return the previously saved
1418 * node ["this node"] from which to free a huge page. Advance the
1419 * next node id whether or not we find a free huge page to free so
1420 * that the next attempt to free addresses the next node.
1421 */
1422static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1423{
1424 int nid;
1425
1426 VM_BUG_ON(!nodes_allowed);
1427
1428 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1429 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1430
1431 return nid;
1432}
1433
1434#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \
1435 for (nr_nodes = nodes_weight(*mask); \
1436 nr_nodes > 0 && \
1437 ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \
1438 nr_nodes--)
1439
1440#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1441 for (nr_nodes = nodes_weight(*mask); \
1442 nr_nodes > 0 && \
1443 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1444 nr_nodes--)
1445
1446#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1447#ifdef CONFIG_CONTIG_ALLOC
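/*
 * Allocate a gigantic folio, trying the per-node CMA areas first (when
 * configured) and falling back to a contiguous allocation from the page
 * allocator.
 */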
1448static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
1449 int nid, nodemask_t *nodemask)
1450{
1451 struct folio *folio;
1452 int order = huge_page_order(h);
1453 bool retried = false;
1454
1455 if (nid == NUMA_NO_NODE)
1456 nid = numa_mem_id();
1457retry:
1458 folio = NULL;
1459#ifdef CONFIG_CMA
1460 {
1461 int node;
1462
1463 if (hugetlb_cma[nid])
1464 folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
1465
1466 if (!folio && !(gfp_mask & __GFP_THISNODE)) {
1467 for_each_node_mask(node, *nodemask) {
1468 if (node == nid || !hugetlb_cma[node])
1469 continue;
1470
1471 folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
1472 if (folio)
1473 break;
1474 }
1475 }
1476 }
1477#endif
1478 if (!folio) {
1479 folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
1480 if (!folio)
1481 return NULL;
1482 }
1483
1484 if (folio_ref_freeze(folio, 1))
1485 return folio;
1486
1487 pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio));
1488 hugetlb_free_folio(folio);
1489 if (!retried) {
1490 retried = true;
1491 goto retry;
1492 }
1493 return NULL;
1494}
1495
1496#else /* !CONFIG_CONTIG_ALLOC */
1497static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
1498 int nid, nodemask_t *nodemask)
1499{
1500 return NULL;
1501}
1502#endif /* CONFIG_CONTIG_ALLOC */
1503
1504#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
1505static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
1506 int nid, nodemask_t *nodemask)
1507{
1508 return NULL;
1509}
1510#endif
1511
1512/*
1513 * Remove hugetlb folio from lists.
1514 * If vmemmap exists for the folio, clear the hugetlb flag so that the
1515 * folio appears as just a compound page. Otherwise, wait until after
1516 * allocating vmemmap to clear the flag.
1517 *
1518 * Must be called with hugetlb lock held.
1519 */
1520static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
1521 bool adjust_surplus)
1522{
1523 int nid = folio_nid(folio);
1524
1525 VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
1526 VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
1527
1528 lockdep_assert_held(&hugetlb_lock);
1529 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1530 return;
1531
1532 list_del(&folio->lru);
1533
1534 if (folio_test_hugetlb_freed(folio)) {
1535 folio_clear_hugetlb_freed(folio);
1536 h->free_huge_pages--;
1537 h->free_huge_pages_node[nid]--;
1538 }
1539 if (adjust_surplus) {
1540 h->surplus_huge_pages--;
1541 h->surplus_huge_pages_node[nid]--;
1542 }
1543
1544 /*
1545 * We can only clear the hugetlb flag after allocating vmemmap
1546 * pages. Otherwise, someone (memory error handling) may try to write
1547 * to tail struct pages.
1548 */
1549 if (!folio_test_hugetlb_vmemmap_optimized(folio))
1550 __folio_clear_hugetlb(folio);
1551
1552 h->nr_huge_pages--;
1553 h->nr_huge_pages_node[nid]--;
1554}
1555
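/*
 * Undo remove_hugetlb_folio(): re-account a still vmemmap-optimized folio
 * and place it back on the free list. Caller must hold hugetlb_lock.
 */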
1556static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
1557 bool adjust_surplus)
1558{
1559 int nid = folio_nid(folio);
1560
1561 VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
1562
1563 lockdep_assert_held(&hugetlb_lock);
1564
1565 INIT_LIST_HEAD(&folio->lru);
1566 h->nr_huge_pages++;
1567 h->nr_huge_pages_node[nid]++;
1568
1569 if (adjust_surplus) {
1570 h->surplus_huge_pages++;
1571 h->surplus_huge_pages_node[nid]++;
1572 }
1573
1574 __folio_set_hugetlb(folio);
1575 folio_change_private(folio, NULL);
1576 /*
1577 * We have to set hugetlb_vmemmap_optimized again as above
1578 * folio_change_private(folio, NULL) cleared it.
1579 */
1580 folio_set_hugetlb_vmemmap_optimized(folio);
1581
1582 arch_clear_hugetlb_flags(folio);
1583 enqueue_hugetlb_folio(h, folio);
1584}
1585
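/*
 * Free a hugetlb folio back to the allocator, restoring its vmemmap first if
 * it was optimized. If vmemmap cannot be restored, the folio is instead
 * returned to the pool as a surplus page.
 */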
1586static void __update_and_free_hugetlb_folio(struct hstate *h,
1587 struct folio *folio)
1588{
1589 bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
1590
1591 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1592 return;
1593
1594 /*
1595 * If we don't know which subpages are hwpoisoned, we can't free
1596 * the hugepage, so it's leaked intentionally.
1597 */
1598 if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1599 return;
1600
1601 /*
1602 * If folio is not vmemmap optimized (!clear_flag), then the folio
1603 * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio
1604 * can only be passed hugetlb pages and will BUG otherwise.
1605 */
1606 if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
1607 spin_lock_irq(&hugetlb_lock);
1608 /*
1609 * If we cannot allocate vmemmap pages, just refuse to free the
1610 * page and put the page back on the hugetlb free list and treat
1611 * as a surplus page.
1612 */
1613 add_hugetlb_folio(h, folio, true);
1614 spin_unlock_irq(&hugetlb_lock);
1615 return;
1616 }
1617
1618 /*
1619 * If vmemmap pages were allocated above, then we need to clear the
1620 * hugetlb flag under the hugetlb lock.
1621 */
1622 if (folio_test_hugetlb(folio)) {
1623 spin_lock_irq(&hugetlb_lock);
1624 __folio_clear_hugetlb(folio);
1625 spin_unlock_irq(&hugetlb_lock);
1626 }
1627
1628 /*
1629 * Move PageHWPoison flag from head page to the raw error pages,
1630 * which makes any healthy subpages reusable.
1631 */
1632 if (unlikely(folio_test_hwpoison(folio)))
1633 folio_clear_hugetlb_hwpoison(folio);
1634
1635 folio_ref_unfreeze(folio, 1);
1636
1637 INIT_LIST_HEAD(&folio->_deferred_list);
1638 hugetlb_free_folio(folio);
1639}
1640
1641/*
 1642 * As update_and_free_hugetlb_folio() can be called under any context, we cannot
1643 * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
1644 * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
1645 * the vmemmap pages.
1646 *
1647 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
1648 * freed and frees them one-by-one. As the page->mapping pointer is going
1649 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
1650 * structure of a lockless linked list of huge pages to be freed.
1651 */
1652static LLIST_HEAD(hpage_freelist);
1653
1654static void free_hpage_workfn(struct work_struct *work)
1655{
1656 struct llist_node *node;
1657
1658 node = llist_del_all(&hpage_freelist);
1659
1660 while (node) {
1661 struct folio *folio;
1662 struct hstate *h;
1663
1664 folio = container_of((struct address_space **)node,
1665 struct folio, mapping);
1666 node = node->next;
1667 folio->mapping = NULL;
1668 /*
1669 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
1670 * folio_hstate() is going to trigger because a previous call to
1671 * remove_hugetlb_folio() will clear the hugetlb bit, so do
1672 * not use folio_hstate() directly.
1673 */
1674 h = size_to_hstate(folio_size(folio));
1675
1676 __update_and_free_hugetlb_folio(h, folio);
1677
1678 cond_resched();
1679 }
1680}
1681static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1682
1683static inline void flush_free_hpage_work(struct hstate *h)
1684{
1685 if (hugetlb_vmemmap_optimizable(h))
1686 flush_work(&free_hpage_work);
1687}
1688
1689static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
1690 bool atomic)
1691{
1692 if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
1693 __update_and_free_hugetlb_folio(h, folio);
1694 return;
1695 }
1696
1697 /*
1698 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
1699 *
1700 * Only call schedule_work() if hpage_freelist is previously
1701 * empty. Otherwise, schedule_work() had been called but the workfn
1702 * hasn't retrieved the list yet.
1703 */
1704 if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
1705 schedule_work(&free_hpage_work);
1706}
1707
1708static void bulk_vmemmap_restore_error(struct hstate *h,
1709 struct list_head *folio_list,
1710 struct list_head *non_hvo_folios)
1711{
1712 struct folio *folio, *t_folio;
1713
1714 if (!list_empty(non_hvo_folios)) {
1715 /*
1716 * Free any restored hugetlb pages so that restore of the
1717 * entire list can be retried.
1718 * The idea is that in the common case of ENOMEM errors freeing
1719 * hugetlb pages with vmemmap we will free up memory so that we
1720 * can allocate vmemmap for more hugetlb pages.
1721 */
1722 list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
1723 list_del(&folio->lru);
1724 spin_lock_irq(&hugetlb_lock);
1725 __folio_clear_hugetlb(folio);
1726 spin_unlock_irq(&hugetlb_lock);
1727 update_and_free_hugetlb_folio(h, folio, false);
1728 cond_resched();
1729 }
1730 } else {
1731 /*
1732 * In the case where there are no folios which can be
1733 * immediately freed, we loop through the list trying to restore
1734 * vmemmap individually in the hope that someone elsewhere may
1735 * have done something to cause success (such as freeing some
1736 * memory). If unable to restore a hugetlb page, the hugetlb
1737 * page is made a surplus page and removed from the list.
 1738	 * If we are able to restore vmemmap and free one hugetlb page, we
1739 * quit processing the list to retry the bulk operation.
1740 */
1741 list_for_each_entry_safe(folio, t_folio, folio_list, lru)
1742 if (hugetlb_vmemmap_restore_folio(h, folio)) {
1743 list_del(&folio->lru);
1744 spin_lock_irq(&hugetlb_lock);
1745 add_hugetlb_folio(h, folio, true);
1746 spin_unlock_irq(&hugetlb_lock);
1747 } else {
1748 list_del(&folio->lru);
1749 spin_lock_irq(&hugetlb_lock);
1750 __folio_clear_hugetlb(folio);
1751 spin_unlock_irq(&hugetlb_lock);
1752 update_and_free_hugetlb_folio(h, folio, false);
1753 cond_resched();
1754 break;
1755 }
1756 }
1757}
1758
1759static void update_and_free_pages_bulk(struct hstate *h,
1760 struct list_head *folio_list)
1761{
1762 long ret;
1763 struct folio *folio, *t_folio;
1764 LIST_HEAD(non_hvo_folios);
1765
1766 /*
 1767	 * First allocate required vmemmap (if necessary) for all folios.
1768 * Carefully handle errors and free up any available hugetlb pages
1769 * in an effort to make forward progress.
1770 */
1771retry:
1772 ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
1773 if (ret < 0) {
1774 bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
1775 goto retry;
1776 }
1777
1778 /*
1779 * At this point, list should be empty, ret should be >= 0 and there
1780 * should only be pages on the non_hvo_folios list.
1781 * Do note that the non_hvo_folios list could be empty.
1782 * Without HVO enabled, ret will be 0 and there is no need to call
1783 * __folio_clear_hugetlb as this was done previously.
1784 */
1785 VM_WARN_ON(!list_empty(folio_list));
1786 VM_WARN_ON(ret < 0);
1787 if (!list_empty(&non_hvo_folios) && ret) {
1788 spin_lock_irq(&hugetlb_lock);
1789 list_for_each_entry(folio, &non_hvo_folios, lru)
1790 __folio_clear_hugetlb(folio);
1791 spin_unlock_irq(&hugetlb_lock);
1792 }
1793
1794 list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) {
1795 update_and_free_hugetlb_folio(h, folio, false);
1796 cond_resched();
1797 }
1798}
1799
1800struct hstate *size_to_hstate(unsigned long size)
1801{
1802 struct hstate *h;
1803
1804 for_each_hstate(h) {
1805 if (huge_page_size(h) == size)
1806 return h;
1807 }
1808 return NULL;
1809}
1810
1811void free_huge_folio(struct folio *folio)
1812{
1813 /*
1814 * Can't pass hstate in here because it is called from the
1815 * generic mm code.
1816 */
1817 struct hstate *h = folio_hstate(folio);
1818 int nid = folio_nid(folio);
1819 struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
1820 bool restore_reserve;
1821 unsigned long flags;
1822
1823 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1824 VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
1825
1826 hugetlb_set_folio_subpool(folio, NULL);
1827 if (folio_test_anon(folio))
1828 __ClearPageAnonExclusive(&folio->page);
1829 folio->mapping = NULL;
1830 restore_reserve = folio_test_hugetlb_restore_reserve(folio);
1831 folio_clear_hugetlb_restore_reserve(folio);
1832
1833 /*
1834 * If HPageRestoreReserve was set on page, page allocation consumed a
1835 * reservation. If the page was associated with a subpool, there
1836 * would have been a page reserved in the subpool before allocation
1837 * via hugepage_subpool_get_pages(). Since we are 'restoring' the
1838 * reservation, do not call hugepage_subpool_put_pages() as this will
1839 * remove the reserved page from the subpool.
1840 */
1841 if (!restore_reserve) {
1842 /*
1843 * A return code of zero implies that the subpool will be
1844 * under its minimum size if the reservation is not restored
1845 * after the page is freed. Therefore, force the restore_reserve
1846 * operation.
1847 */
1848 if (hugepage_subpool_put_pages(spool, 1) == 0)
1849 restore_reserve = true;
1850 }
1851
1852 spin_lock_irqsave(&hugetlb_lock, flags);
1853 folio_clear_hugetlb_migratable(folio);
1854 hugetlb_cgroup_uncharge_folio(hstate_index(h),
1855 pages_per_huge_page(h), folio);
1856 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
1857 pages_per_huge_page(h), folio);
1858 lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h));
1859 mem_cgroup_uncharge(folio);
1860 if (restore_reserve)
1861 h->resv_huge_pages++;
1862
1863 if (folio_test_hugetlb_temporary(folio)) {
1864 remove_hugetlb_folio(h, folio, false);
1865 spin_unlock_irqrestore(&hugetlb_lock, flags);
1866 update_and_free_hugetlb_folio(h, folio, true);
1867 } else if (h->surplus_huge_pages_node[nid]) {
1868 /* remove the page from active list */
1869 remove_hugetlb_folio(h, folio, true);
1870 spin_unlock_irqrestore(&hugetlb_lock, flags);
1871 update_and_free_hugetlb_folio(h, folio, true);
1872 } else {
1873 arch_clear_hugetlb_flags(folio);
1874 enqueue_hugetlb_folio(h, folio);
1875 spin_unlock_irqrestore(&hugetlb_lock, flags);
1876 }
1877}
1878
1879/*
1880 * Must be called with the hugetlb lock held
1881 */
1882static void __prep_account_new_huge_page(struct hstate *h, int nid)
1883{
1884 lockdep_assert_held(&hugetlb_lock);
1885 h->nr_huge_pages++;
1886 h->nr_huge_pages_node[nid]++;
1887}
1888
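/*
 * Division of labor between the folio preparation helpers below:
 * init_new_hugetlb_folio() marks the folio as hugetlb and clears its
 * subpool/cgroup state; __prep_new_hugetlb_folio() additionally applies
 * vmemmap optimization; prep_new_hugetlb_folio() also updates the per-node
 * accounting under hugetlb_lock.
 */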
1889static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
1890{
1891 __folio_set_hugetlb(folio);
1892 INIT_LIST_HEAD(&folio->lru);
1893 hugetlb_set_folio_subpool(folio, NULL);
1894 set_hugetlb_cgroup(folio, NULL);
1895 set_hugetlb_cgroup_rsvd(folio, NULL);
1896}
1897
1898static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
1899{
1900 init_new_hugetlb_folio(h, folio);
1901 hugetlb_vmemmap_optimize_folio(h, folio);
1902}
1903
1904static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
1905{
1906 __prep_new_hugetlb_folio(h, folio);
1907 spin_lock_irq(&hugetlb_lock);
1908 __prep_account_new_huge_page(h, nid);
1909 spin_unlock_irq(&hugetlb_lock);
1910}
1911
1912/*
1913 * Find and lock address space (mapping) in write mode.
1914 *
1915 * Upon entry, the folio is locked which means that folio_mapping() is
1916 * stable. Due to locking order, we can only trylock_write. If we can
1917 * not get the lock, simply return NULL to caller.
1918 */
1919struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
1920{
1921 struct address_space *mapping = folio_mapping(folio);
1922
1923 if (!mapping)
1924 return mapping;
1925
1926 if (i_mmap_trylock_write(mapping))
1927 return mapping;
1928
1929 return NULL;
1930}
1931
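/*
 * alloc_buddy_hugetlb_folio - allocate a non-gigantic hugetlb folio from
 * the buddy allocator.  The returned folio is frozen (reference count of
 * zero).  The optional node_alloc_noretry mask records nodes on which a
 * previous 'try hard' allocation failed so that subsequent attempts on
 * those nodes are not retried as aggressively.
 */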
1932static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
1933 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1934 nodemask_t *node_alloc_noretry)
1935{
1936 int order = huge_page_order(h);
1937 struct folio *folio;
1938 bool alloc_try_hard = true;
1939 bool retry = true;
1940
1941 /*
1942 * By default we always try hard to allocate the folio with
1943 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in
1944 * a loop (to adjust global huge page counts) and previous allocation
1945 * failed, do not continue to try hard on the same node. Use the
1946 * node_alloc_noretry bitmap to manage this state information.
1947 */
1948 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1949 alloc_try_hard = false;
1950 if (alloc_try_hard)
1951 gfp_mask |= __GFP_RETRY_MAYFAIL;
1952 if (nid == NUMA_NO_NODE)
1953 nid = numa_mem_id();
1954retry:
1955 folio = __folio_alloc(gfp_mask, order, nid, nmask);
1956 /* Ensure hugetlb folio won't have large_rmappable flag set. */
1957 if (folio)
1958 folio_clear_large_rmappable(folio);
1959
1960 if (folio && !folio_ref_freeze(folio, 1)) {
1961 folio_put(folio);
1962 if (retry) { /* retry once */
1963 retry = false;
1964 goto retry;
1965 }
1966 /* WOW! twice in a row. */
1967 pr_warn("HugeTLB unexpected inflated folio ref count\n");
1968 folio = NULL;
1969 }
1970
1971 /*
1972 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
1973 * folio this indicates an overall state change. Clear bit so
1974 * that we resume normal 'try hard' allocations.
1975 */
1976 if (node_alloc_noretry && folio && !alloc_try_hard)
1977 node_clear(nid, *node_alloc_noretry);
1978
1979 /*
1980 * If we tried hard to get a folio but failed, set bit so that
1981 * subsequent attempts will not try as hard until there is an
1982 * overall state change.
1983 */
1984 if (node_alloc_noretry && !folio && alloc_try_hard)
1985 node_set(nid, *node_alloc_noretry);
1986
1987 if (!folio) {
1988 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1989 return NULL;
1990 }
1991
1992 __count_vm_event(HTLB_BUDDY_PGALLOC);
1993 return folio;
1994}
1995
1996static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
1997 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1998 nodemask_t *node_alloc_noretry)
1999{
2000 struct folio *folio;
2001
2002 if (hstate_is_gigantic(h))
2003 folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
2004 else
2005 folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry);
2006 if (folio)
2007 init_new_hugetlb_folio(h, folio);
2008 return folio;
2009}
2010
2011/*
2012 * Common helper to allocate a fresh hugetlb page. All specific allocators
2013 * should use this function to get new hugetlb pages.
2014 *
2015 * Note that returned page is 'frozen': ref count of head page and all tail
2016 * pages is zero.
2017 */
2018static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
2019 gfp_t gfp_mask, int nid, nodemask_t *nmask)
2020{
2021 struct folio *folio;
2022
2023 if (hstate_is_gigantic(h))
2024 folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
2025 else
2026 folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2027 if (!folio)
2028 return NULL;
2029
2030 prep_new_hugetlb_folio(h, folio, folio_nid(folio));
2031 return folio;
2032}
2033
2034static void prep_and_add_allocated_folios(struct hstate *h,
2035 struct list_head *folio_list)
2036{
2037 unsigned long flags;
2038 struct folio *folio, *tmp_f;
2039
2040 /* Send list for bulk vmemmap optimization processing */
2041 hugetlb_vmemmap_optimize_folios(h, folio_list);
2042
2043 /* Add all new pool pages to free lists in one lock cycle */
2044 spin_lock_irqsave(&hugetlb_lock, flags);
2045 list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
2046 __prep_account_new_huge_page(h, folio_nid(folio));
2047 enqueue_hugetlb_folio(h, folio);
2048 }
2049 spin_unlock_irqrestore(&hugetlb_lock, flags);
2050}
2051
2052/*
2053 * Allocates a fresh hugetlb page in a node interleaved manner. The page
2054 * will later be added to the appropriate hugetlb pool.
2055 */
2056static struct folio *alloc_pool_huge_folio(struct hstate *h,
2057 nodemask_t *nodes_allowed,
2058 nodemask_t *node_alloc_noretry,
2059 int *next_node)
2060{
2061 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2062 int nr_nodes, node;
2063
2064 for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) {
2065 struct folio *folio;
2066
2067 folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
2068 nodes_allowed, node_alloc_noretry);
2069 if (folio)
2070 return folio;
2071 }
2072
2073 return NULL;
2074}
2075
2076/*
2077 * Remove a huge page from the pool, starting from the next node to free. Attempt to keep
2078 * persistent huge pages more or less balanced over allowed nodes.
2079 * This routine only 'removes' the hugetlb page. The caller must make
2080 * an additional call to free the page to low level allocators.
2081 * Called with hugetlb_lock locked.
2082 */
2083static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
2084 nodemask_t *nodes_allowed, bool acct_surplus)
2085{
2086 int nr_nodes, node;
2087 struct folio *folio = NULL;
2088
2089 lockdep_assert_held(&hugetlb_lock);
2090 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2091 /*
2092 * If we're returning unused surplus pages, only examine
2093 * nodes with surplus pages.
2094 */
2095 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2096 !list_empty(&h->hugepage_freelists[node])) {
2097 folio = list_entry(h->hugepage_freelists[node].next,
2098 struct folio, lru);
2099 remove_hugetlb_folio(h, folio, acct_surplus);
2100 break;
2101 }
2102 }
2103
2104 return folio;
2105}
2106
2107/*
2108 * Dissolve a given free hugetlb folio into free buddy pages. This function
2109 * does nothing for in-use hugetlb folios and non-hugetlb folios.
2110 * This function returns values like below:
2111 *
2112 * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
2113 * when the system is under memory pressure and the feature of
2114 * freeing unused vmemmap pages associated with each hugetlb page
2115 * is enabled.
2116 * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
2117 * (allocated or reserved.)
2118 * 0: successfully dissolved free hugepages or the page is not a
2119 * hugepage (considered as already dissolved)
2120 */
2121int dissolve_free_hugetlb_folio(struct folio *folio)
2122{
2123 int rc = -EBUSY;
2124
2125retry:
2126 /* Not to disrupt normal path by vainly holding hugetlb_lock */
2127 if (!folio_test_hugetlb(folio))
2128 return 0;
2129
2130 spin_lock_irq(&hugetlb_lock);
2131 if (!folio_test_hugetlb(folio)) {
2132 rc = 0;
2133 goto out;
2134 }
2135
2136 if (!folio_ref_count(folio)) {
2137 struct hstate *h = folio_hstate(folio);
2138 if (!available_huge_pages(h))
2139 goto out;
2140
2141 /*
2142 * We should make sure that the page is already on the free list
2143 * when it is dissolved.
2144 */
2145 if (unlikely(!folio_test_hugetlb_freed(folio))) {
2146 spin_unlock_irq(&hugetlb_lock);
2147 cond_resched();
2148
2149 /*
2150 * Theoretically, we should return -EBUSY when we
2151 * encounter this race. In fact, we have a chance
2152 * to successfully dissolve the page if we retry,
2153 * because the race window is quite small.
2154 * Seizing this opportunity is an optimization that
2155 * increases the success rate of dissolving the page.
2156 */
2157 goto retry;
2158 }
2159
2160 remove_hugetlb_folio(h, folio, false);
2161 h->max_huge_pages--;
2162 spin_unlock_irq(&hugetlb_lock);
2163
2164 /*
2165 * Normally update_and_free_hugetlb_folio will allocate required vmemmap
2166 * before freeing the page. update_and_free_hugetlb_folio will fail to
2167 * free the page if it can not allocate required vmemmap. We
2168 * need to adjust max_huge_pages if the page is not freed.
2169 * Attempt to allocate vmemmap here so that we can take
2170 * appropriate action on failure.
2171 *
2172 * The folio_test_hugetlb check here is because
2173 * remove_hugetlb_folio will clear hugetlb folio flag for
2174 * non-vmemmap optimized hugetlb folios.
2175 */
2176 if (folio_test_hugetlb(folio)) {
2177 rc = hugetlb_vmemmap_restore_folio(h, folio);
2178 if (rc) {
2179 spin_lock_irq(&hugetlb_lock);
2180 add_hugetlb_folio(h, folio, false);
2181 h->max_huge_pages++;
2182 goto out;
2183 }
2184 } else
2185 rc = 0;
2186
2187 update_and_free_hugetlb_folio(h, folio, false);
2188 return rc;
2189 }
2190out:
2191 spin_unlock_irq(&hugetlb_lock);
2192 return rc;
2193}
2194
2195/*
2196 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
2197 * make specified memory blocks removable from the system.
2198 * Note that this will dissolve a free gigantic hugepage completely, if any
2199 * part of it lies within the given range.
2200 * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
2201 * free hugetlb folios that were dissolved before that error are lost.
2202 */
2203int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
2204{
2205 unsigned long pfn;
2206 struct folio *folio;
2207 int rc = 0;
2208 unsigned int order;
2209 struct hstate *h;
2210
2211 if (!hugepages_supported())
2212 return rc;
2213
2214 order = huge_page_order(&default_hstate);
2215 for_each_hstate(h)
2216 order = min(order, huge_page_order(h));
2217
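	/*
	 * Walk the range in steps of the smallest supported huge page size so
	 * that no hugetlb folio in the range is skipped;
	 * dissolve_free_hugetlb_folio() treats non-hugetlb pfns as already
	 * dissolved.
	 */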
2218 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
2219 folio = pfn_folio(pfn);
2220 rc = dissolve_free_hugetlb_folio(folio);
2221 if (rc)
2222 break;
2223 }
2224
2225 return rc;
2226}
2227
2228/*
2229 * Allocates a fresh surplus page from the page allocator.
2230 */
2231static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
2232 gfp_t gfp_mask, int nid, nodemask_t *nmask)
2233{
2234 struct folio *folio = NULL;
2235
2236 if (hstate_is_gigantic(h))
2237 return NULL;
2238
2239 spin_lock_irq(&hugetlb_lock);
2240 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2241 goto out_unlock;
2242 spin_unlock_irq(&hugetlb_lock);
2243
2244 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
2245 if (!folio)
2246 return NULL;
2247
2248 spin_lock_irq(&hugetlb_lock);
2249 /*
2250 * We could have raced with the pool size change.
2251 * Double check that and simply deallocate the new page
2252 * if we would end up overcommitting the surpluses. Abuse
2253 * the temporary page flag to work around the nasty free_huge_folio
2254 * code flow.
2255 */
2256 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2257 folio_set_hugetlb_temporary(folio);
2258 spin_unlock_irq(&hugetlb_lock);
2259 free_huge_folio(folio);
2260 return NULL;
2261 }
2262
2263 h->surplus_huge_pages++;
2264 h->surplus_huge_pages_node[folio_nid(folio)]++;
2265
2266out_unlock:
2267 spin_unlock_irq(&hugetlb_lock);
2268
2269 return folio;
2270}
2271
2272static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
2273 int nid, nodemask_t *nmask)
2274{
2275 struct folio *folio;
2276
2277 if (hstate_is_gigantic(h))
2278 return NULL;
2279
2280 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
2281 if (!folio)
2282 return NULL;
2283
2284 /* fresh huge pages are frozen */
2285 folio_ref_unfreeze(folio, 1);
2286 /*
2287 * We do not account these pages as surplus because they are only
2288 * temporary and will be released properly on the last reference.
2289 */
2290 folio_set_hugetlb_temporary(folio);
2291
2292 return folio;
2293}
2294
2295/*
2296 * Use the VMA's mpolicy to allocate a huge page from the buddy.
2297 */
2298static
2299struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
2300 struct vm_area_struct *vma, unsigned long addr)
2301{
2302 struct folio *folio = NULL;
2303 struct mempolicy *mpol;
2304 gfp_t gfp_mask = htlb_alloc_mask(h);
2305 int nid;
2306 nodemask_t *nodemask;
2307
2308 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2309 if (mpol_is_preferred_many(mpol)) {
2310 gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2311
2312 folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
2313
2314 /* Fallback to all nodes if page==NULL */
2315 nodemask = NULL;
2316 }
2317
2318 if (!folio)
2319 folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);
2320 mpol_cond_put(mpol);
2321 return folio;
2322}
2323
2324struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
2325 nodemask_t *nmask, gfp_t gfp_mask)
2326{
2327 struct folio *folio;
2328
2329 spin_lock_irq(&hugetlb_lock);
2330 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid,
2331 nmask);
2332 if (folio) {
2333 VM_BUG_ON(!h->resv_huge_pages);
2334 h->resv_huge_pages--;
2335 }
2336
2337 spin_unlock_irq(&hugetlb_lock);
2338 return folio;
2339}
2340
2341/* folio migration callback function */
2342struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
2343 nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
2344{
2345 spin_lock_irq(&hugetlb_lock);
2346 if (available_huge_pages(h)) {
2347 struct folio *folio;
2348
2349 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
2350 preferred_nid, nmask);
2351 if (folio) {
2352 spin_unlock_irq(&hugetlb_lock);
2353 return folio;
2354 }
2355 }
2356 spin_unlock_irq(&hugetlb_lock);
2357
2358 /* We cannot fallback to other nodes, as we could break the per-node pool. */
2359 if (!allow_alloc_fallback)
2360 gfp_mask |= __GFP_THISNODE;
2361
2362 return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
2363}
2364
2365static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
2366{
2367#ifdef CONFIG_NUMA
2368 struct mempolicy *mpol = get_task_policy(current);
2369
2370 /*
2371 * Only enforce MPOL_BIND policy which overlaps with cpuset policy
2372 * (from policy_nodemask) specifically for hugetlb case
2373 */
2374 if (mpol->mode == MPOL_BIND &&
2375 (apply_policy_zone(mpol, gfp_zone(gfp)) &&
2376 cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
2377 return &mpol->nodes;
2378#endif
2379 return NULL;
2380}
2381
2382/*
2383 * Increase the hugetlb pool such that it can accommodate a reservation
2384 * of size 'delta'.
2385 */
2386static int gather_surplus_pages(struct hstate *h, long delta)
2387 __must_hold(&hugetlb_lock)
2388{
2389 LIST_HEAD(surplus_list);
2390 struct folio *folio, *tmp;
2391 int ret;
2392 long i;
2393 long needed, allocated;
2394 bool alloc_ok = true;
2395 int node;
2396 nodemask_t *mbind_nodemask, alloc_nodemask;
2397
2398 mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
2399 if (mbind_nodemask)
2400 nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
2401 else
2402 alloc_nodemask = cpuset_current_mems_allowed;
2403
2404 lockdep_assert_held(&hugetlb_lock);
2405 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2406 if (needed <= 0) {
2407 h->resv_huge_pages += delta;
2408 return 0;
2409 }
2410
2411 allocated = 0;
2412
2413 ret = -ENOMEM;
2414retry:
2415 spin_unlock_irq(&hugetlb_lock);
2416 for (i = 0; i < needed; i++) {
2417 folio = NULL;
2418
2419 /* Prioritize current node */
2420 if (node_isset(numa_mem_id(), alloc_nodemask))
2421 folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
2422 numa_mem_id(), NULL);
2423
2424 if (!folio) {
2425 for_each_node_mask(node, alloc_nodemask) {
2426 if (node == numa_mem_id())
2427 continue;
2428 folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
2429 node, NULL);
2430 if (folio)
2431 break;
2432 }
2433 }
2434 if (!folio) {
2435 alloc_ok = false;
2436 break;
2437 }
2438 list_add(&folio->lru, &surplus_list);
2439 cond_resched();
2440 }
2441 allocated += i;
2442
2443 /*
2444 * After retaking hugetlb_lock, we need to recalculate 'needed'
2445 * because either resv_huge_pages or free_huge_pages may have changed.
2446 */
2447 spin_lock_irq(&hugetlb_lock);
2448 needed = (h->resv_huge_pages + delta) -
2449 (h->free_huge_pages + allocated);
2450 if (needed > 0) {
2451 if (alloc_ok)
2452 goto retry;
2453 /*
2454 * We were not able to allocate enough pages to
2455 * satisfy the entire reservation so we free what
2456 * we've allocated so far.
2457 */
2458 goto free;
2459 }
2460 /*
2461 * The surplus_list now contains _at_least_ the number of extra pages
2462 * needed to accommodate the reservation. Add the appropriate number
2463 * of pages to the hugetlb pool and free the extras back to the buddy
2464 * allocator. Commit the entire reservation here to prevent another
2465 * process from stealing the pages as they are added to the pool but
2466 * before they are reserved.
2467 */
2468 needed += allocated;
2469 h->resv_huge_pages += delta;
2470 ret = 0;
2471
2472 /* Free the needed pages to the hugetlb pool */
2473 list_for_each_entry_safe(folio, tmp, &surplus_list, lru) {
2474 if ((--needed) < 0)
2475 break;
2476 /* Add the page to the hugetlb allocator */
2477 enqueue_hugetlb_folio(h, folio);
2478 }
2479free:
2480 spin_unlock_irq(&hugetlb_lock);
2481
2482 /*
2483 * Free unnecessary surplus pages to the buddy allocator.
2484 * Pages have no ref count, call free_huge_folio directly.
2485 */
2486 list_for_each_entry_safe(folio, tmp, &surplus_list, lru)
2487 free_huge_folio(folio);
2488 spin_lock_irq(&hugetlb_lock);
2489
2490 return ret;
2491}
2492
2493/*
2494 * This routine has two main purposes:
2495 * 1) Decrement the reservation count (resv_huge_pages) by the value passed
2496 * in unused_resv_pages. This corresponds to the prior adjustments made
2497 * to the associated reservation map.
2498 * 2) Free any unused surplus pages that may have been allocated to satisfy
2499 * the reservation. As many as unused_resv_pages may be freed.
2500 */
2501static void return_unused_surplus_pages(struct hstate *h,
2502 unsigned long unused_resv_pages)
2503{
2504 unsigned long nr_pages;
2505 LIST_HEAD(page_list);
2506
2507 lockdep_assert_held(&hugetlb_lock);
2508 /* Uncommit the reservation */
2509 h->resv_huge_pages -= unused_resv_pages;
2510
2511 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
2512 goto out;
2513
2514 /*
2515 * Part (or even all) of the reservation could have been backed
2516 * by pre-allocated pages. Only free surplus pages.
2517 */
2518 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2519
2520 /*
2521 * We want to release as many surplus pages as possible, spread
2522 * evenly across all nodes with memory. Iterate across these nodes
2523 * until we can no longer free unreserved surplus pages. This occurs
2524 * when the nodes with surplus pages have no free pages.
2525 * remove_pool_hugetlb_folio() will balance the freed pages across the
2526 * on-line nodes with memory and will handle the hstate accounting.
2527 */
2528 while (nr_pages--) {
2529 struct folio *folio;
2530
2531 folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], true);
2532 if (!folio)
2533 goto out;
2534
2535 list_add(&folio->lru, &page_list);
2536 }
2537
2538out:
2539 spin_unlock_irq(&hugetlb_lock);
2540 update_and_free_pages_bulk(h, &page_list);
2541 spin_lock_irq(&hugetlb_lock);
2542}
2543
2544
2545/*
2546 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
2547 * are used by the huge page allocation routines to manage reservations.
2548 *
2549 * vma_needs_reservation is called to determine if the huge page at addr
2550 * within the vma has an associated reservation. If a reservation is
2551 * needed, the value 1 is returned. The caller is then responsible for
2552 * managing the global reservation and subpool usage counts. After
2553 * the huge page has been allocated, vma_commit_reservation is called
2554 * to add the page to the reservation map. If the page allocation fails,
2555 * the reservation must be ended instead of committed. vma_end_reservation
2556 * is called in such cases.
2557 *
2558 * In the normal case, vma_commit_reservation returns the same value
2559 * as the preceding vma_needs_reservation call. The only time this
2560 * is not the case is if a reserve map was changed between calls. It
2561 * is the responsibility of the caller to notice the difference and
2562 * take appropriate action.
2563 *
2564 * vma_add_reservation is used in error paths where a reservation must
2565 * be restored when a newly allocated huge page must be freed. It is
2566 * to be called after calling vma_needs_reservation to determine if a
2567 * reservation exists.
2568 *
2569 * vma_del_reservation is used in error paths where an entry in the reserve
2570 * map was created during huge page allocation and must be removed. It is to
2571 * be called after calling vma_needs_reservation to determine if a reservation
2572 * exists.
2573 */
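/*
 * Typical usage (an illustrative sketch only; see alloc_hugetlb_folio()
 * and restore_reserve_on_error() for the real call sites):
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	if (chg < 0)
 *		return error;
 *	... allocate the huge page ...
 *	if (allocation succeeded)
 *		vma_commit_reservation(h, vma, addr);
 *	else
 *		vma_end_reservation(h, vma, addr);
 */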
2574enum vma_resv_mode {
2575 VMA_NEEDS_RESV,
2576 VMA_COMMIT_RESV,
2577 VMA_END_RESV,
2578 VMA_ADD_RESV,
2579 VMA_DEL_RESV,
2580};
2581static long __vma_reservation_common(struct hstate *h,
2582 struct vm_area_struct *vma, unsigned long addr,
2583 enum vma_resv_mode mode)
2584{
2585 struct resv_map *resv;
2586 pgoff_t idx;
2587 long ret;
2588 long dummy_out_regions_needed;
2589
2590 resv = vma_resv_map(vma);
2591 if (!resv)
2592 return 1;
2593
2594 idx = vma_hugecache_offset(h, vma, addr);
2595 switch (mode) {
2596 case VMA_NEEDS_RESV:
2597 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2598 /* We assume that vma_reservation_* routines always operate on
2599 * 1 page, and that adding to resv map a 1 page entry can only
2600 * ever require 1 region.
2601 */
2602 VM_BUG_ON(dummy_out_regions_needed != 1);
2603 break;
2604 case VMA_COMMIT_RESV:
2605 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2606 /* region_add calls of range 1 should never fail. */
2607 VM_BUG_ON(ret < 0);
2608 break;
2609 case VMA_END_RESV:
2610 region_abort(resv, idx, idx + 1, 1);
2611 ret = 0;
2612 break;
2613 case VMA_ADD_RESV:
2614 if (vma->vm_flags & VM_MAYSHARE) {
2615 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2616 /* region_add calls of range 1 should never fail. */
2617 VM_BUG_ON(ret < 0);
2618 } else {
2619 region_abort(resv, idx, idx + 1, 1);
2620 ret = region_del(resv, idx, idx + 1);
2621 }
2622 break;
2623 case VMA_DEL_RESV:
2624 if (vma->vm_flags & VM_MAYSHARE) {
2625 region_abort(resv, idx, idx + 1, 1);
2626 ret = region_del(resv, idx, idx + 1);
2627 } else {
2628 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2629 /* region_add calls of range 1 should never fail. */
2630 VM_BUG_ON(ret < 0);
2631 }
2632 break;
2633 default:
2634 BUG();
2635 }
2636
2637 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2638 return ret;
2639 /*
2640 * We know private mapping must have HPAGE_RESV_OWNER set.
2641 *
2642 * In most cases, reserves always exist for private mappings.
2643 * However, a file associated with mapping could have been
2644 * hole punched or truncated after reserves were consumed.
2645 * A subsequent fault on such a range will not use reserves.
2646 * Subtle - The reserve map for private mappings has the
2647 * opposite meaning than that of shared mappings. If NO
2648 * entry is in the reserve map, it means a reservation exists.
2649 * If an entry exists in the reserve map, it means the
2650 * reservation has already been consumed. As a result, the
2651 * return value of this routine is the opposite of the
2652 * value returned from reserve map manipulation routines above.
2653 */
2654 if (ret > 0)
2655 return 0;
2656 if (ret == 0)
2657 return 1;
2658 return ret;
2659}
2660
2661static long vma_needs_reservation(struct hstate *h,
2662 struct vm_area_struct *vma, unsigned long addr)
2663{
2664 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2665}
2666
2667static long vma_commit_reservation(struct hstate *h,
2668 struct vm_area_struct *vma, unsigned long addr)
2669{
2670 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2671}
2672
2673static void vma_end_reservation(struct hstate *h,
2674 struct vm_area_struct *vma, unsigned long addr)
2675{
2676 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2677}
2678
2679static long vma_add_reservation(struct hstate *h,
2680 struct vm_area_struct *vma, unsigned long addr)
2681{
2682 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2683}
2684
2685static long vma_del_reservation(struct hstate *h,
2686 struct vm_area_struct *vma, unsigned long addr)
2687{
2688 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2689}
2690
2691/*
2692 * This routine is called to restore reservation information on error paths.
2693 * It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
2694 * and the hugetlb mutex should remain held when calling this routine.
2695 *
2696 * It handles two specific cases:
2697 * 1) A reservation was in place and the folio consumed the reservation.
2698 * hugetlb_restore_reserve is set in the folio.
2699 * 2) No reservation was in place for the page, so hugetlb_restore_reserve is
2700 * not set. However, alloc_hugetlb_folio always updates the reserve map.
2701 *
2702 * In case 1, free_huge_folio later in the error path will increment the
2703 * global reserve count. But, free_huge_folio does not have enough context
2704 * to adjust the reservation map. This case deals primarily with private
2705 * mappings. Adjust the reserve map here to be consistent with global
2706 * reserve count adjustments to be made by free_huge_folio. Make sure the
2707 * reserve map indicates there is a reservation present.
2708 *
2709 * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
2710 */
2711void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2712 unsigned long address, struct folio *folio)
2713{
2714 long rc = vma_needs_reservation(h, vma, address);
2715
2716 if (folio_test_hugetlb_restore_reserve(folio)) {
2717 if (unlikely(rc < 0))
2718 /*
2719 * Rare out of memory condition in reserve map
2720 * manipulation. Clear hugetlb_restore_reserve so
2721 * that global reserve count will not be incremented
2722 * by free_huge_folio. This will make it appear
2723 * as though the reservation for this folio was
2724 * consumed. This may prevent the task from
2725 * faulting in the folio at a later time. This
2726 * is better than inconsistent global huge page
2727 * accounting of reserve counts.
2728 */
2729 folio_clear_hugetlb_restore_reserve(folio);
2730 else if (rc)
2731 (void)vma_add_reservation(h, vma, address);
2732 else
2733 vma_end_reservation(h, vma, address);
2734 } else {
2735 if (!rc) {
2736 /*
2737 * This indicates there is an entry in the reserve map
2738 * not added by alloc_hugetlb_folio. We know it was added
2739 * before the alloc_hugetlb_folio call, otherwise
2740 * hugetlb_restore_reserve would be set on the folio.
2741 * Remove the entry so that a subsequent allocation
2742 * does not consume a reservation.
2743 */
2744 rc = vma_del_reservation(h, vma, address);
2745 if (rc < 0)
2746 /*
2747 * VERY rare out of memory condition. Since
2748 * we can not delete the entry, set
2749 * hugetlb_restore_reserve so that the reserve
2750 * count will be incremented when the folio
2751 * is freed. This reserve will be consumed
2752 * on a subsequent allocation.
2753 */
2754 folio_set_hugetlb_restore_reserve(folio);
2755 } else if (rc < 0) {
2756 /*
2757 * Rare out of memory condition from
2758 * vma_needs_reservation call. Memory allocation is
2759 * only attempted if a new entry is needed. Therefore,
2760 * this implies there is not an entry in the
2761 * reserve map.
2762 *
2763 * For shared mappings, no entry in the map indicates
2764 * no reservation. We are done.
2765 */
2766 if (!(vma->vm_flags & VM_MAYSHARE))
2767 /*
2768 * For private mappings, no entry indicates
2769 * a reservation is present. Since we can
2770 * not add an entry, set hugetlb_restore_reserve
2771 * on the folio so reserve count will be
2772 * incremented when freed. This reserve will
2773 * be consumed on a subsequent allocation.
2774 */
2775 folio_set_hugetlb_restore_reserve(folio);
2776 } else
2777 /*
2778 * No reservation present, do nothing
2779 */
2780 vma_end_reservation(h, vma, address);
2781 }
2782}
2783
2784/*
2785 * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
2786 * the old one
2787 * @h: struct hstate old page belongs to
2788 * @old_folio: Old folio to dissolve
2789 * @list: List to isolate the page in case we need to
2790 * Returns 0 on success, otherwise negated error.
2791 */
2792static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
2793 struct folio *old_folio, struct list_head *list)
2794{
2795 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2796 int nid = folio_nid(old_folio);
2797 struct folio *new_folio = NULL;
2798 int ret = 0;
2799
2800retry:
2801 spin_lock_irq(&hugetlb_lock);
2802 if (!folio_test_hugetlb(old_folio)) {
2803 /*
2804 * Freed from under us. Drop new_folio too.
2805 */
2806 goto free_new;
2807 } else if (folio_ref_count(old_folio)) {
2808 bool isolated;
2809
2810 /*
2811 * Someone has grabbed the folio, try to isolate it here.
2812 * Fail with -EBUSY if not possible.
2813 */
2814 spin_unlock_irq(&hugetlb_lock);
2815 isolated = folio_isolate_hugetlb(old_folio, list);
2816 ret = isolated ? 0 : -EBUSY;
2817 spin_lock_irq(&hugetlb_lock);
2818 goto free_new;
2819 } else if (!folio_test_hugetlb_freed(old_folio)) {
2820 /*
2821 * Folio's refcount is 0 but it has not been enqueued in the
2822 * freelist yet. Race window is small, so we can succeed here if
2823 * we retry.
2824 */
2825 spin_unlock_irq(&hugetlb_lock);
2826 cond_resched();
2827 goto retry;
2828 } else {
2829 if (!new_folio) {
2830 spin_unlock_irq(&hugetlb_lock);
2831 new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
2832 NULL, NULL);
2833 if (!new_folio)
2834 return -ENOMEM;
2835 __prep_new_hugetlb_folio(h, new_folio);
2836 goto retry;
2837 }
2838
2839 /*
2840 * Ok, old_folio is still a genuine free hugepage. Remove it from
2841 * the freelist and decrease the counters. These will be
2842 * incremented again when calling __prep_account_new_huge_page()
2843 * and enqueue_hugetlb_folio() for new_folio. The counters will
2844 * remain stable since this happens under the lock.
2845 */
2846 remove_hugetlb_folio(h, old_folio, false);
2847
2848 /*
2849 * Ref count on new_folio is already zero as it was dropped
2850 * earlier. It can be directly added to the pool free list.
2851 */
2852 __prep_account_new_huge_page(h, nid);
2853 enqueue_hugetlb_folio(h, new_folio);
2854
2855 /*
2856 * Folio has been replaced, we can safely free the old one.
2857 */
2858 spin_unlock_irq(&hugetlb_lock);
2859 update_and_free_hugetlb_folio(h, old_folio, false);
2860 }
2861
2862 return ret;
2863
2864free_new:
2865 spin_unlock_irq(&hugetlb_lock);
2866 if (new_folio)
2867 update_and_free_hugetlb_folio(h, new_folio, false);
2868
2869 return ret;
2870}
2871
2872int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
2873{
2874 struct hstate *h;
2875 struct folio *folio = page_folio(page);
2876 int ret = -EBUSY;
2877
2878 /*
2879 * The page might have been dissolved from under our feet, so make sure
2880 * to carefully check the state under the lock.
2881 * Return success when racing as if we dissolved the page ourselves.
2882 */
2883 spin_lock_irq(&hugetlb_lock);
2884 if (folio_test_hugetlb(folio)) {
2885 h = folio_hstate(folio);
2886 } else {
2887 spin_unlock_irq(&hugetlb_lock);
2888 return 0;
2889 }
2890 spin_unlock_irq(&hugetlb_lock);
2891
2892 /*
2893 * Fence off gigantic pages as there is a cyclic dependency between
2894 * alloc_contig_range and them. Return -ENOMEM as this has the effect
2895 * of bailing out right away without further retrying.
2896 */
2897 if (hstate_is_gigantic(h))
2898 return -ENOMEM;
2899
2900 if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
2901 ret = 0;
2902 else if (!folio_ref_count(folio))
2903 ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
2904
2905 return ret;
2906}
2907
2908/*
2909 * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
2910 * range with new folios.
2911 * @start_pfn: start pfn of the given pfn range
2912 * @end_pfn: end pfn of the given pfn range
2913 * Returns 0 on success, otherwise negated error.
2914 */
2915int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
2916{
2917 struct hstate *h;
2918 struct folio *folio;
2919 int ret = 0;
2920
2921 LIST_HEAD(isolate_list);
2922
2923 while (start_pfn < end_pfn) {
2924 folio = pfn_folio(start_pfn);
2925 if (folio_test_hugetlb(folio)) {
2926 h = folio_hstate(folio);
2927 } else {
2928 start_pfn++;
2929 continue;
2930 }
2931
2932 if (!folio_ref_count(folio)) {
2933 ret = alloc_and_dissolve_hugetlb_folio(h, folio,
2934 &isolate_list);
2935 if (ret)
2936 break;
2937
2938 putback_movable_pages(&isolate_list);
2939 }
2940 start_pfn++;
2941 }
2942
2943 return ret;
2944}
2945
2946typedef enum {
2947 /*
2948 * For either 0/1: we checked the per-vma resv map, and one resv
2949 * count either can be reused (0), or an extra needed (1).
2950 */
2951 MAP_CHG_REUSE = 0,
2952 MAP_CHG_NEEDED = 1,
2953 /*
2954 * The per-vma resv count cannot be used, hence a new resv
2955 * count is enforced.
2956 *
2957 * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
2958 * that currently vma_needs_reservation() has an unwanted side
2959 * effect: either end() or commit() must be used to complete the
2960 * transaction. Hence it needs to be differentiated from NEEDED.
2961 */
2962 MAP_CHG_ENFORCED = 2,
2963} map_chg_state;
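/*
 * Illustrative summary of how alloc_hugetlb_folio() below treats each
 * map_chg_state value:
 *
 *	MAP_CHG_REUSE:    an existing per-vma reservation is consumed; no
 *	                  subpool page or reservation cgroup charge is taken.
 *	MAP_CHG_NEEDED:   a subpool page and a reservation cgroup charge are
 *	                  taken in addition.
 *	MAP_CHG_ENFORCED: charged like NEEDED, but the reserve map itself is
 *	                  left untouched (no commit()/end()).
 *
 * For REUSE and NEEDED the reserve map transaction opened by
 * vma_needs_reservation() is completed with commit() or end().
 */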
2964
2965/*
2966 * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
2967 * faults of hugetlb private mappings on top of a non-page-cache folio (in
2968 * which case even if there's a private vma resv map it won't cover such
2969 * allocation). New call sites should (probably) never set it to true!!
2970 * When it's set, the allocation will bypass all vma level reservations.
2971 */
2972struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
2973 unsigned long addr, bool cow_from_owner)
2974{
2975 struct hugepage_subpool *spool = subpool_vma(vma);
2976 struct hstate *h = hstate_vma(vma);
2977 struct folio *folio;
2978 long retval, gbl_chg;
2979 map_chg_state map_chg;
2980 int ret, idx;
2981 struct hugetlb_cgroup *h_cg = NULL;
2982 gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
2983
2984 idx = hstate_index(h);
2985
2986 /* Do we need a separate per-vma reservation? */
2987 if (cow_from_owner) {
2988 /*
2989 * Special case! Since it's a CoW on top of a reserved
2990 * page, the private resv map doesn't count. So it cannot
2991 * consume the per-vma resv map even if it's reserved.
2992 */
2993 map_chg = MAP_CHG_ENFORCED;
2994 } else {
2995 /*
2996 * Examine the region/reserve map to determine if the process
2997 * has a reservation for the page to be allocated. A return
2998 * code of zero indicates a reservation exists (no change).
2999 */
3000 retval = vma_needs_reservation(h, vma, addr);
3001 if (retval < 0)
3002 return ERR_PTR(-ENOMEM);
3003 map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
3004 }
3005
3006 /*
3007 * Do we need a separate global reservation?
3008 *
3009 * Processes that did not create the mapping will have no
3010 * reserves as indicated by the region/reserve map. Check
3011 * that the allocation will not exceed the subpool limit.
3012 * Or if it can get one from the pool reservation directly.
3013 */
3014 if (map_chg) {
3015 gbl_chg = hugepage_subpool_get_pages(spool, 1);
3016 if (gbl_chg < 0)
3017 goto out_end_reservation;
3018 } else {
3019 /*
3020 * If we have the vma reservation ready, no need for extra
3021 * global reservation.
3022 */
3023 gbl_chg = 0;
3024 }
3025
3026 /*
3027 * If this allocation is not consuming a per-vma reservation,
3028 * charge the hugetlb cgroup now.
3029 */
3030 if (map_chg) {
3031 ret = hugetlb_cgroup_charge_cgroup_rsvd(
3032 idx, pages_per_huge_page(h), &h_cg);
3033 if (ret)
3034 goto out_subpool_put;
3035 }
3036
3037 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
3038 if (ret)
3039 goto out_uncharge_cgroup_reservation;
3040
3041 spin_lock_irq(&hugetlb_lock);
3042 /*
3043 * gbl_chg is passed to indicate whether or not a page must be taken
3044 * from the global free pool (global change). gbl_chg == 0 indicates
3045 * a reservation exists for the allocation.
3046 */
3047 folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
3048 if (!folio) {
3049 spin_unlock_irq(&hugetlb_lock);
3050 folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
3051 if (!folio)
3052 goto out_uncharge_cgroup;
3053 spin_lock_irq(&hugetlb_lock);
3054 list_add(&folio->lru, &h->hugepage_activelist);
3055 folio_ref_unfreeze(folio, 1);
3056 /* Fall through */
3057 }
3058
3059 /*
3060 * Either dequeued or buddy-allocated folio needs to add special
3061 * mark to the folio when it consumes a global reservation.
3062 */
3063 if (!gbl_chg) {
3064 folio_set_hugetlb_restore_reserve(folio);
3065 h->resv_huge_pages--;
3066 }
3067
3068 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
3069 /* If allocation is not consuming a reservation, also store the
3070 * hugetlb_cgroup pointer on the page.
3071 */
3072 if (map_chg) {
3073 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
3074 h_cg, folio);
3075 }
3076
3077 spin_unlock_irq(&hugetlb_lock);
3078
3079 hugetlb_set_folio_subpool(folio, spool);
3080
3081 if (map_chg != MAP_CHG_ENFORCED) {
3082 /* commit() is only needed if the map_chg is not enforced */
3083 retval = vma_commit_reservation(h, vma, addr);
3084 /*
3085 * Check for possible race conditions. If a race occurred,
3086 * the page was added to the reservation map between
3087 * vma_needs_reservation and vma_commit_reservation.
3088 * This indicates a race with hugetlb_reserve_pages.
3089 * Adjust for the subpool count incremented above AND
3090 * in hugetlb_reserve_pages for the same page. Also,
3091 * the reservation count added in hugetlb_reserve_pages
3092 * no longer applies.
3093 */
3094 if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
3095 long rsv_adjust;
3096
3097 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
3098 hugetlb_acct_memory(h, -rsv_adjust);
3099 if (map_chg) {
3100 spin_lock_irq(&hugetlb_lock);
3101 hugetlb_cgroup_uncharge_folio_rsvd(
3102 hstate_index(h), pages_per_huge_page(h),
3103 folio);
3104 spin_unlock_irq(&hugetlb_lock);
3105 }
3106 }
3107 }
3108
3109 ret = mem_cgroup_charge_hugetlb(folio, gfp);
3110 /*
3111 * Unconditionally increment NR_HUGETLB here. If it turns out that
3112 * mem_cgroup_charge_hugetlb failed, then immediately free the page and
3113 * decrement NR_HUGETLB.
3114 */
3115 lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
3116
3117 if (ret == -ENOMEM) {
3118 free_huge_folio(folio);
3119 return ERR_PTR(-ENOMEM);
3120 }
3121
3122 return folio;
3123
3124out_uncharge_cgroup:
3125 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
3126out_uncharge_cgroup_reservation:
3127 if (map_chg)
3128 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
3129 h_cg);
3130out_subpool_put:
3131 if (map_chg)
3132 hugepage_subpool_put_pages(spool, 1);
3133out_end_reservation:
3134 if (map_chg != MAP_CHG_ENFORCED)
3135 vma_end_reservation(h, vma, addr);
3136 return ERR_PTR(-ENOSPC);
3137}
3138
3139int alloc_bootmem_huge_page(struct hstate *h, int nid)
3140 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
3141int __alloc_bootmem_huge_page(struct hstate *h, int nid)
3142{
3143 struct huge_bootmem_page *m = NULL; /* initialize for clang */
3144 int nr_nodes, node = nid;
3145
3146 /* do node specific alloc */
3147 if (nid != NUMA_NO_NODE) {
3148 m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h),
3149 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
3150 if (!m)
3151 return 0;
3152 goto found;
3153 }
3154 /* allocate from next node when distributing huge pages */
3155 for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) {
3156 m = memblock_alloc_try_nid_raw(
3157 huge_page_size(h), huge_page_size(h),
3158 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
3159 /*
3160 * Use the beginning of the huge page to store the
3161 * huge_bootmem_page struct (until gather_bootmem
3162 * puts them into the mem_map).
3163 */
3164 if (!m)
3165 return 0;
3166 goto found;
3167 }
3168
3169found:
3170
3171 /*
3172 * Only initialize the head struct page in memmap_init_reserved_pages,
3173 * the rest of the struct pages will be initialized by the HugeTLB
3174 * subsystem itself.
3175 * The head struct page is used to get folio information by the HugeTLB
3176 * subsystem like zone id and node id.
3177 */
3178 memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
3179 huge_page_size(h) - PAGE_SIZE);
3180 /* Put them into a private list first because mem_map is not up yet */
3181 INIT_LIST_HEAD(&m->list);
3182 list_add(&m->list, &huge_boot_pages[node]);
3183 m->hstate = h;
3184 return 1;
3185}
3186
3187/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */
3188static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
3189 unsigned long start_page_number,
3190 unsigned long end_page_number)
3191{
3192 enum zone_type zone = zone_idx(folio_zone(folio));
3193 int nid = folio_nid(folio);
3194 unsigned long head_pfn = folio_pfn(folio);
3195 unsigned long pfn, end_pfn = head_pfn + end_page_number;
3196 int ret;
3197
3198 for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
3199 struct page *page = pfn_to_page(pfn);
3200
3201 __ClearPageReserved(folio_page(folio, pfn - head_pfn));
3202 __init_single_page(page, pfn, zone, nid);
3203 prep_compound_tail((struct page *)folio, pfn - head_pfn);
3204 ret = page_ref_freeze(page, 1);
3205 VM_BUG_ON(!ret);
3206 }
3207}
3208
3209static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
3210 struct hstate *h,
3211 unsigned long nr_pages)
3212{
3213 int ret;
3214
3215 /* Prepare folio head */
3216 __folio_clear_reserved(folio);
3217 __folio_set_head(folio);
3218 ret = folio_ref_freeze(folio, 1);
3219 VM_BUG_ON(!ret);
3220 /* Initialize the necessary tail struct pages */
3221 hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
3222 prep_compound_head((struct page *)folio, huge_page_order(h));
3223}
3224
3225static void __init prep_and_add_bootmem_folios(struct hstate *h,
3226 struct list_head *folio_list)
3227{
3228 unsigned long flags;
3229 struct folio *folio, *tmp_f;
3230
3231 /* Send list for bulk vmemmap optimization processing */
3232 hugetlb_vmemmap_optimize_folios(h, folio_list);
3233
3234 list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
3235 if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
3236 /*
3237 * If HVO fails, initialize all tail struct pages.
3238 * We do not worry about potential long lock hold
3239 * time as this is early in boot and there should
3240 * be no contention.
3241 */
3242 hugetlb_folio_init_tail_vmemmap(folio,
3243 HUGETLB_VMEMMAP_RESERVE_PAGES,
3244 pages_per_huge_page(h));
3245 }
3246 /* Subdivide locks to achieve better parallel performance */
3247 spin_lock_irqsave(&hugetlb_lock, flags);
3248 __prep_account_new_huge_page(h, folio_nid(folio));
3249 enqueue_hugetlb_folio(h, folio);
3250 spin_unlock_irqrestore(&hugetlb_lock, flags);
3251 }
3252}
3253
3254/*
3255 * Put bootmem huge pages into the standard lists after mem_map is up.
3256 * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
3257 */
3258static void __init gather_bootmem_prealloc_node(unsigned long nid)
3259{
3260 LIST_HEAD(folio_list);
3261 struct huge_bootmem_page *m;
3262 struct hstate *h = NULL, *prev_h = NULL;
3263
3264 list_for_each_entry(m, &huge_boot_pages[nid], list) {
3265 struct page *page = virt_to_page(m);
3266 struct folio *folio = (void *)page;
3267
3268 h = m->hstate;
3269 /*
3270 * It is possible to have multiple huge page sizes (hstates)
3271 * in this list. If so, process each size separately.
3272 */
3273 if (h != prev_h && prev_h != NULL)
3274 prep_and_add_bootmem_folios(prev_h, &folio_list);
3275 prev_h = h;
3276
3277 VM_BUG_ON(!hstate_is_gigantic(h));
3278 WARN_ON(folio_ref_count(folio) != 1);
3279
3280 hugetlb_folio_init_vmemmap(folio, h,
3281 HUGETLB_VMEMMAP_RESERVE_PAGES);
3282 init_new_hugetlb_folio(h, folio);
3283 list_add(&folio->lru, &folio_list);
3284
3285 /*
3286 * We need to restore the 'stolen' pages to totalram_pages
3287 * in order to fix confusing memory reports from free(1) and
3288 * other side-effects, like CommitLimit going negative.
3289 */
3290 adjust_managed_page_count(page, pages_per_huge_page(h));
3291 cond_resched();
3292 }
3293
3294 prep_and_add_bootmem_folios(h, &folio_list);
3295}
3296
3297static void __init gather_bootmem_prealloc_parallel(unsigned long start,
3298 unsigned long end, void *arg)
3299{
3300 int nid;
3301
3302 for (nid = start; nid < end; nid++)
3303 gather_bootmem_prealloc_node(nid);
3304}
3305
3306static void __init gather_bootmem_prealloc(void)
3307{
3308 struct padata_mt_job job = {
3309 .thread_fn = gather_bootmem_prealloc_parallel,
3310 .fn_arg = NULL,
3311 .start = 0,
3312 .size = nr_node_ids,
3313 .align = 1,
3314 .min_chunk = 1,
3315 .max_threads = num_node_state(N_MEMORY),
3316 .numa_aware = true,
3317 };
3318
3319 padata_do_multithreaded(&job);
3320}
3321
3322static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3323{
3324 unsigned long i;
3325 char buf[32];
3326 LIST_HEAD(folio_list);
3327
3328 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3329 if (hstate_is_gigantic(h)) {
3330 if (!alloc_bootmem_huge_page(h, nid))
3331 break;
3332 } else {
3333 struct folio *folio;
3334 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3335
3336 folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
3337 &node_states[N_MEMORY], NULL);
3338 if (!folio)
3339 break;
3340 list_add(&folio->lru, &folio_list);
3341 }
3342 cond_resched();
3343 }
3344
3345 if (!list_empty(&folio_list))
3346 prep_and_add_allocated_folios(h, &folio_list);
3347
3348 if (i == h->max_huge_pages_node[nid])
3349 return;
3350
3351 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3352 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3353 h->max_huge_pages_node[nid], buf, nid, i);
3354 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3355 h->max_huge_pages_node[nid] = i;
3356}
3357
3358static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h)
3359{
3360 int i;
3361 bool node_specific_alloc = false;
3362
3363 for_each_online_node(i) {
3364 if (h->max_huge_pages_node[i] > 0) {
3365 hugetlb_hstate_alloc_pages_onenode(h, i);
3366 node_specific_alloc = true;
3367 }
3368 }
3369
3370 return node_specific_alloc;
3371}
3372
3373static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h)
3374{
3375 if (allocated < h->max_huge_pages) {
3376 char buf[32];
3377
3378 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3379 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
3380 h->max_huge_pages, buf, allocated);
3381 h->max_huge_pages = allocated;
3382 }
3383}
3384
3385static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg)
3386{
3387 struct hstate *h = (struct hstate *)arg;
3388 int i, num = end - start;
3389 nodemask_t node_alloc_noretry;
3390 LIST_HEAD(folio_list);
3391 int next_node = first_online_node;
3392
3393 /* Bit mask controlling how hard we retry per-node allocations. */
3394 nodes_clear(node_alloc_noretry);
3395
3396 for (i = 0; i < num; ++i) {
3397 struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
3398 &node_alloc_noretry, &next_node);
3399 if (!folio)
3400 break;
3401
3402 list_move(&folio->lru, &folio_list);
3403 cond_resched();
3404 }
3405
3406 prep_and_add_allocated_folios(h, &folio_list);
3407}
3408
3409static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
3410{
3411 unsigned long i;
3412
3413 for (i = 0; i < h->max_huge_pages; ++i) {
3414 if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3415 break;
3416 cond_resched();
3417 }
3418
3419 return i;
3420}
3421
3422static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
3423{
3424 struct padata_mt_job job = {
3425 .fn_arg = h,
3426 .align = 1,
3427 .numa_aware = true
3428 };
3429
3430 job.thread_fn = hugetlb_pages_alloc_boot_node;
3431 job.start = 0;
3432 job.size = h->max_huge_pages;
3433
3434 /*
3435 * job.max_threads is twice num_node_state(N_MEMORY).
3436 *
3437 * Tests below indicate that a multiplier of 2 significantly improves
3438 * performance, and although larger values also provide improvements,
3439 * the gains are marginal.
3440 *
3441 * Therefore, choosing 2 as the multiplier strikes a good balance between
3442 * enhancing parallel processing capabilities and maintaining efficient
3443 * resource management.
3444 *
3445 * +------------+-------+-------+-------+-------+-------+
3446 * | multiplier | 1 | 2 | 3 | 4 | 5 |
3447 * +------------+-------+-------+-------+-------+-------+
3448 * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
3449 * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms |
3450 * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms |
3451 * +------------+-------+-------+-------+-------+-------+
3452 */
3453 job.max_threads = num_node_state(N_MEMORY) * 2;
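	/*
	 * min_chunk is sized so that, together with the 2x multiplier above,
	 * the boot time allocation splits into roughly max_threads chunks of
	 * similar size.
	 */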
3454 job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
3455 padata_do_multithreaded(&job);
3456
3457 return h->nr_huge_pages;
3458}
3459
3460/*
3461 * NOTE: this routine is called in different contexts for gigantic and
3462 * non-gigantic pages.
3463 * - For gigantic pages, this is called early in the boot process and
3464 * pages are allocated from the memblock allocator or something similar.
3465 * Gigantic pages are actually added to pools later with the routine
3466 * gather_bootmem_prealloc.
3467 * - For non-gigantic pages, this is called later in the boot process after
3468 * all of mm is up and functional. Pages are allocated from buddy and
3469 * then added to hugetlb pools.
3470 */
3471static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3472{
3473 unsigned long allocated;
3474 static bool initialized __initdata;
3475
3476 /* skip gigantic hugepages allocation if hugetlb_cma enabled */
3477 if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3478 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3479 return;
3480 }
3481
3482 /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
3483 if (!initialized) {
3484 int i = 0;
3485
3486 for (i = 0; i < MAX_NUMNODES; i++)
3487 INIT_LIST_HEAD(&huge_boot_pages[i]);
3488 initialized = true;
3489 }
3490
3491 /* do node specific alloc */
3492 if (hugetlb_hstate_alloc_pages_specific_nodes(h))
3493 return;
3494
3495 /* below will do all node balanced alloc */
3496 if (hstate_is_gigantic(h))
3497 allocated = hugetlb_gigantic_pages_alloc_boot(h);
3498 else
3499 allocated = hugetlb_pages_alloc_boot(h);
3500
3501 hugetlb_hstate_alloc_pages_errcheck(allocated, h);
3502}
3503
3504static void __init hugetlb_init_hstates(void)
3505{
3506 struct hstate *h, *h2;
3507
3508 for_each_hstate(h) {
3509 /* oversize hugepages were init'ed in early boot */
3510 if (!hstate_is_gigantic(h))
3511 hugetlb_hstate_alloc_pages(h);
3512
3513 /*
3514 * Set demote order for each hstate. Note that
3515 * h->demote_order is initially 0.
3516 * - We can not demote gigantic pages if runtime freeing
3517 * is not supported, so skip this.
3518 * - If CMA allocation is possible, we can not demote
3519 * HUGETLB_PAGE_ORDER or smaller size pages.
3520 */
3521 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3522 continue;
3523 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3524 continue;
3525 for_each_hstate(h2) {
3526 if (h2 == h)
3527 continue;
3528 if (h2->order < h->order &&
3529 h2->order > h->demote_order)
3530 h->demote_order = h2->order;
3531 }
3532 }
3533}
3534
3535static void __init report_hugepages(void)
3536{
3537 struct hstate *h;
3538
3539 for_each_hstate(h) {
3540 char buf[32];
3541
3542 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3543 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
3544 buf, h->free_huge_pages);
3545 pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
3546 hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
3547 }
3548}
3549
3550#ifdef CONFIG_HIGHMEM
3551static void try_to_free_low(struct hstate *h, unsigned long count,
3552 nodemask_t *nodes_allowed)
3553{
3554 int i;
3555 LIST_HEAD(page_list);
3556
3557 lockdep_assert_held(&hugetlb_lock);
3558 if (hstate_is_gigantic(h))
3559 return;
3560
3561 /*
3562 * Collect pages to be freed on a list, and free after dropping lock
3563 */
3564 for_each_node_mask(i, *nodes_allowed) {
3565 struct folio *folio, *next;
3566 struct list_head *freel = &h->hugepage_freelists[i];
3567 list_for_each_entry_safe(folio, next, freel, lru) {
3568 if (count >= h->nr_huge_pages)
3569 goto out;
3570 if (folio_test_highmem(folio))
3571 continue;
3572 remove_hugetlb_folio(h, folio, false);
3573 list_add(&folio->lru, &page_list);
3574 }
3575 }
3576
3577out:
3578 spin_unlock_irq(&hugetlb_lock);
3579 update_and_free_pages_bulk(h, &page_list);
3580 spin_lock_irq(&hugetlb_lock);
3581}
3582#else
3583static inline void try_to_free_low(struct hstate *h, unsigned long count,
3584 nodemask_t *nodes_allowed)
3585{
3586}
3587#endif
3588
3589/*
3590 * Increment or decrement surplus_huge_pages. Keep node-specific counters
3591 * balanced by operating on them in a round-robin fashion.
3592 * Returns 1 if an adjustment was made.
3593 */
3594static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3595 int delta)
3596{
3597 int nr_nodes, node;
3598
3599 lockdep_assert_held(&hugetlb_lock);
3600 VM_BUG_ON(delta != -1 && delta != 1);
3601
3602 if (delta < 0) {
3603 for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) {
3604 if (h->surplus_huge_pages_node[node])
3605 goto found;
3606 }
3607 } else {
3608 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3609 if (h->surplus_huge_pages_node[node] <
3610 h->nr_huge_pages_node[node])
3611 goto found;
3612 }
3613 }
3614 return 0;
3615
3616found:
3617 h->surplus_huge_pages += delta;
3618 h->surplus_huge_pages_node[node] += delta;
3619 return 1;
3620}
3621
3622#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3623static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
3624 nodemask_t *nodes_allowed)
3625{
3626 unsigned long min_count;
3627 unsigned long allocated;
3628 struct folio *folio;
3629 LIST_HEAD(page_list);
3630 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3631
3632 /*
3633 * Bit mask controlling how hard we retry per-node allocations.
3634 * If we can not allocate the bit mask, do not attempt to allocate
3635 * the requested huge pages.
3636 */
3637 if (node_alloc_noretry)
3638 nodes_clear(*node_alloc_noretry);
3639 else
3640 return -ENOMEM;
3641
3642 /*
3643 * resize_lock mutex prevents concurrent adjustments to number of
3644 * pages in hstate via the proc/sysfs interfaces.
3645 */
3646 mutex_lock(&h->resize_lock);
3647 flush_free_hpage_work(h);
3648 spin_lock_irq(&hugetlb_lock);
3649
3650 /*
3651 * Check for a node specific request.
3652 * Changing node specific huge page count may require a corresponding
3653 * change to the global count. In any case, the passed node mask
3654 * (nodes_allowed) will restrict alloc/free to the specified node.
3655 */
3656 if (nid != NUMA_NO_NODE) {
3657 unsigned long old_count = count;
3658
3659 count += persistent_huge_pages(h) -
3660 (h->nr_huge_pages_node[nid] -
3661 h->surplus_huge_pages_node[nid]);
3662 /*
3663 * User may have specified a large count value which caused the
3664 * above calculation to overflow. In this case, they wanted
3665 * to allocate as many huge pages as possible. Set count to
3666 * largest possible value to align with their intention.
3667 */
3668 if (count < old_count)
3669 count = ULONG_MAX;
3670 }
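	/*
	 * Worked example for the adjustment above (illustrative numbers):
	 * with 100 persistent pages globally, 30 of them on node nid, a
	 * request of 50 for that node becomes count = 50 + (100 - 30) = 120
	 * as the new global target, while nodes_allowed restricts the
	 * actual alloc/free to nid.
	 */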
3671
3672 /*
3673 * Runtime allocation of gigantic pages depends on the ability to
3674 * allocate large contiguous page ranges.
3675 * If the system does not provide this feature, return an error when
3676 * the user tries to allocate gigantic pages, but still let the user
3677 * free boot-time allocated gigantic pages.
3678 */
3679 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3680 if (count > persistent_huge_pages(h)) {
3681 spin_unlock_irq(&hugetlb_lock);
3682 mutex_unlock(&h->resize_lock);
3683 NODEMASK_FREE(node_alloc_noretry);
3684 return -EINVAL;
3685 }
3686 /* Fall through to decrease pool */
3687 }
3688
3689 /*
3690 * Increase the pool size
3691 * First take pages out of surplus state. Then make up the
3692 * remaining difference by allocating fresh huge pages.
3693 *
3694 * We might race with alloc_surplus_hugetlb_folio() here and be unable
3695 * to convert a surplus huge page to a normal huge page. That is
3696 * not critical, though; it just means the overall size of the
3697 * pool might be one hugepage larger than it needs to be, but
3698 * within all the constraints specified by the sysctls.
3699 */
3700 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3701 if (!adjust_pool_surplus(h, nodes_allowed, -1))
3702 break;
3703 }
3704
3705 allocated = 0;
3706 while (count > (persistent_huge_pages(h) + allocated)) {
3707 /*
3708 * If this allocation races such that we no longer need the
3709 * page, free_huge_folio will handle it by freeing the page
3710 * and reducing the surplus.
3711 */
3712 spin_unlock_irq(&hugetlb_lock);
3713
3714 /* yield cpu to avoid soft lockup */
3715 cond_resched();
3716
3717 folio = alloc_pool_huge_folio(h, nodes_allowed,
3718 node_alloc_noretry,
3719 &h->next_nid_to_alloc);
3720 if (!folio) {
3721 prep_and_add_allocated_folios(h, &page_list);
3722 spin_lock_irq(&hugetlb_lock);
3723 goto out;
3724 }
3725
3726 list_add(&folio->lru, &page_list);
3727 allocated++;
3728
3729 /* Bail for signals. Probably ctrl-c from user */
3730 if (signal_pending(current)) {
3731 prep_and_add_allocated_folios(h, &page_list);
3732 spin_lock_irq(&hugetlb_lock);
3733 goto out;
3734 }
3735
3736 spin_lock_irq(&hugetlb_lock);
3737 }
3738
3739 /* Add allocated pages to the pool */
3740 if (!list_empty(&page_list)) {
3741 spin_unlock_irq(&hugetlb_lock);
3742 prep_and_add_allocated_folios(h, &page_list);
3743 spin_lock_irq(&hugetlb_lock);
3744 }
3745
3746 /*
3747 * Decrease the pool size
3748 * First return free pages to the buddy allocator (being careful
3749 * to keep enough around to satisfy reservations). Then place
3750 * pages into surplus state as needed so the pool will shrink
3751 * to the desired size as pages become free.
3752 *
3753 * By placing pages into the surplus state independent of the
3754 * overcommit value, we are allowing the surplus pool size to
3755 * exceed overcommit. There are few sane options here. Since
3756 * alloc_surplus_hugetlb_folio() is checking the global counter,
3757 * though, we'll note that we're not allowed to exceed surplus
3758 * and won't grow the pool anywhere else, not until one of the
3759 * sysctls is changed or the surplus pages go out of use.
3760 */
3761 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3762 min_count = max(count, min_count);
3763 try_to_free_low(h, min_count, nodes_allowed);
3764
3765 /*
3766 * Collect pages to be removed on list without dropping lock
3767 */
3768 while (min_count < persistent_huge_pages(h)) {
3769 folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
3770 if (!folio)
3771 break;
3772
3773 list_add(&folio->lru, &page_list);
3774 }
3775 /* free the pages after dropping lock */
3776 spin_unlock_irq(&hugetlb_lock);
3777 update_and_free_pages_bulk(h, &page_list);
3778 flush_free_hpage_work(h);
3779 spin_lock_irq(&hugetlb_lock);
3780
3781 while (count < persistent_huge_pages(h)) {
3782 if (!adjust_pool_surplus(h, nodes_allowed, 1))
3783 break;
3784 }
3785out:
3786 h->max_huge_pages = persistent_huge_pages(h);
3787 spin_unlock_irq(&hugetlb_lock);
3788 mutex_unlock(&h->resize_lock);
3789
3790 NODEMASK_FREE(node_alloc_noretry);
3791
3792 return 0;
3793}
3794
3795static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
3796 struct list_head *src_list)
3797{
3798 long rc;
3799 struct folio *folio, *next;
3800 LIST_HEAD(dst_list);
3801 LIST_HEAD(ret_list);
3802
3803 rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
3804 list_splice_init(&ret_list, src_list);
3805
3806 /*
3807 * Taking target hstate mutex synchronizes with set_max_huge_pages.
3808 * Without the mutex, pages added to target hstate could be marked
3809 * as surplus.
3810 *
3811 * Note that we already hold src->resize_lock. To prevent deadlock,
3812 * use the convention of always taking larger size hstate mutex first.
3813 */
3814 mutex_lock(&dst->resize_lock);
3815
3816 list_for_each_entry_safe(folio, next, src_list, lru) {
3817 int i;
3818
3819 if (folio_test_hugetlb_vmemmap_optimized(folio))
3820 continue;
3821
3822 list_del(&folio->lru);
3823
3824 split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
3825 pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));
3826
3827 for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
3828 struct page *page = folio_page(folio, i);
3829 /* Careful: see __split_huge_page_tail() */
3830 struct folio *new_folio = (struct folio *)page;
3831
3832 clear_compound_head(page);
3833 prep_compound_page(page, dst->order);
3834
3835 new_folio->mapping = NULL;
3836 init_new_hugetlb_folio(dst, new_folio);
3837 list_add(&new_folio->lru, &dst_list);
3838 }
3839 }
3840
3841 prep_and_add_allocated_folios(dst, &dst_list);
3842
3843 mutex_unlock(&dst->resize_lock);
3844
3845 return rc;
3846}
3847
3848static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
3849 unsigned long nr_to_demote)
3850 __must_hold(&hugetlb_lock)
3851{
3852 int nr_nodes, node;
3853 struct hstate *dst;
3854 long rc = 0;
3855 long nr_demoted = 0;
3856
3857 lockdep_assert_held(&hugetlb_lock);
3858
3859 /* We should never get here if no demote order */
3860 if (!src->demote_order) {
3861 pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3862 return -EINVAL; /* internal error */
3863 }
3864 dst = size_to_hstate(PAGE_SIZE << src->demote_order);
3865
3866 for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
3867 LIST_HEAD(list);
3868 struct folio *folio, *next;
3869
3870 list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
3871 if (folio_test_hwpoison(folio))
3872 continue;
3873
3874 remove_hugetlb_folio(src, folio, false);
3875 list_add(&folio->lru, &list);
3876
3877 if (++nr_demoted == nr_to_demote)
3878 break;
3879 }
3880
3881 spin_unlock_irq(&hugetlb_lock);
3882
3883 rc = demote_free_hugetlb_folios(src, dst, &list);
3884
3885 spin_lock_irq(&hugetlb_lock);
3886
3887 list_for_each_entry_safe(folio, next, &list, lru) {
3888 list_del(&folio->lru);
3889 add_hugetlb_folio(src, folio, false);
3890
3891 nr_demoted--;
3892 }
3893
3894 if (rc < 0 || nr_demoted == nr_to_demote)
3895 break;
3896 }
3897
3898 /*
3899 * Not absolutely necessary, but for consistency update max_huge_pages
3900 * based on pool changes for the demoted page.
3901 */
3902 src->max_huge_pages -= nr_demoted;
3903 dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));
3904
3905 if (rc < 0)
3906 return rc;
3907
3908 if (nr_demoted)
3909 return nr_demoted;
3910 /*
3911 * The only way to get here is if all pages on the free lists are poisoned.
3912 * Return -EBUSY so that caller will not retry.
3913 */
3914 return -EBUSY;
3915}
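/*
 * Accounting example for the function above (illustrative, x86-64): demoting
 * one free 1 GiB page (order 18) to 2 MiB (order 9) removes one page from the
 * source pool and adds 1 << (18 - 9) = 512 pages to the destination pool.
 */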
3916
3917#define HSTATE_ATTR_RO(_name) \
3918 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3919
3920#define HSTATE_ATTR_WO(_name) \
3921 static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
3922
3923#define HSTATE_ATTR(_name) \
3924 static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
3925
3926static struct kobject *hugepages_kobj;
3927static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3928
3929static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3930
3931static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
3932{
3933 int i;
3934
3935 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3936 if (hstate_kobjs[i] == kobj) {
3937 if (nidp)
3938 *nidp = NUMA_NO_NODE;
3939 return &hstates[i];
3940 }
3941
3942 return kobj_to_node_hstate(kobj, nidp);
3943}
3944
3945static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3946 struct kobj_attribute *attr, char *buf)
3947{
3948 struct hstate *h;
3949 unsigned long nr_huge_pages;
3950 int nid;
3951
3952 h = kobj_to_hstate(kobj, &nid);
3953 if (nid == NUMA_NO_NODE)
3954 nr_huge_pages = h->nr_huge_pages;
3955 else
3956 nr_huge_pages = h->nr_huge_pages_node[nid];
3957
3958 return sysfs_emit(buf, "%lu\n", nr_huge_pages);
3959}
3960
3961static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3962 struct hstate *h, int nid,
3963 unsigned long count, size_t len)
3964{
3965 int err;
3966 nodemask_t nodes_allowed, *n_mask;
3967
3968 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3969 return -EINVAL;
3970
3971 if (nid == NUMA_NO_NODE) {
3972 /*
3973 * global hstate attribute
3974 */
3975 if (!(obey_mempolicy &&
3976 init_nodemask_of_mempolicy(&nodes_allowed)))
3977 n_mask = &node_states[N_MEMORY];
3978 else
3979 n_mask = &nodes_allowed;
3980 } else {
3981 /*
3982 * Node specific request. count adjustment happens in
3983 * set_max_huge_pages() after acquiring hugetlb_lock.
3984 */
3985 init_nodemask_of_node(&nodes_allowed, nid);
3986 n_mask = &nodes_allowed;
3987 }
3988
3989 err = set_max_huge_pages(h, count, nid, n_mask);
3990
3991 return err ? err : len;
3992}
3993
3994static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3995 struct kobject *kobj, const char *buf,
3996 size_t len)
3997{
3998 struct hstate *h;
3999 unsigned long count;
4000 int nid;
4001 int err;
4002
4003 err = kstrtoul(buf, 10, &count);
4004 if (err)
4005 return err;
4006
4007 h = kobj_to_hstate(kobj, &nid);
4008 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
4009}
4010
4011static ssize_t nr_hugepages_show(struct kobject *kobj,
4012 struct kobj_attribute *attr, char *buf)
4013{
4014 return nr_hugepages_show_common(kobj, attr, buf);
4015}
4016
4017static ssize_t nr_hugepages_store(struct kobject *kobj,
4018 struct kobj_attribute *attr, const char *buf, size_t len)
4019{
4020 return nr_hugepages_store_common(false, kobj, buf, len);
4021}
4022HSTATE_ATTR(nr_hugepages);
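/*
 * Example usage (illustrative): each hstate is exposed under
 * /sys/kernel/mm/hugepages/, so
 *   echo 512 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 * resizes the 2 MiB pool to 512 persistent pages via set_max_huge_pages().
 */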
4023
4024#ifdef CONFIG_NUMA
4025
4026/*
4027 * hstate attribute for optionally mempolicy-based constraint on persistent
4028 * huge page alloc/free.
4029 */
4030static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
4031 struct kobj_attribute *attr,
4032 char *buf)
4033{
4034 return nr_hugepages_show_common(kobj, attr, buf);
4035}
4036
4037static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
4038 struct kobj_attribute *attr, const char *buf, size_t len)
4039{
4040 return nr_hugepages_store_common(true, kobj, buf, len);
4041}
4042HSTATE_ATTR(nr_hugepages_mempolicy);
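/*
 * Example usage (illustrative): allocation/freeing via this attribute follows
 * the writing task's memory policy, e.g.
 *   numactl -m 0 sh -c \
 *     "echo 64 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages_mempolicy"
 * grows the 2 MiB pool using pages from node 0 only.
 */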
4043#endif
4044
4045
4046static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
4047 struct kobj_attribute *attr, char *buf)
4048{
4049 struct hstate *h = kobj_to_hstate(kobj, NULL);
4050 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
4051}
4052
4053static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
4054 struct kobj_attribute *attr, const char *buf, size_t count)
4055{
4056 int err;
4057 unsigned long input;
4058 struct hstate *h = kobj_to_hstate(kobj, NULL);
4059
4060 if (hstate_is_gigantic(h))
4061 return -EINVAL;
4062
4063 err = kstrtoul(buf, 10, &input);
4064 if (err)
4065 return err;
4066
4067 spin_lock_irq(&hugetlb_lock);
4068 h->nr_overcommit_huge_pages = input;
4069 spin_unlock_irq(&hugetlb_lock);
4070
4071 return count;
4072}
4073HSTATE_ATTR(nr_overcommit_hugepages);
4074
4075static ssize_t free_hugepages_show(struct kobject *kobj,
4076 struct kobj_attribute *attr, char *buf)
4077{
4078 struct hstate *h;
4079 unsigned long free_huge_pages;
4080 int nid;
4081
4082 h = kobj_to_hstate(kobj, &nid);
4083 if (nid == NUMA_NO_NODE)
4084 free_huge_pages = h->free_huge_pages;
4085 else
4086 free_huge_pages = h->free_huge_pages_node[nid];
4087
4088 return sysfs_emit(buf, "%lu\n", free_huge_pages);
4089}
4090HSTATE_ATTR_RO(free_hugepages);
4091
4092static ssize_t resv_hugepages_show(struct kobject *kobj,
4093 struct kobj_attribute *attr, char *buf)
4094{
4095 struct hstate *h = kobj_to_hstate(kobj, NULL);
4096 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
4097}
4098HSTATE_ATTR_RO(resv_hugepages);
4099
4100static ssize_t surplus_hugepages_show(struct kobject *kobj,
4101 struct kobj_attribute *attr, char *buf)
4102{
4103 struct hstate *h;
4104 unsigned long surplus_huge_pages;
4105 int nid;
4106
4107 h = kobj_to_hstate(kobj, &nid);
4108 if (nid == NUMA_NO_NODE)
4109 surplus_huge_pages = h->surplus_huge_pages;
4110 else
4111 surplus_huge_pages = h->surplus_huge_pages_node[nid];
4112
4113 return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
4114}
4115HSTATE_ATTR_RO(surplus_hugepages);
4116
4117static ssize_t demote_store(struct kobject *kobj,
4118 struct kobj_attribute *attr, const char *buf, size_t len)
4119{
4120 unsigned long nr_demote;
4121 unsigned long nr_available;
4122 nodemask_t nodes_allowed, *n_mask;
4123 struct hstate *h;
4124 int err;
4125 int nid;
4126
4127 err = kstrtoul(buf, 10, &nr_demote);
4128 if (err)
4129 return err;
4130 h = kobj_to_hstate(kobj, &nid);
4131
4132 if (nid != NUMA_NO_NODE) {
4133 init_nodemask_of_node(&nodes_allowed, nid);
4134 n_mask = &nodes_allowed;
4135 } else {
4136 n_mask = &node_states[N_MEMORY];
4137 }
4138
4139 /* Synchronize with other sysfs operations modifying huge pages */
4140 mutex_lock(&h->resize_lock);
4141 spin_lock_irq(&hugetlb_lock);
4142
4143 while (nr_demote) {
4144 long rc;
4145
4146 /*
4147 * Check for available pages to demote each time through the
4148 * loop as demote_pool_huge_page will drop hugetlb_lock.
4149 */
4150 if (nid != NUMA_NO_NODE)
4151 nr_available = h->free_huge_pages_node[nid];
4152 else
4153 nr_available = h->free_huge_pages;
4154 nr_available -= h->resv_huge_pages;
4155 if (!nr_available)
4156 break;
4157
4158 rc = demote_pool_huge_page(h, n_mask, nr_demote);
4159 if (rc < 0) {
4160 err = rc;
4161 break;
4162 }
4163
4164 nr_demote -= rc;
4165 }
4166
4167 spin_unlock_irq(&hugetlb_lock);
4168 mutex_unlock(&h->resize_lock);
4169
4170 if (err)
4171 return err;
4172 return len;
4173}
4174HSTATE_ATTR_WO(demote);
4175
4176static ssize_t demote_size_show(struct kobject *kobj,
4177 struct kobj_attribute *attr, char *buf)
4178{
4179 struct hstate *h = kobj_to_hstate(kobj, NULL);
4180 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
4181
4182 return sysfs_emit(buf, "%lukB\n", demote_size);
4183}
4184
4185static ssize_t demote_size_store(struct kobject *kobj,
4186 struct kobj_attribute *attr,
4187 const char *buf, size_t count)
4188{
4189 struct hstate *h, *demote_hstate;
4190 unsigned long demote_size;
4191 unsigned int demote_order;
4192
4193 demote_size = (unsigned long)memparse(buf, NULL);
4194
4195 demote_hstate = size_to_hstate(demote_size);
4196 if (!demote_hstate)
4197 return -EINVAL;
4198 demote_order = demote_hstate->order;
4199 if (demote_order < HUGETLB_PAGE_ORDER)
4200 return -EINVAL;
4201
4202 /* demote order must be smaller than hstate order */
4203 h = kobj_to_hstate(kobj, NULL);
4204 if (demote_order >= h->order)
4205 return -EINVAL;
4206
4207 /* resize_lock synchronizes access to demote size and writes */
4208 mutex_lock(&h->resize_lock);
4209 h->demote_order = demote_order;
4210 mutex_unlock(&h->resize_lock);
4211
4212 return count;
4213}
4214HSTATE_ATTR(demote_size);
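/*
 * Example usage (illustrative): demote two free 1 GiB pages into 2 MiB pages:
 *   echo 2MB > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
 *   echo 2 > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote
 * which yields 1024 2 MiB pages on x86-64.
 */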
4215
4216static struct attribute *hstate_attrs[] = {
4217 &nr_hugepages_attr.attr,
4218 &nr_overcommit_hugepages_attr.attr,
4219 &free_hugepages_attr.attr,
4220 &resv_hugepages_attr.attr,
4221 &surplus_hugepages_attr.attr,
4222#ifdef CONFIG_NUMA
4223 &nr_hugepages_mempolicy_attr.attr,
4224#endif
4225 NULL,
4226};
4227
4228static const struct attribute_group hstate_attr_group = {
4229 .attrs = hstate_attrs,
4230};
4231
4232static struct attribute *hstate_demote_attrs[] = {
4233 &demote_size_attr.attr,
4234 &demote_attr.attr,
4235 NULL,
4236};
4237
4238static const struct attribute_group hstate_demote_attr_group = {
4239 .attrs = hstate_demote_attrs,
4240};
4241
4242static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
4243 struct kobject **hstate_kobjs,
4244 const struct attribute_group *hstate_attr_group)
4245{
4246 int retval;
4247 int hi = hstate_index(h);
4248
4249 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
4250 if (!hstate_kobjs[hi])
4251 return -ENOMEM;
4252
4253 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
4254 if (retval) {
4255 kobject_put(hstate_kobjs[hi]);
4256 hstate_kobjs[hi] = NULL;
4257 return retval;
4258 }
4259
4260 if (h->demote_order) {
4261 retval = sysfs_create_group(hstate_kobjs[hi],
4262 &hstate_demote_attr_group);
4263 if (retval) {
4264 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
4265 sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
4266 kobject_put(hstate_kobjs[hi]);
4267 hstate_kobjs[hi] = NULL;
4268 return retval;
4269 }
4270 }
4271
4272 return 0;
4273}
4274
4275#ifdef CONFIG_NUMA
4276static bool hugetlb_sysfs_initialized __ro_after_init;
4277
4278/*
4279 * node_hstate/s - associate per node hstate attributes, via their kobjects,
4280 * with node devices in node_devices[] using a parallel array. The array
4281 * index of a node device or node_hstate equals the node id.
4282 * This is here to avoid any static dependency of the node device driver, in
4283 * the base kernel, on the hugetlb module.
4284 */
4285struct node_hstate {
4286 struct kobject *hugepages_kobj;
4287 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
4288};
4289static struct node_hstate node_hstates[MAX_NUMNODES];
4290
4291/*
4292 * A subset of global hstate attributes for node devices
4293 */
4294static struct attribute *per_node_hstate_attrs[] = {
4295 &nr_hugepages_attr.attr,
4296 &free_hugepages_attr.attr,
4297 &surplus_hugepages_attr.attr,
4298 NULL,
4299};
4300
4301static const struct attribute_group per_node_hstate_attr_group = {
4302 .attrs = per_node_hstate_attrs,
4303};
4304
4305/*
4306 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
4307 * Returns node id via non-NULL nidp.
4308 */
4309static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4310{
4311 int nid;
4312
4313 for (nid = 0; nid < nr_node_ids; nid++) {
4314 struct node_hstate *nhs = &node_hstates[nid];
4315 int i;
4316 for (i = 0; i < HUGE_MAX_HSTATE; i++)
4317 if (nhs->hstate_kobjs[i] == kobj) {
4318 if (nidp)
4319 *nidp = nid;
4320 return &hstates[i];
4321 }
4322 }
4323
4324 BUG();
4325 return NULL;
4326}
4327
4328/*
4329 * Unregister hstate attributes from a single node device.
4330 * No-op if no hstate attributes attached.
4331 */
4332void hugetlb_unregister_node(struct node *node)
4333{
4334 struct hstate *h;
4335 struct node_hstate *nhs = &node_hstates[node->dev.id];
4336
4337 if (!nhs->hugepages_kobj)
4338 return; /* no hstate attributes */
4339
4340 for_each_hstate(h) {
4341 int idx = hstate_index(h);
4342 struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
4343
4344 if (!hstate_kobj)
4345 continue;
4346 if (h->demote_order)
4347 sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
4348 sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
4349 kobject_put(hstate_kobj);
4350 nhs->hstate_kobjs[idx] = NULL;
4351 }
4352
4353 kobject_put(nhs->hugepages_kobj);
4354 nhs->hugepages_kobj = NULL;
4355}
4356
4357
4358/*
4359 * Register hstate attributes for a single node device.
4360 * No-op if attributes already registered.
4361 */
4362void hugetlb_register_node(struct node *node)
4363{
4364 struct hstate *h;
4365 struct node_hstate *nhs = &node_hstates[node->dev.id];
4366 int err;
4367
4368 if (!hugetlb_sysfs_initialized)
4369 return;
4370
4371 if (nhs->hugepages_kobj)
4372 return; /* already allocated */
4373
4374 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
4375 &node->dev.kobj);
4376 if (!nhs->hugepages_kobj)
4377 return;
4378
4379 for_each_hstate(h) {
4380 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
4381 nhs->hstate_kobjs,
4382 &per_node_hstate_attr_group);
4383 if (err) {
4384 pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
4385 h->name, node->dev.id);
4386 hugetlb_unregister_node(node);
4387 break;
4388 }
4389 }
4390}
4391
4392/*
4393 * hugetlb init time: register hstate attributes for all registered node
4394 * devices of nodes that have memory. All on-line nodes should have
4395 * registered their associated device by this time.
4396 */
4397static void __init hugetlb_register_all_nodes(void)
4398{
4399 int nid;
4400
4401 for_each_online_node(nid)
4402 hugetlb_register_node(node_devices[nid]);
4403}
4404#else /* !CONFIG_NUMA */
4405
4406static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4407{
4408 BUG();
4409 if (nidp)
4410 *nidp = -1;
4411 return NULL;
4412}
4413
4414static void hugetlb_register_all_nodes(void) { }
4415
4416#endif
4417
4418#ifdef CONFIG_CMA
4419static void __init hugetlb_cma_check(void);
4420#else
4421static inline __init void hugetlb_cma_check(void)
4422{
4423}
4424#endif
4425
4426static void __init hugetlb_sysfs_init(void)
4427{
4428 struct hstate *h;
4429 int err;
4430
4431 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
4432 if (!hugepages_kobj)
4433 return;
4434
4435 for_each_hstate(h) {
4436 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
4437 hstate_kobjs, &hstate_attr_group);
4438 if (err)
4439 pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
4440 }
4441
4442#ifdef CONFIG_NUMA
4443 hugetlb_sysfs_initialized = true;
4444#endif
4445 hugetlb_register_all_nodes();
4446}
4447
4448#ifdef CONFIG_SYSCTL
4449static void hugetlb_sysctl_init(void);
4450#else
4451static inline void hugetlb_sysctl_init(void) { }
4452#endif
4453
4454static int __init hugetlb_init(void)
4455{
4456 int i;
4457
4458 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4459 __NR_HPAGEFLAGS);
4460
4461 if (!hugepages_supported()) {
4462 if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4463 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4464 return 0;
4465 }
4466
4467 /*
4468 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
4469 * architectures depend on setup being done here.
4470 */
4471 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4472 if (!parsed_default_hugepagesz) {
4473 /*
4474 * If we did not parse a default huge page size, set
4475 * default_hstate_idx to HPAGE_SIZE hstate. And, if the
4476 * number of huge pages for this default size was implicitly
4477 * specified, set that here as well.
4478 * Note that the implicit setting will overwrite an explicit
4479 * setting. A warning will be printed in this case.
4480 */
4481 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4482 if (default_hstate_max_huge_pages) {
4483 if (default_hstate.max_huge_pages) {
4484 char buf[32];
4485
4486 string_get_size(huge_page_size(&default_hstate),
4487 1, STRING_UNITS_2, buf, 32);
4488 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4489 default_hstate.max_huge_pages, buf);
4490 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4491 default_hstate_max_huge_pages);
4492 }
4493 default_hstate.max_huge_pages =
4494 default_hstate_max_huge_pages;
4495
4496 for_each_online_node(i)
4497 default_hstate.max_huge_pages_node[i] =
4498 default_hugepages_in_node[i];
4499 }
4500 }
4501
4502 hugetlb_cma_check();
4503 hugetlb_init_hstates();
4504 gather_bootmem_prealloc();
4505 report_hugepages();
4506
4507 hugetlb_sysfs_init();
4508 hugetlb_cgroup_file_init();
4509 hugetlb_sysctl_init();
4510
4511#ifdef CONFIG_SMP
4512 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4513#else
4514 num_fault_mutexes = 1;
4515#endif
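	/*
	 * e.g. with 16 possible CPUs the SMP case above yields
	 * roundup_pow_of_two(8 * 16) = 128 fault mutexes.
	 */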
4516 hugetlb_fault_mutex_table =
4517 kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4518 GFP_KERNEL);
4519 BUG_ON(!hugetlb_fault_mutex_table);
4520
4521 for (i = 0; i < num_fault_mutexes; i++)
4522 mutex_init(&hugetlb_fault_mutex_table[i]);
4523 return 0;
4524}
4525subsys_initcall(hugetlb_init);
4526
4527/* Overwritten by architectures with more huge page sizes */
4528bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4529{
4530 return size == HPAGE_SIZE;
4531}
4532
4533void __init hugetlb_add_hstate(unsigned int order)
4534{
4535 struct hstate *h;
4536 unsigned long i;
4537
4538 if (size_to_hstate(PAGE_SIZE << order)) {
4539 return;
4540 }
4541 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4542 BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
4543 h = &hstates[hugetlb_max_hstate++];
4544 __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);
4545 h->order = order;
4546 h->mask = ~(huge_page_size(h) - 1);
4547 for (i = 0; i < MAX_NUMNODES; ++i)
4548 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4549 INIT_LIST_HEAD(&h->hugepage_activelist);
4550 h->next_nid_to_alloc = first_memory_node;
4551 h->next_nid_to_free = first_memory_node;
4552 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4553 huge_page_size(h)/SZ_1K);
4554
4555 parsed_hstate = h;
4556}
4557
4558bool __init __weak hugetlb_node_alloc_supported(void)
4559{
4560 return true;
4561}
4562
4563static void __init hugepages_clear_pages_in_node(void)
4564{
4565 if (!hugetlb_max_hstate) {
4566 default_hstate_max_huge_pages = 0;
4567 memset(default_hugepages_in_node, 0,
4568 sizeof(default_hugepages_in_node));
4569 } else {
4570 parsed_hstate->max_huge_pages = 0;
4571 memset(parsed_hstate->max_huge_pages_node, 0,
4572 sizeof(parsed_hstate->max_huge_pages_node));
4573 }
4574}
4575
4576/*
4577 * hugepages command line processing
4578 * hugepages normally follows a valid hugepagesz or default_hugepagesz
4579 * specification. If not, ignore the hugepages value. hugepages can also
4580 * be the first huge page command line option in which case it implicitly
4581 * specifies the number of huge pages for the default size.
4582 */
4583static int __init hugepages_setup(char *s)
4584{
4585 unsigned long *mhp;
4586 static unsigned long *last_mhp;
4587 int node = NUMA_NO_NODE;
4588 int count;
4589 unsigned long tmp;
4590 char *p = s;
4591
4592 if (!parsed_valid_hugepagesz) {
4593 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4594 parsed_valid_hugepagesz = true;
4595 return 1;
4596 }
4597
4598 /*
4599 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
4600 * yet, so this hugepages= parameter goes to the "default hstate".
4601 * Otherwise, it goes with the previously parsed hugepagesz or
4602 * default_hugepagesz.
4603 */
4604 else if (!hugetlb_max_hstate)
4605 mhp = &default_hstate_max_huge_pages;
4606 else
4607 mhp = &parsed_hstate->max_huge_pages;
4608
4609 if (mhp == last_mhp) {
4610 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4611 return 1;
4612 }
4613
4614 while (*p) {
4615 count = 0;
4616 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4617 goto invalid;
4618 /* Parameter is node format */
4619 if (p[count] == ':') {
4620 if (!hugetlb_node_alloc_supported()) {
4621 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4622 return 1;
4623 }
4624 if (tmp >= MAX_NUMNODES || !node_online(tmp))
4625 goto invalid;
4626 node = array_index_nospec(tmp, MAX_NUMNODES);
4627 p += count + 1;
4628 /* Parse hugepages */
4629 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4630 goto invalid;
4631 if (!hugetlb_max_hstate)
4632 default_hugepages_in_node[node] = tmp;
4633 else
4634 parsed_hstate->max_huge_pages_node[node] = tmp;
4635 *mhp += tmp;
4636 /* Go on to parse the next node */
4637 if (p[count] == ',')
4638 p += count + 1;
4639 else
4640 break;
4641 } else {
4642 if (p != s)
4643 goto invalid;
4644 *mhp = tmp;
4645 break;
4646 }
4647 }
4648
4649 /*
4650 * Global state is always initialized later in hugetlb_init.
4651 * But we need to allocate gigantic hstates here early to still
4652 * use the bootmem allocator.
4653 */
4654 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
4655 hugetlb_hstate_alloc_pages(parsed_hstate);
4656
4657 last_mhp = mhp;
4658
4659 return 1;
4660
4661invalid:
4662 pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4663 hugepages_clear_pages_in_node();
4664 return 1;
4665}
4666__setup("hugepages=", hugepages_setup);
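/*
 * Example command lines (illustrative):
 *   hugepages=1024                       - 1024 default-sized pages
 *   hugepagesz=1G hugepages=4            - four 1 GiB pages
 *   hugepagesz=2M hugepages=0:512,1:512  - 512 2 MiB pages on each of nodes 0 and 1
 */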
4667
4668/*
4669 * hugepagesz command line processing
4670 * A specific huge page size can only be specified once with hugepagesz.
4671 * hugepagesz is followed by hugepages on the command line. The global
4672 * variable 'parsed_valid_hugepagesz' is used to determine if prior
4673 * hugepagesz argument was valid.
4674 */
4675static int __init hugepagesz_setup(char *s)
4676{
4677 unsigned long size;
4678 struct hstate *h;
4679
4680 parsed_valid_hugepagesz = false;
4681 size = (unsigned long)memparse(s, NULL);
4682
4683 if (!arch_hugetlb_valid_size(size)) {
4684 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4685 return 1;
4686 }
4687
4688 h = size_to_hstate(size);
4689 if (h) {
4690 /*
4691 * hstate for this size already exists. This is normally
4692 * an error, but is allowed if the existing hstate is the
4693 * default hstate. More specifically, it is only allowed if
4694 * the number of huge pages for the default hstate was not
4695 * previously specified.
4696 */
4697 if (!parsed_default_hugepagesz || h != &default_hstate ||
4698 default_hstate.max_huge_pages) {
4699 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4700 return 1;
4701 }
4702
4703 /*
4704 * No need to call hugetlb_add_hstate() as hstate already
4705 * exists. But, do set parsed_hstate so that a following
4706 * hugepages= parameter will be applied to this hstate.
4707 */
4708 parsed_hstate = h;
4709 parsed_valid_hugepagesz = true;
4710 return 1;
4711 }
4712
4713 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4714 parsed_valid_hugepagesz = true;
4715 return 1;
4716}
4717__setup("hugepagesz=", hugepagesz_setup);
4718
4719/*
4720 * default_hugepagesz command line input
4721 * Only one instance of default_hugepagesz allowed on command line.
4722 */
4723static int __init default_hugepagesz_setup(char *s)
4724{
4725 unsigned long size;
4726 int i;
4727
4728 parsed_valid_hugepagesz = false;
4729 if (parsed_default_hugepagesz) {
4730 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4731 return 1;
4732 }
4733
4734 size = (unsigned long)memparse(s, NULL);
4735
4736 if (!arch_hugetlb_valid_size(size)) {
4737 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4738 return 1;
4739 }
4740
4741 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4742 parsed_valid_hugepagesz = true;
4743 parsed_default_hugepagesz = true;
4744 default_hstate_idx = hstate_index(size_to_hstate(size));
4745
4746 /*
4747 * The number of default huge pages (for this size) could have been
4748 * specified as the first hugetlb parameter: hugepages=X. If so,
4749 * then default_hstate_max_huge_pages is set. If the default huge
4750 * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be
4751 * allocated here from bootmem allocator.
4752 */
4753 if (default_hstate_max_huge_pages) {
4754 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4755 for_each_online_node(i)
4756 default_hstate.max_huge_pages_node[i] =
4757 default_hugepages_in_node[i];
4758 if (hstate_is_gigantic(&default_hstate))
4759 hugetlb_hstate_alloc_pages(&default_hstate);
4760 default_hstate_max_huge_pages = 0;
4761 }
4762
4763 return 1;
4764}
4765__setup("default_hugepagesz=", default_hugepagesz_setup);
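/*
 * Example (illustrative): "hugepages=2 default_hugepagesz=1G" is accepted even
 * though hugepages= comes first; the two boot-time 1 GiB pages are allocated
 * here once the default size is known to be gigantic.
 */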
4766
4767static unsigned int allowed_mems_nr(struct hstate *h)
4768{
4769 int node;
4770 unsigned int nr = 0;
4771 nodemask_t *mbind_nodemask;
4772 unsigned int *array = h->free_huge_pages_node;
4773 gfp_t gfp_mask = htlb_alloc_mask(h);
4774
4775 mbind_nodemask = policy_mbind_nodemask(gfp_mask);
4776 for_each_node_mask(node, cpuset_current_mems_allowed) {
4777 if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
4778 nr += array[node];
4779 }
4780
4781 return nr;
4782}
4783
4784#ifdef CONFIG_SYSCTL
4785static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
4786 void *buffer, size_t *length,
4787 loff_t *ppos, unsigned long *out)
4788{
4789 struct ctl_table dup_table;
4790
4791 /*
4792 * In order to avoid races with __do_proc_doulongvec_minmax(), we
4793 * duplicate @table and alter only the duplicate.
4794 */
4795 dup_table = *table;
4796 dup_table.data = out;
4797
4798 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4799}
4800
4801static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4802 const struct ctl_table *table, int write,
4803 void *buffer, size_t *length, loff_t *ppos)
4804{
4805 struct hstate *h = &default_hstate;
4806 unsigned long tmp = h->max_huge_pages;
4807 int ret;
4808
4809 if (!hugepages_supported())
4810 return -EOPNOTSUPP;
4811
4812 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4813 &tmp);
4814 if (ret)
4815 goto out;
4816
4817 if (write)
4818 ret = __nr_hugepages_store_common(obey_mempolicy, h,
4819 NUMA_NO_NODE, tmp, *length);
4820out:
4821 return ret;
4822}
4823
4824static int hugetlb_sysctl_handler(const struct ctl_table *table, int write,
4825 void *buffer, size_t *length, loff_t *ppos)
4826{
4827
4828 return hugetlb_sysctl_handler_common(false, table, write,
4829 buffer, length, ppos);
4830}
4831
4832#ifdef CONFIG_NUMA
4833static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write,
4834 void *buffer, size_t *length, loff_t *ppos)
4835{
4836 return hugetlb_sysctl_handler_common(true, table, write,
4837 buffer, length, ppos);
4838}
4839#endif /* CONFIG_NUMA */
4840
4841static int hugetlb_overcommit_handler(const struct ctl_table *table, int write,
4842 void *buffer, size_t *length, loff_t *ppos)
4843{
4844 struct hstate *h = &default_hstate;
4845 unsigned long tmp;
4846 int ret;
4847
4848 if (!hugepages_supported())
4849 return -EOPNOTSUPP;
4850
4851 tmp = h->nr_overcommit_huge_pages;
4852
4853 if (write && hstate_is_gigantic(h))
4854 return -EINVAL;
4855
4856 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4857 &tmp);
4858 if (ret)
4859 goto out;
4860
4861 if (write) {
4862 spin_lock_irq(&hugetlb_lock);
4863 h->nr_overcommit_huge_pages = tmp;
4864 spin_unlock_irq(&hugetlb_lock);
4865 }
4866out:
4867 return ret;
4868}
4869
4870static const struct ctl_table hugetlb_table[] = {
4871 {
4872 .procname = "nr_hugepages",
4873 .data = NULL,
4874 .maxlen = sizeof(unsigned long),
4875 .mode = 0644,
4876 .proc_handler = hugetlb_sysctl_handler,
4877 },
4878#ifdef CONFIG_NUMA
4879 {
4880 .procname = "nr_hugepages_mempolicy",
4881 .data = NULL,
4882 .maxlen = sizeof(unsigned long),
4883 .mode = 0644,
4884 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
4885 },
4886#endif
4887 {
4888 .procname = "hugetlb_shm_group",
4889 .data = &sysctl_hugetlb_shm_group,
4890 .maxlen = sizeof(gid_t),
4891 .mode = 0644,
4892 .proc_handler = proc_dointvec,
4893 },
4894 {
4895 .procname = "nr_overcommit_hugepages",
4896 .data = NULL,
4897 .maxlen = sizeof(unsigned long),
4898 .mode = 0644,
4899 .proc_handler = hugetlb_overcommit_handler,
4900 },
4901};
4902
4903static void hugetlb_sysctl_init(void)
4904{
4905 register_sysctl_init("vm", hugetlb_table);
4906}
4907#endif /* CONFIG_SYSCTL */
4908
4909void hugetlb_report_meminfo(struct seq_file *m)
4910{
4911 struct hstate *h;
4912 unsigned long total = 0;
4913
4914 if (!hugepages_supported())
4915 return;
4916
4917 for_each_hstate(h) {
4918 unsigned long count = h->nr_huge_pages;
4919
4920 total += huge_page_size(h) * count;
4921
4922 if (h == &default_hstate)
4923 seq_printf(m,
4924 "HugePages_Total: %5lu\n"
4925 "HugePages_Free: %5lu\n"
4926 "HugePages_Rsvd: %5lu\n"
4927 "HugePages_Surp: %5lu\n"
4928 "Hugepagesize: %8lu kB\n",
4929 count,
4930 h->free_huge_pages,
4931 h->resv_huge_pages,
4932 h->surplus_huge_pages,
4933 huge_page_size(h) / SZ_1K);
4934 }
4935
4936 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
4937}
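/*
 * Illustrative /proc/meminfo output for the format above (512 free 2 MiB
 * pages, no reservations or surplus):
 *   HugePages_Total:     512
 *   HugePages_Free:      512
 *   HugePages_Rsvd:        0
 *   HugePages_Surp:        0
 *   Hugepagesize:       2048 kB
 *   Hugetlb:         1048576 kB
 */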
4938
4939int hugetlb_report_node_meminfo(char *buf, int len, int nid)
4940{
4941 struct hstate *h = &default_hstate;
4942
4943 if (!hugepages_supported())
4944 return 0;
4945
4946 return sysfs_emit_at(buf, len,
4947 "Node %d HugePages_Total: %5u\n"
4948 "Node %d HugePages_Free: %5u\n"
4949 "Node %d HugePages_Surp: %5u\n",
4950 nid, h->nr_huge_pages_node[nid],
4951 nid, h->free_huge_pages_node[nid],
4952 nid, h->surplus_huge_pages_node[nid]);
4953}
4954
4955void hugetlb_show_meminfo_node(int nid)
4956{
4957 struct hstate *h;
4958
4959 if (!hugepages_supported())
4960 return;
4961
4962 for_each_hstate(h)
4963 printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
4964 nid,
4965 h->nr_huge_pages_node[nid],
4966 h->free_huge_pages_node[nid],
4967 h->surplus_huge_pages_node[nid],
4968 huge_page_size(h) / SZ_1K);
4969}
4970
4971void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
4972{
4973 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
4974 K(atomic_long_read(&mm->hugetlb_usage)));
4975}
4976
4977/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
4978unsigned long hugetlb_total_pages(void)
4979{
4980 struct hstate *h;
4981 unsigned long nr_total_pages = 0;
4982
4983 for_each_hstate(h)
4984 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4985 return nr_total_pages;
4986}
4987
4988static int hugetlb_acct_memory(struct hstate *h, long delta)
4989{
4990 int ret = -ENOMEM;
4991
4992 if (!delta)
4993 return 0;
4994
4995 spin_lock_irq(&hugetlb_lock);
4996 /*
4997 * When cpuset is configured, it breaks the strict hugetlb page
4998 * reservation as the accounting is done on a global variable. Such
4999 * reservation is completely rubbish in the presence of cpuset because
5000 * the reservation is not checked against page availability for the
5001 * current cpuset. An application can still potentially be OOM'ed by
5002 * the kernel for lack of free hugetlb pages in the cpuset the task is in.
5003 * Attempting to enforce strict accounting with cpuset is almost
5004 * impossible (or too ugly) because cpusets are so fluid that a
5005 * task or memory node can be dynamically moved between cpusets.
5006 *
5007 * The change of semantics for shared hugetlb mapping with cpuset is
5008 * undesirable. However, in order to preserve some of the semantics,
5009 * we fall back to check against current free page availability as
5010 * a best attempt and hopefully to minimize the impact of changing
5011 * semantics that cpuset has.
5012 *
5013 * Apart from cpuset, the memory policy mechanism also determines
5014 * from which node the kernel will allocate memory in a NUMA
5015 * system. So, similarly to cpuset, we should also take the memory
5016 * policy of the current task into account, for the same reasons
5017 * described above.
5018 */
5019 if (delta > 0) {
5020 if (gather_surplus_pages(h, delta) < 0)
5021 goto out;
5022
5023 if (delta > allowed_mems_nr(h)) {
5024 return_unused_surplus_pages(h, delta);
5025 goto out;
5026 }
5027 }
5028
5029 ret = 0;
5030 if (delta < 0)
5031 return_unused_surplus_pages(h, (unsigned long) -delta);
5032
5033out:
5034 spin_unlock_irq(&hugetlb_lock);
5035 return ret;
5036}
5037
5038static void hugetlb_vm_op_open(struct vm_area_struct *vma)
5039{
5040 struct resv_map *resv = vma_resv_map(vma);
5041
5042 /*
5043 * HPAGE_RESV_OWNER indicates a private mapping.
5044 * This new VMA should share its sibling's reservation map if present.
5045 * The VMA will only ever have a valid reservation map pointer where
5046 * it is being copied for another still existing VMA. As that VMA
5047 * has a reference to the reservation map it cannot disappear until
5048 * after this open call completes. It is therefore safe to take a
5049 * new reference here without additional locking.
5050 */
5051 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
5052 resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
5053 kref_get(&resv->refs);
5054 }
5055
5056 /*
5057 * vma_lock structure for sharable mappings is vma specific.
5058 * Clear old pointer (if copied via vm_area_dup) and allocate
5059 * new structure. Before clearing, make sure vma_lock is not
5060 * for this vma.
5061 */
5062 if (vma->vm_flags & VM_MAYSHARE) {
5063 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
5064
5065 if (vma_lock) {
5066 if (vma_lock->vma != vma) {
5067 vma->vm_private_data = NULL;
5068 hugetlb_vma_lock_alloc(vma);
5069 } else
5070 pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
5071 } else
5072 hugetlb_vma_lock_alloc(vma);
5073 }
5074}
5075
5076static void hugetlb_vm_op_close(struct vm_area_struct *vma)
5077{
5078 struct hstate *h = hstate_vma(vma);
5079 struct resv_map *resv;
5080 struct hugepage_subpool *spool = subpool_vma(vma);
5081 unsigned long reserve, start, end;
5082 long gbl_reserve;
5083
5084 hugetlb_vma_lock_free(vma);
5085
5086 resv = vma_resv_map(vma);
5087 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
5088 return;
5089
5090 start = vma_hugecache_offset(h, vma, vma->vm_start);
5091 end = vma_hugecache_offset(h, vma, vma->vm_end);
5092
5093 reserve = (end - start) - region_count(resv, start, end);
5094 hugetlb_cgroup_uncharge_counter(resv, start, end);
5095 if (reserve) {
5096 /*
5097 * Decrement reserve counts. The global reserve count may be
5098 * adjusted if the subpool has a minimum size.
5099 */
5100 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
5101 hugetlb_acct_memory(h, -gbl_reserve);
5102 }
5103
5104 kref_put(&resv->refs, resv_map_release);
5105}
5106
5107static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
5108{
5109 if (addr & ~(huge_page_mask(hstate_vma(vma))))
5110 return -EINVAL;
5111
5112 /*
5113 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
5114 * in HugeTLB VMAs. If this split would lose PUD_SIZE alignment,
5115 * unshare PMDs in the PUD_SIZE interval surrounding addr now.
5116 */
5117 if (addr & ~PUD_MASK) {
5118 /*
5119 * hugetlb_vm_op_split is called right before we attempt to
5120 * split the VMA. We will need to unshare PMDs in the old and
5121 * new VMAs, so let's unshare before we split.
5122 */
5123 unsigned long floor = addr & PUD_MASK;
5124 unsigned long ceil = floor + PUD_SIZE;
5125
5126 if (floor >= vma->vm_start && ceil <= vma->vm_end)
5127 hugetlb_unshare_pmds(vma, floor, ceil);
5128 }
5129
5130 return 0;
5131}
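/*
 * e.g. on x86-64 (PUD_SIZE = 1 GiB), splitting a VMA at a 2 MiB-aligned but
 * not 1 GiB-aligned address first unshares PMDs in the surrounding 1 GiB
 * window, provided that window lies entirely within the VMA.
 */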
5132
5133static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
5134{
5135 return huge_page_size(hstate_vma(vma));
5136}
5137
5138/*
5139 * We cannot handle pagefaults against hugetlb pages at all. They cause
5140 * handle_mm_fault() to try to instantiate regular-sized pages in the
5141 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
5142 * this far.
5143 */
5144static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
5145{
5146 BUG();
5147 return 0;
5148}
5149
5150/*
5151 * When a new function is introduced to vm_operations_struct and added
5152 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
5153 * This is because, under the System V memory model, mappings created via
5154 * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
5155 * and their original vm_ops are overwritten with shm_vm_ops.
5156 */
5157const struct vm_operations_struct hugetlb_vm_ops = {
5158 .fault = hugetlb_vm_op_fault,
5159 .open = hugetlb_vm_op_open,
5160 .close = hugetlb_vm_op_close,
5161 .may_split = hugetlb_vm_op_split,
5162 .pagesize = hugetlb_vm_op_pagesize,
5163};
5164
5165static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
5166 bool try_mkwrite)
5167{
5168 pte_t entry;
5169 unsigned int shift = huge_page_shift(hstate_vma(vma));
5170
5171 if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
5172 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
5173 vma->vm_page_prot)));
5174 } else {
5175 entry = huge_pte_wrprotect(mk_huge_pte(page,
5176 vma->vm_page_prot));
5177 }
5178 entry = pte_mkyoung(entry);
5179 entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
5180
5181 return entry;
5182}
5183
5184static void set_huge_ptep_writable(struct vm_area_struct *vma,
5185 unsigned long address, pte_t *ptep)
5186{
5187 pte_t entry;
5188
5189 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep)));
5190 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
5191 update_mmu_cache(vma, address, ptep);
5192}
5193
5194static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
5195 unsigned long address, pte_t *ptep)
5196{
5197 if (vma->vm_flags & VM_WRITE)
5198 set_huge_ptep_writable(vma, address, ptep);
5199}
5200
5201bool is_hugetlb_entry_migration(pte_t pte)
5202{
5203 swp_entry_t swp;
5204
5205 if (huge_pte_none(pte) || pte_present(pte))
5206 return false;
5207 swp = pte_to_swp_entry(pte);
5208 if (is_migration_entry(swp))
5209 return true;
5210 else
5211 return false;
5212}
5213
5214bool is_hugetlb_entry_hwpoisoned(pte_t pte)
5215{
5216 swp_entry_t swp;
5217
5218 if (huge_pte_none(pte) || pte_present(pte))
5219 return false;
5220 swp = pte_to_swp_entry(pte);
5221 if (is_hwpoison_entry(swp))
5222 return true;
5223 else
5224 return false;
5225}
5226
5227static void
5228hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
5229 struct folio *new_folio, pte_t old, unsigned long sz)
5230{
5231 pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
5232
5233 __folio_mark_uptodate(new_folio);
5234 hugetlb_add_new_anon_rmap(new_folio, vma, addr);
5235 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
5236 newpte = huge_pte_mkuffd_wp(newpte);
5237 set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
5238 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
5239 folio_set_hugetlb_migratable(new_folio);
5240}
5241
5242int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
5243 struct vm_area_struct *dst_vma,
5244 struct vm_area_struct *src_vma)
5245{
5246 pte_t *src_pte, *dst_pte, entry;
5247 struct folio *pte_folio;
5248 unsigned long addr;
5249 bool cow = is_cow_mapping(src_vma->vm_flags);
5250 struct hstate *h = hstate_vma(src_vma);
5251 unsigned long sz = huge_page_size(h);
5252 unsigned long npages = pages_per_huge_page(h);
5253 struct mmu_notifier_range range;
5254 unsigned long last_addr_mask;
5255 int ret = 0;
5256
5257 if (cow) {
5258 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
5259 src_vma->vm_start,
5260 src_vma->vm_end);
5261 mmu_notifier_invalidate_range_start(&range);
5262 vma_assert_write_locked(src_vma);
5263 raw_write_seqcount_begin(&src->write_protect_seq);
5264 } else {
5265 /*
5266 * For shared mappings the vma lock must be held before
5267 * calling hugetlb_walk() in the src vma. Otherwise, the
5268 * returned ptep could go away if part of a shared pmd and
5269 * another thread calls huge_pmd_unshare.
5270 */
5271 hugetlb_vma_lock_read(src_vma);
5272 }
5273
5274 last_addr_mask = hugetlb_mask_last_page(h);
5275 for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
5276 spinlock_t *src_ptl, *dst_ptl;
5277 src_pte = hugetlb_walk(src_vma, addr, sz);
5278 if (!src_pte) {
5279 addr |= last_addr_mask;
5280 continue;
5281 }
5282 dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
5283 if (!dst_pte) {
5284 ret = -ENOMEM;
5285 break;
5286 }
5287
5288 /*
5289 * If the pagetables are shared, don't copy or take references.
5290 *
5291 * dst_pte == src_pte is the common case of src/dest sharing.
5292 * However, src could have 'unshared' and dst shares with
5293 * another vma. So page_count of ptep page is checked instead
5294 * to reliably determine whether pte is shared.
5295 */
5296 if (page_count(virt_to_page(dst_pte)) > 1) {
5297 addr |= last_addr_mask;
5298 continue;
5299 }
5300
5301 dst_ptl = huge_pte_lock(h, dst, dst_pte);
5302 src_ptl = huge_pte_lockptr(h, src, src_pte);
5303 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5304 entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
5305again:
5306 if (huge_pte_none(entry)) {
5307 /*
5308 * Skip if src entry none.
5309 */
5310 ;
5311 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
5312 if (!userfaultfd_wp(dst_vma))
5313 entry = huge_pte_clear_uffd_wp(entry);
5314 set_huge_pte_at(dst, addr, dst_pte, entry, sz);
5315 } else if (unlikely(is_hugetlb_entry_migration(entry))) {
5316 swp_entry_t swp_entry = pte_to_swp_entry(entry);
5317 bool uffd_wp = pte_swp_uffd_wp(entry);
5318
5319 if (!is_readable_migration_entry(swp_entry) && cow) {
5320 /*
5321 * COW mappings require pages in both
5322 * parent and child to be set to read.
5323 */
5324 swp_entry = make_readable_migration_entry(
5325 swp_offset(swp_entry));
5326 entry = swp_entry_to_pte(swp_entry);
5327 if (userfaultfd_wp(src_vma) && uffd_wp)
5328 entry = pte_swp_mkuffd_wp(entry);
5329 set_huge_pte_at(src, addr, src_pte, entry, sz);
5330 }
5331 if (!userfaultfd_wp(dst_vma))
5332 entry = huge_pte_clear_uffd_wp(entry);
5333 set_huge_pte_at(dst, addr, dst_pte, entry, sz);
5334 } else if (unlikely(is_pte_marker(entry))) {
5335 pte_marker marker = copy_pte_marker(
5336 pte_to_swp_entry(entry), dst_vma);
5337
5338 if (marker)
5339 set_huge_pte_at(dst, addr, dst_pte,
5340 make_pte_marker(marker), sz);
5341 } else {
5342 entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
5343 pte_folio = page_folio(pte_page(entry));
5344 folio_get(pte_folio);
5345
5346 /*
5347 * Failing to duplicate the anon rmap is a rare case
5348 * where we see pinned hugetlb pages while they're
5349 * prone to COW. We need to do the COW earlier during
5350 * fork.
5351 *
5352 * When pre-allocating the page or copying data, we
5353 * need to be without the pgtable locks since we could
5354 * sleep during the process.
5355 */
5356 if (!folio_test_anon(pte_folio)) {
5357 hugetlb_add_file_rmap(pte_folio);
5358 } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) {
5359 pte_t src_pte_old = entry;
5360 struct folio *new_folio;
5361
5362 spin_unlock(src_ptl);
5363 spin_unlock(dst_ptl);
5364 /* Do not use reserve as it's private owned */
5365 new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
5366 if (IS_ERR(new_folio)) {
5367 folio_put(pte_folio);
5368 ret = PTR_ERR(new_folio);
5369 break;
5370 }
5371 ret = copy_user_large_folio(new_folio, pte_folio,
5372 addr, dst_vma);
5373 folio_put(pte_folio);
5374 if (ret) {
5375 folio_put(new_folio);
5376 break;
5377 }
5378
5379 /* Install the new hugetlb folio if src pte stable */
5380 dst_ptl = huge_pte_lock(h, dst, dst_pte);
5381 src_ptl = huge_pte_lockptr(h, src, src_pte);
5382 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5383 entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
5384 if (!pte_same(src_pte_old, entry)) {
5385 restore_reserve_on_error(h, dst_vma, addr,
5386 new_folio);
5387 folio_put(new_folio);
5388 /* huge_ptep of dst_pte won't change as in child */
5389 goto again;
5390 }
5391 hugetlb_install_folio(dst_vma, dst_pte, addr,
5392 new_folio, src_pte_old, sz);
5393 spin_unlock(src_ptl);
5394 spin_unlock(dst_ptl);
5395 continue;
5396 }
5397
5398 if (cow) {
5399 /*
5400 * No need to notify as we are downgrading page
5401 * table protection, not changing it to point
5402 * to a new page.
5403 *
5404 * See Documentation/mm/mmu_notifier.rst
5405 */
5406 huge_ptep_set_wrprotect(src, addr, src_pte);
5407 entry = huge_pte_wrprotect(entry);
5408 }
5409
5410 if (!userfaultfd_wp(dst_vma))
5411 entry = huge_pte_clear_uffd_wp(entry);
5412
5413 set_huge_pte_at(dst, addr, dst_pte, entry, sz);
5414 hugetlb_count_add(npages, dst);
5415 }
5416 spin_unlock(src_ptl);
5417 spin_unlock(dst_ptl);
5418 }
5419
5420 if (cow) {
5421 raw_write_seqcount_end(&src->write_protect_seq);
5422 mmu_notifier_invalidate_range_end(&range);
5423 } else {
5424 hugetlb_vma_unlock_read(src_vma);
5425 }
5426
5427 return ret;
5428}
5429
5430static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
5431 unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
5432 unsigned long sz)
5433{
5434 bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
5435 struct hstate *h = hstate_vma(vma);
5436 struct mm_struct *mm = vma->vm_mm;
5437 spinlock_t *src_ptl, *dst_ptl;
5438 pte_t pte;
5439
5440 dst_ptl = huge_pte_lock(h, mm, dst_pte);
5441 src_ptl = huge_pte_lockptr(h, mm, src_pte);
5442
5443 /*
5444 * We don't have to worry about the ordering of src and dst ptlocks
5445 * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
5446 */
5447 if (src_ptl != dst_ptl)
5448 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5449
5450 pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
5451
5452 if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
5453 huge_pte_clear(mm, new_addr, dst_pte, sz);
5454 else {
5455 if (need_clear_uffd_wp) {
5456 if (pte_present(pte))
5457 pte = huge_pte_clear_uffd_wp(pte);
5458 else if (is_swap_pte(pte))
5459 pte = pte_swp_clear_uffd_wp(pte);
5460 }
5461 set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
5462 }
5463
5464 if (src_ptl != dst_ptl)
5465 spin_unlock(src_ptl);
5466 spin_unlock(dst_ptl);
5467}
5468
5469int move_hugetlb_page_tables(struct vm_area_struct *vma,
5470 struct vm_area_struct *new_vma,
5471 unsigned long old_addr, unsigned long new_addr,
5472 unsigned long len)
5473{
5474 struct hstate *h = hstate_vma(vma);
5475 struct address_space *mapping = vma->vm_file->f_mapping;
5476 unsigned long sz = huge_page_size(h);
5477 struct mm_struct *mm = vma->vm_mm;
5478 unsigned long old_end = old_addr + len;
5479 unsigned long last_addr_mask;
5480 pte_t *src_pte, *dst_pte;
5481 struct mmu_notifier_range range;
5482 bool shared_pmd = false;
5483
5484 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
5485 old_end);
5486 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5487 /*
5488 * In case of shared PMDs, we should cover the maximum possible
5489 * range.
5490 */
5491 flush_cache_range(vma, range.start, range.end);
5492
5493 mmu_notifier_invalidate_range_start(&range);
5494 last_addr_mask = hugetlb_mask_last_page(h);
5495 /* Prevent race with file truncation */
5496 hugetlb_vma_lock_write(vma);
5497 i_mmap_lock_write(mapping);
5498 for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
5499 src_pte = hugetlb_walk(vma, old_addr, sz);
5500 if (!src_pte) {
5501 old_addr |= last_addr_mask;
5502 new_addr |= last_addr_mask;
5503 continue;
5504 }
5505 if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
5506 continue;
5507
5508 if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
5509 shared_pmd = true;
5510 old_addr |= last_addr_mask;
5511 new_addr |= last_addr_mask;
5512 continue;
5513 }
5514
5515 dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
5516 if (!dst_pte)
5517 break;
5518
5519 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
5520 }
5521
5522 if (shared_pmd)
5523 flush_hugetlb_tlb_range(vma, range.start, range.end);
5524 else
5525 flush_hugetlb_tlb_range(vma, old_end - len, old_end);
5526 mmu_notifier_invalidate_range_end(&range);
5527 i_mmap_unlock_write(mapping);
5528 hugetlb_vma_unlock_write(vma);
5529
5530 return len + old_addr - old_end;
5531}
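/*
 * Illustrative sketch of the return value (an assumption-laden example, not
 * taken from a caller): with 2MB huge pages and an 8MB move request, if
 * huge_pte_alloc() fails after two pages were moved, the loop breaks with
 * old_addr advanced by 4MB, so
 *
 *	len + old_addr - old_end == 8MB + (start + 4MB) - (start + 8MB) == 4MB
 *
 * i.e. the function reports how many bytes were actually relocated.
 */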
5532
5533void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
5534 unsigned long start, unsigned long end,
5535 struct page *ref_page, zap_flags_t zap_flags)
5536{
5537 struct mm_struct *mm = vma->vm_mm;
5538 unsigned long address;
5539 pte_t *ptep;
5540 pte_t pte;
5541 spinlock_t *ptl;
5542 struct page *page;
5543 struct hstate *h = hstate_vma(vma);
5544 unsigned long sz = huge_page_size(h);
5545 bool adjust_reservation = false;
5546 unsigned long last_addr_mask;
5547 bool force_flush = false;
5548
5549 WARN_ON(!is_vm_hugetlb_page(vma));
5550 BUG_ON(start & ~huge_page_mask(h));
5551 BUG_ON(end & ~huge_page_mask(h));
5552
5553 /*
5554	 * This is a hugetlb vma; all the pte entries should point
5555	 * to a huge page.
5556 */
5557 tlb_change_page_size(tlb, sz);
5558 tlb_start_vma(tlb, vma);
5559
5560 last_addr_mask = hugetlb_mask_last_page(h);
5561 address = start;
5562 for (; address < end; address += sz) {
5563 ptep = hugetlb_walk(vma, address, sz);
5564 if (!ptep) {
5565 address |= last_addr_mask;
5566 continue;
5567 }
5568
5569 ptl = huge_pte_lock(h, mm, ptep);
5570 if (huge_pmd_unshare(mm, vma, address, ptep)) {
5571 spin_unlock(ptl);
5572 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
5573 force_flush = true;
5574 address |= last_addr_mask;
5575 continue;
5576 }
5577
5578 pte = huge_ptep_get(mm, address, ptep);
5579 if (huge_pte_none(pte)) {
5580 spin_unlock(ptl);
5581 continue;
5582 }
5583
5584 /*
5585 * Migrating hugepage or HWPoisoned hugepage is already
5586 * unmapped and its refcount is dropped, so just clear pte here.
5587 */
5588 if (unlikely(!pte_present(pte))) {
5589 /*
5590 * If the pte was wr-protected by uffd-wp in any of the
5591			 * swap forms, and the caller does not want to
5592 * drop the uffd-wp bit in this zap, then replace the
5593 * pte with a marker.
5594 */
5595 if (pte_swp_uffd_wp_any(pte) &&
5596 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5597 set_huge_pte_at(mm, address, ptep,
5598 make_pte_marker(PTE_MARKER_UFFD_WP),
5599 sz);
5600 else
5601 huge_pte_clear(mm, address, ptep, sz);
5602 spin_unlock(ptl);
5603 continue;
5604 }
5605
5606 page = pte_page(pte);
5607 /*
5608 * If a reference page is supplied, it is because a specific
5609 * page is being unmapped, not a range. Ensure the page we
5610 * are about to unmap is the actual page of interest.
5611 */
5612 if (ref_page) {
5613 if (page != ref_page) {
5614 spin_unlock(ptl);
5615 continue;
5616 }
5617 /*
5618 * Mark the VMA as having unmapped its page so that
5619 * future faults in this VMA will fail rather than
5620 * looking like data was lost
5621 */
5622 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5623 }
5624
5625 pte = huge_ptep_get_and_clear(mm, address, ptep);
5626 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5627 if (huge_pte_dirty(pte))
5628 set_page_dirty(page);
5629 /* Leave a uffd-wp pte marker if needed */
5630 if (huge_pte_uffd_wp(pte) &&
5631 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5632 set_huge_pte_at(mm, address, ptep,
5633 make_pte_marker(PTE_MARKER_UFFD_WP),
5634 sz);
5635 hugetlb_count_sub(pages_per_huge_page(h), mm);
5636 hugetlb_remove_rmap(page_folio(page));
5637
5638 /*
5639		 * Restore the reservation for an anonymous page; otherwise the
5640		 * backing page could be stolen by someone.
5641		 * If we are freeing a surplus page, do not set the restore
5642		 * reservation bit.
5643 */
5644 if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
5645 folio_test_anon(page_folio(page))) {
5646 folio_set_hugetlb_restore_reserve(page_folio(page));
5647 /* Reservation to be adjusted after the spin lock */
5648 adjust_reservation = true;
5649 }
5650
5651 spin_unlock(ptl);
5652
5653 /*
5654 * Adjust the reservation for the region that will have the
5655 * reserve restored. Keep in mind that vma_needs_reservation() changes
5656 * resv->adds_in_progress if it succeeds. If this is not done,
5657 * do_exit() will not see it, and will keep the reservation
5658 * forever.
5659 */
5660 if (adjust_reservation) {
5661 int rc = vma_needs_reservation(h, vma, address);
5662
5663 if (rc < 0)
5664				/* Presumably allocate_file_region_entries failed
5665 * to allocate a file_region struct. Clear
5666 * hugetlb_restore_reserve so that global reserve
5667 * count will not be incremented by free_huge_folio.
5668 * Act as if we consumed the reservation.
5669 */
5670 folio_clear_hugetlb_restore_reserve(page_folio(page));
5671 else if (rc)
5672 vma_add_reservation(h, vma, address);
5673 }
5674
5675 tlb_remove_page_size(tlb, page, huge_page_size(h));
5676 /*
5677 * Bail out after unmapping reference page if supplied
5678 */
5679 if (ref_page)
5680 break;
5681 }
5682 tlb_end_vma(tlb, vma);
5683
5684 /*
5685 * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
5686 * could defer the flush until now, since by holding i_mmap_rwsem we
5687	 * guaranteed that the last reference would not be dropped. But we must
5688 * do the flushing before we return, as otherwise i_mmap_rwsem will be
5689 * dropped and the last reference to the shared PMDs page might be
5690 * dropped as well.
5691 *
5692 * In theory we could defer the freeing of the PMD pages as well, but
5693 * huge_pmd_unshare() relies on the exact page_count for the PMD page to
5694 * detect sharing, so we cannot defer the release of the page either.
5695 * Instead, do flush now.
5696 */
5697 if (force_flush)
5698 tlb_flush_mmu_tlbonly(tlb);
5699}
5700
5701void __hugetlb_zap_begin(struct vm_area_struct *vma,
5702 unsigned long *start, unsigned long *end)
5703{
5704 if (!vma->vm_file) /* hugetlbfs_file_mmap error */
5705 return;
5706
5707 adjust_range_if_pmd_sharing_possible(vma, start, end);
5708 hugetlb_vma_lock_write(vma);
5709 if (vma->vm_file)
5710 i_mmap_lock_write(vma->vm_file->f_mapping);
5711}
5712
5713void __hugetlb_zap_end(struct vm_area_struct *vma,
5714 struct zap_details *details)
5715{
5716 zap_flags_t zap_flags = details ? details->zap_flags : 0;
5717
5718 if (!vma->vm_file) /* hugetlbfs_file_mmap error */
5719 return;
5720
5721 if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
5722 /*
5723 * Unlock and free the vma lock before releasing i_mmap_rwsem.
5724 * When the vma_lock is freed, this makes the vma ineligible
5725 * for pmd sharing. And, i_mmap_rwsem is required to set up
5726 * pmd sharing. This is important as page tables for this
5727		 * unmapped range will be asynchronously deleted. If the page
5728 * tables are shared, there will be issues when accessed by
5729 * someone else.
5730 */
5731 __hugetlb_vma_unlock_write_free(vma);
5732 } else {
5733 hugetlb_vma_unlock_write(vma);
5734 }
5735
5736 if (vma->vm_file)
5737 i_mmap_unlock_write(vma->vm_file->f_mapping);
5738}
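/*
 * Sketch of the expected pairing in the zap path (illustrative only; the
 * actual caller lives outside this file):
 *
 *	__hugetlb_zap_begin(vma, &start, &end);
 *	__unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags);
 *	__hugetlb_zap_end(vma, details);
 *
 * begin/end take and drop the vma lock and i_mmap_rwsem around the page
 * table walk so that huge_pmd_unshare() cannot race with it.
 */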
5739
5740void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
5741 unsigned long end, struct page *ref_page,
5742 zap_flags_t zap_flags)
5743{
5744 struct mmu_notifier_range range;
5745 struct mmu_gather tlb;
5746
5747 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
5748 start, end);
5749 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5750 mmu_notifier_invalidate_range_start(&range);
5751 tlb_gather_mmu(&tlb, vma->vm_mm);
5752
5753 __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
5754
5755 mmu_notifier_invalidate_range_end(&range);
5756 tlb_finish_mmu(&tlb);
5757}
5758
5759/*
5760 * This is called when the original mapper is failing to COW a MAP_PRIVATE
5761 * mapping it owns the reserve page for. The intention is to unmap the page
5762 * from other VMAs and let the children be SIGKILLed if they are faulting the
5763 * same region.
5764 */
5765static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
5766 struct page *page, unsigned long address)
5767{
5768 struct hstate *h = hstate_vma(vma);
5769 struct vm_area_struct *iter_vma;
5770 struct address_space *mapping;
5771 pgoff_t pgoff;
5772
5773 /*
5774 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
5775 * from page cache lookup which is in HPAGE_SIZE units.
5776 */
5777 address = address & huge_page_mask(h);
5778 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5779 vma->vm_pgoff;
5780 mapping = vma->vm_file->f_mapping;
5781
5782 /*
5783	 * Take the mapping lock for the duration of the table walk. Since
5784	 * this mapping should be shared between all the VMAs,
5785	 * __unmap_hugepage_range() is called with the lock already held.
5786 */
5787 i_mmap_lock_write(mapping);
5788 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5789 /* Do not unmap the current VMA */
5790 if (iter_vma == vma)
5791 continue;
5792
5793 /*
5794 * Shared VMAs have their own reserves and do not affect
5795 * MAP_PRIVATE accounting but it is possible that a shared
5796 * VMA is using the same page so check and skip such VMAs.
5797 */
5798 if (iter_vma->vm_flags & VM_MAYSHARE)
5799 continue;
5800
5801 /*
5802 * Unmap the page from other VMAs without their own reserves.
5803 * They get marked to be SIGKILLed if they fault in these
5804 * areas. This is because a future no-page fault on this VMA
5805 * could insert a zeroed page instead of the data existing
5806 * from the time of fork. This would look like data corruption
5807 */
5808 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
5809 unmap_hugepage_range(iter_vma, address,
5810 address + huge_page_size(h), page, 0);
5811 }
5812 i_mmap_unlock_write(mapping);
5813}
5814
5815/*
5816 * hugetlb_wp() should be called with page lock of the original hugepage held.
5817 * Called with hugetlb_fault_mutex_table held and pte_page locked so we
5818 * cannot race with other handlers or page migration.
5819 * Keep the pte_same checks anyway to make transition from the mutex easier.
5820 */
5821static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
5822 struct vm_fault *vmf)
5823{
5824 struct vm_area_struct *vma = vmf->vma;
5825 struct mm_struct *mm = vma->vm_mm;
5826 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
5827 pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte);
5828 struct hstate *h = hstate_vma(vma);
5829 struct folio *old_folio;
5830 struct folio *new_folio;
5831 bool cow_from_owner = 0;
5832 vm_fault_t ret = 0;
5833 struct mmu_notifier_range range;
5834
5835 /*
5836 * Never handle CoW for uffd-wp protected pages. It should be only
5837 * handled when the uffd-wp protection is removed.
5838 *
5839 * Note that only the CoW optimization path (in hugetlb_no_page())
5840 * can trigger this, because hugetlb_fault() will always resolve
5841 * uffd-wp bit first.
5842 */
5843 if (!unshare && huge_pte_uffd_wp(pte))
5844 return 0;
5845
5846 /* Let's take out MAP_SHARED mappings first. */
5847 if (vma->vm_flags & VM_MAYSHARE) {
5848 set_huge_ptep_writable(vma, vmf->address, vmf->pte);
5849 return 0;
5850 }
5851
5852 old_folio = page_folio(pte_page(pte));
5853
5854 delayacct_wpcopy_start();
5855
5856retry_avoidcopy:
5857 /*
5858 * If no-one else is actually using this page, we're the exclusive
5859 * owner and can reuse this page.
5860 *
5861 * Note that we don't rely on the (safer) folio refcount here, because
5862 * copying the hugetlb folio when there are unexpected (temporary)
5863 * folio references could harm simple fork()+exit() users when
5864 * we run out of free hugetlb folios: we would have to kill processes
5865 * in scenarios that used to work. As a side effect, there can still
5866 * be leaks between processes, for example, with FOLL_GET users.
5867 */
5868 if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
5869 if (!PageAnonExclusive(&old_folio->page)) {
5870 folio_move_anon_rmap(old_folio, vma);
5871 SetPageAnonExclusive(&old_folio->page);
5872 }
5873 if (likely(!unshare))
5874 set_huge_ptep_maybe_writable(vma, vmf->address,
5875 vmf->pte);
5876
5877 delayacct_wpcopy_end();
5878 return 0;
5879 }
5880 VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
5881 PageAnonExclusive(&old_folio->page), &old_folio->page);
5882
5883 /*
5884 * If the process that created a MAP_PRIVATE mapping is about to
5885 * perform a COW due to a shared page count, attempt to satisfy
5886 * the allocation without using the existing reserves. The pagecache
5887 * page is used to determine if the reserve at this address was
5888 * consumed or not. If reserves were used, a partial faulted mapping
5889 * at the time of fork() could consume its reserves on COW instead
5890 * of the full address range.
5891 */
5892 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
5893 old_folio != pagecache_folio)
5894 cow_from_owner = true;
5895
5896 folio_get(old_folio);
5897
5898 /*
5899 * Drop page table lock as buddy allocator may be called. It will
5900 * be acquired again before returning to the caller, as expected.
5901 */
5902 spin_unlock(vmf->ptl);
5903 new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);
5904
5905 if (IS_ERR(new_folio)) {
5906 /*
5907 * If a process owning a MAP_PRIVATE mapping fails to COW,
5908 * it is due to references held by a child and an insufficient
5909		 * huge page pool. To guarantee the original mapper's
5910		 * reliability, unmap the page from child processes. The child
5911 * may get SIGKILLed if it later faults.
5912 */
5913 if (cow_from_owner) {
5914 struct address_space *mapping = vma->vm_file->f_mapping;
5915 pgoff_t idx;
5916 u32 hash;
5917
5918 folio_put(old_folio);
5919 /*
5920 * Drop hugetlb_fault_mutex and vma_lock before
5921 * unmapping. unmapping needs to hold vma_lock
5922 * in write mode. Dropping vma_lock in read mode
5923 * here is OK as COW mappings do not interact with
5924 * PMD sharing.
5925 *
5926 * Reacquire both after unmap operation.
5927 */
5928 idx = vma_hugecache_offset(h, vma, vmf->address);
5929 hash = hugetlb_fault_mutex_hash(mapping, idx);
5930 hugetlb_vma_unlock_read(vma);
5931 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5932
5933 unmap_ref_private(mm, vma, &old_folio->page,
5934 vmf->address);
5935
5936 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5937 hugetlb_vma_lock_read(vma);
5938 spin_lock(vmf->ptl);
5939 vmf->pte = hugetlb_walk(vma, vmf->address,
5940 huge_page_size(h));
5941 if (likely(vmf->pte &&
5942 pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte)))
5943 goto retry_avoidcopy;
5944 /*
5945			 * A race occurred while re-acquiring the page table
5946			 * lock, and our job is done.
5947 */
5948 delayacct_wpcopy_end();
5949 return 0;
5950 }
5951
5952 ret = vmf_error(PTR_ERR(new_folio));
5953 goto out_release_old;
5954 }
5955
5956 /*
5957	 * When the original hugepage is a shared one, it does not have
5958 * anon_vma prepared.
5959 */
5960 ret = __vmf_anon_prepare(vmf);
5961 if (unlikely(ret))
5962 goto out_release_all;
5963
5964 if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
5965 ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h));
5966 goto out_release_all;
5967 }
5968 __folio_mark_uptodate(new_folio);
5969
5970 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
5971 vmf->address + huge_page_size(h));
5972 mmu_notifier_invalidate_range_start(&range);
5973
5974 /*
5975 * Retake the page table lock to check for racing updates
5976 * before the page tables are altered
5977 */
5978 spin_lock(vmf->ptl);
5979 vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
5980 if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
5981 pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
5982
5983 /* Break COW or unshare */
5984 huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
5985 hugetlb_remove_rmap(old_folio);
5986 hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
5987 if (huge_pte_uffd_wp(pte))
5988 newpte = huge_pte_mkuffd_wp(newpte);
5989 set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
5990 huge_page_size(h));
5991 folio_set_hugetlb_migratable(new_folio);
5992 /* Make the old page be freed below */
5993 new_folio = old_folio;
5994 }
5995 spin_unlock(vmf->ptl);
5996 mmu_notifier_invalidate_range_end(&range);
5997out_release_all:
5998 /*
5999 * No restore in case of successful pagetable update (Break COW or
6000 * unshare)
6001 */
6002 if (new_folio != old_folio)
6003 restore_reserve_on_error(h, vma, vmf->address, new_folio);
6004 folio_put(new_folio);
6005out_release_old:
6006 folio_put(old_folio);
6007
6008 spin_lock(vmf->ptl); /* Caller expects lock to be held */
6009
6010 delayacct_wpcopy_end();
6011 return ret;
6012}
6013
6014/*
6015 * Return whether there is a pagecache page backing the given address within the VMA.
6016 */
6017bool hugetlbfs_pagecache_present(struct hstate *h,
6018 struct vm_area_struct *vma, unsigned long address)
6019{
6020 struct address_space *mapping = vma->vm_file->f_mapping;
6021 pgoff_t idx = linear_page_index(vma, address);
6022 struct folio *folio;
6023
6024 folio = filemap_get_folio(mapping, idx);
6025 if (IS_ERR(folio))
6026 return false;
6027 folio_put(folio);
6028 return true;
6029}
6030
6031int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
6032 pgoff_t idx)
6033{
6034 struct inode *inode = mapping->host;
6035 struct hstate *h = hstate_inode(inode);
6036 int err;
6037
6038 idx <<= huge_page_order(h);
6039 __folio_set_locked(folio);
6040 err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
6041
6042 if (unlikely(err)) {
6043 __folio_clear_locked(folio);
6044 return err;
6045 }
6046 folio_clear_hugetlb_restore_reserve(folio);
6047
6048 /*
6049 * mark folio dirty so that it will not be removed from cache/file
6050 * by non-hugetlbfs specific code paths.
6051 */
6052 folio_mark_dirty(folio);
6053
6054 spin_lock(&inode->i_lock);
6055 inode->i_blocks += blocks_per_huge_page(h);
6056 spin_unlock(&inode->i_lock);
6057 return 0;
6058}
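/*
 * Worked example (illustrative; assumes 2MB huge pages on a 4KB base page,
 * i.e. huge_page_order(h) == 9): hugetlb callers pass an index in units of
 * huge pages, which the shift above converts to the base-page index used by
 * the page cache, so hugetlb index 3 becomes 3 << 9 == 1536 and consecutive
 * huge pages land 512 slots apart in the mapping.
 */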
6059
6060static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf,
6061 struct address_space *mapping,
6062 unsigned long reason)
6063{
6064 u32 hash;
6065
6066 /*
6067 * vma_lock and hugetlb_fault_mutex must be dropped before handling
6068 * userfault. Also mmap_lock could be dropped due to handling
6069	 * userfault, so any vma operation should be careful from here.
6070 */
6071 hugetlb_vma_unlock_read(vmf->vma);
6072 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
6073 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6074 return handle_userfault(vmf, reason);
6075}
6076
6077/*
6078 * Recheck pte with pgtable lock. Returns true if pte didn't change, or
6079 * false if pte changed or is changing.
6080 */
6081static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr,
6082 pte_t *ptep, pte_t old_pte)
6083{
6084 spinlock_t *ptl;
6085 bool same;
6086
6087 ptl = huge_pte_lock(h, mm, ptep);
6088 same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte);
6089 spin_unlock(ptl);
6090
6091 return same;
6092}
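/*
 * Typical usage (as in hugetlb_no_page() below): re-validate a pte that was
 * sampled without the page table lock before committing to a slow path:
 *
 *	if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
 *		ret = 0;
 *		goto out;
 *	}
 *	return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING);
 */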
6093
6094static vm_fault_t hugetlb_no_page(struct address_space *mapping,
6095 struct vm_fault *vmf)
6096{
6097 struct vm_area_struct *vma = vmf->vma;
6098 struct mm_struct *mm = vma->vm_mm;
6099 struct hstate *h = hstate_vma(vma);
6100 vm_fault_t ret = VM_FAULT_SIGBUS;
6101 int anon_rmap = 0;
6102 unsigned long size;
6103 struct folio *folio;
6104 pte_t new_pte;
6105 bool new_folio, new_pagecache_folio = false;
6106 u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
6107
6108 /*
6109 * Currently, we are forced to kill the process in the event the
6110 * original mapper has unmapped pages from the child due to a failed
6111 * COW/unsharing. Warn that such a situation has occurred as it may not
6112 * be obvious.
6113 */
6114 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
6115 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
6116 current->pid);
6117 goto out;
6118 }
6119
6120 /*
6121 * Use page lock to guard against racing truncation
6122 * before we get page_table_lock.
6123 */
6124 new_folio = false;
6125 folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
6126 if (IS_ERR(folio)) {
6127 size = i_size_read(mapping->host) >> huge_page_shift(h);
6128 if (vmf->pgoff >= size)
6129 goto out;
6130 /* Check for page in userfault range */
6131 if (userfaultfd_missing(vma)) {
6132 /*
6133 * Since hugetlb_no_page() was examining pte
6134 * without pgtable lock, we need to re-test under
6135 * lock because the pte may not be stable and could
6136 * have changed from under us. Try to detect
6137 * either changed or during-changing ptes and retry
6138 * properly when needed.
6139 *
6140 * Note that userfaultfd is actually fine with
6141 * false positives (e.g. caused by pte changed),
6142			 * but not wrong logical events (e.g. caused by
6143			 * reading a pte while it is changing). The latter can
6144			 * confuse userspace, so strictness is very
6145			 * much preferred. E.g., a MISSING event should
6146 * never happen on the page after UFFDIO_COPY has
6147 * correctly installed the page and returned.
6148 */
6149 if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
6150 ret = 0;
6151 goto out;
6152 }
6153
6154 return hugetlb_handle_userfault(vmf, mapping,
6155 VM_UFFD_MISSING);
6156 }
6157
6158 if (!(vma->vm_flags & VM_MAYSHARE)) {
6159 ret = __vmf_anon_prepare(vmf);
6160 if (unlikely(ret))
6161 goto out;
6162 }
6163
6164 folio = alloc_hugetlb_folio(vma, vmf->address, false);
6165 if (IS_ERR(folio)) {
6166 /*
6167			 * Returning an error will result in the faulting task being
6168			 * sent SIGBUS. The hugetlb fault mutex prevents two
6169			 * tasks from racing to fault in the same page, which
6170			 * could result in spurious 'unable to allocate' errors.
6171			 * Page migration does not take the fault mutex, but
6172			 * does a clear then write of ptes under the page table
6173			 * lock. Page fault code could race with migration,
6174			 * notice the cleared pte and try to allocate a page
6175			 * here. Before returning an error, take the ptl and make
6176			 * sure there really is no pte entry.
6177 */
6178 if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte))
6179 ret = vmf_error(PTR_ERR(folio));
6180 else
6181 ret = 0;
6182 goto out;
6183 }
6184 folio_zero_user(folio, vmf->real_address);
6185 __folio_mark_uptodate(folio);
6186 new_folio = true;
6187
6188 if (vma->vm_flags & VM_MAYSHARE) {
6189 int err = hugetlb_add_to_page_cache(folio, mapping,
6190 vmf->pgoff);
6191 if (err) {
6192 /*
6193				 * err can't be -EEXIST, which would imply someone
6194				 * else consumed the reservation, since the hugetlb
6195				 * fault mutex is held when adding a hugetlb page
6196				 * to the page cache. So it's safe to call
6197 * restore_reserve_on_error() here.
6198 */
6199 restore_reserve_on_error(h, vma, vmf->address,
6200 folio);
6201 folio_put(folio);
6202 ret = VM_FAULT_SIGBUS;
6203 goto out;
6204 }
6205 new_pagecache_folio = true;
6206 } else {
6207 folio_lock(folio);
6208 anon_rmap = 1;
6209 }
6210 } else {
6211 /*
6212		 * If a memory error occurs between mmap() and fault, some processes
6213		 * may not have a hwpoisoned swap entry for the errored virtual address.
6214		 * So we need to block the hugepage fault via the PG_hwpoison bit check.
6215 */
6216 if (unlikely(folio_test_hwpoison(folio))) {
6217 ret = VM_FAULT_HWPOISON_LARGE |
6218 VM_FAULT_SET_HINDEX(hstate_index(h));
6219 goto backout_unlocked;
6220 }
6221
6222 /* Check for page in userfault range. */
6223 if (userfaultfd_minor(vma)) {
6224 folio_unlock(folio);
6225 folio_put(folio);
6226 /* See comment in userfaultfd_missing() block above */
6227 if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
6228 ret = 0;
6229 goto out;
6230 }
6231 return hugetlb_handle_userfault(vmf, mapping,
6232 VM_UFFD_MINOR);
6233 }
6234 }
6235
6236 /*
6237 * If we are going to COW a private mapping later, we examine the
6238 * pending reservations for this page now. This will ensure that
6239 * any allocations necessary to record that reservation occur outside
6240 * the spinlock.
6241 */
6242 if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6243 if (vma_needs_reservation(h, vma, vmf->address) < 0) {
6244 ret = VM_FAULT_OOM;
6245 goto backout_unlocked;
6246 }
6247 /* Just decrements count, does not deallocate */
6248 vma_end_reservation(h, vma, vmf->address);
6249 }
6250
6251 vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
6252 ret = 0;
6253 /* If pte changed from under us, retry */
6254 if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
6255 goto backout;
6256
6257 if (anon_rmap)
6258 hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
6259 else
6260 hugetlb_add_file_rmap(folio);
6261 new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
6262 /*
6263 * If this pte was previously wr-protected, keep it wr-protected even
6264 * if populated.
6265 */
6266 if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
6267 new_pte = huge_pte_mkuffd_wp(new_pte);
6268 set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
6269
6270 hugetlb_count_add(pages_per_huge_page(h), mm);
6271 if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6272 /* Optimization, do the COW without a second fault */
6273 ret = hugetlb_wp(folio, vmf);
6274 }
6275
6276 spin_unlock(vmf->ptl);
6277
6278 /*
6279 * Only set hugetlb_migratable in newly allocated pages. Existing pages
6280 * found in the pagecache may not have hugetlb_migratable if they have
6281 * been isolated for migration.
6282 */
6283 if (new_folio)
6284 folio_set_hugetlb_migratable(folio);
6285
6286 folio_unlock(folio);
6287out:
6288 hugetlb_vma_unlock_read(vma);
6289
6290 /*
6291	 * We must check whether to release the per-VMA lock. __vmf_anon_prepare() is
6292 * the only way ret can be set to VM_FAULT_RETRY.
6293 */
6294 if (unlikely(ret & VM_FAULT_RETRY))
6295 vma_end_read(vma);
6296
6297 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6298 return ret;
6299
6300backout:
6301 spin_unlock(vmf->ptl);
6302backout_unlocked:
6303 if (new_folio && !new_pagecache_folio)
6304 restore_reserve_on_error(h, vma, vmf->address, folio);
6305
6306 folio_unlock(folio);
6307 folio_put(folio);
6308 goto out;
6309}
6310
6311#ifdef CONFIG_SMP
6312u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
6313{
6314 unsigned long key[2];
6315 u32 hash;
6316
6317 key[0] = (unsigned long) mapping;
6318 key[1] = idx;
6319
6320 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
6321
6322 return hash & (num_fault_mutexes - 1);
6323}
6324#else
6325/*
6326 * For uniprocessor systems we always use a single mutex, so just
6327 * return 0 and avoid the hashing overhead.
6328 */
6329u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
6330{
6331 return 0;
6332}
6333#endif
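/*
 * Callers pair the hash with the fault mutex table to serialize faults on
 * the same logical page, e.g. (as in hugetlb_fault() below):
 *
 *	hash = hugetlb_fault_mutex_hash(mapping, idx);
 *	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 *	... handle the fault ...
 *	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 *
 * num_fault_mutexes is sized to a power of two at init time, so the mask
 * above selects a valid table index.
 */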
6334
6335vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
6336 unsigned long address, unsigned int flags)
6337{
6338 vm_fault_t ret;
6339 u32 hash;
6340 struct folio *folio = NULL;
6341 struct folio *pagecache_folio = NULL;
6342 struct hstate *h = hstate_vma(vma);
6343 struct address_space *mapping;
6344 int need_wait_lock = 0;
6345 struct vm_fault vmf = {
6346 .vma = vma,
6347 .address = address & huge_page_mask(h),
6348 .real_address = address,
6349 .flags = flags,
6350 .pgoff = vma_hugecache_offset(h, vma,
6351 address & huge_page_mask(h)),
6352 /* TODO: Track hugetlb faults using vm_fault */
6353
6354 /*
6355		 * Some fields may not be initialized; be careful, as it may
6356		 * be hard to debug if called functions make assumptions about them.
6357 */
6358 };
6359
6360 /*
6361 * Serialize hugepage allocation and instantiation, so that we don't
6362 * get spurious allocation failures if two CPUs race to instantiate
6363 * the same page in the page cache.
6364 */
6365 mapping = vma->vm_file->f_mapping;
6366 hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff);
6367 mutex_lock(&hugetlb_fault_mutex_table[hash]);
6368
6369 /*
6370 * Acquire vma lock before calling huge_pte_alloc and hold
6371 * until finished with vmf.pte. This prevents huge_pmd_unshare from
6372 * being called elsewhere and making the vmf.pte no longer valid.
6373 */
6374 hugetlb_vma_lock_read(vma);
6375 vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
6376 if (!vmf.pte) {
6377 hugetlb_vma_unlock_read(vma);
6378 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6379 return VM_FAULT_OOM;
6380 }
6381
6382 vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte);
6383 if (huge_pte_none_mostly(vmf.orig_pte)) {
6384 if (is_pte_marker(vmf.orig_pte)) {
6385 pte_marker marker =
6386 pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
6387
6388 if (marker & PTE_MARKER_POISONED) {
6389 ret = VM_FAULT_HWPOISON_LARGE |
6390 VM_FAULT_SET_HINDEX(hstate_index(h));
6391 goto out_mutex;
6392 } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
6393 /* This isn't supported in hugetlb. */
6394 ret = VM_FAULT_SIGSEGV;
6395 goto out_mutex;
6396 }
6397 }
6398
6399 /*
6400 * Other PTE markers should be handled the same way as none PTE.
6401 *
6402		 * hugetlb_no_page() will drop the vma lock and hugetlb fault
6403		 * mutex internally, which makes us return immediately.
6404 */
6405 return hugetlb_no_page(mapping, &vmf);
6406 }
6407
6408 ret = 0;
6409
6410 /*
6411	 * vmf.orig_pte could be a migration or hwpoison entry at this
6412	 * point, so this check prevents the code below from assuming
6413	 * that we have an active hugepage in the pagecache. This goto expects
6414	 * the 2nd page fault, and the is_hugetlb_entry_(migration|hwpoisoned)
6415	 * check will properly handle it.
6416 */
6417 if (!pte_present(vmf.orig_pte)) {
6418 if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
6419 /*
6420 * Release the hugetlb fault lock now, but retain
6421 * the vma lock, because it is needed to guard the
6422 * huge_pte_lockptr() later in
6423 * migration_entry_wait_huge(). The vma lock will
6424 * be released there.
6425 */
6426 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6427 migration_entry_wait_huge(vma, vmf.address, vmf.pte);
6428 return 0;
6429 } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
6430 ret = VM_FAULT_HWPOISON_LARGE |
6431 VM_FAULT_SET_HINDEX(hstate_index(h));
6432 goto out_mutex;
6433 }
6434
6435 /*
6436 * If we are going to COW/unshare the mapping later, we examine the
6437 * pending reservations for this page now. This will ensure that any
6438 * allocations necessary to record that reservation occur outside the
6439 * spinlock. Also lookup the pagecache page now as it is used to
6440 * determine if a reservation has been consumed.
6441 */
6442 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
6443 !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
6444 if (vma_needs_reservation(h, vma, vmf.address) < 0) {
6445 ret = VM_FAULT_OOM;
6446 goto out_mutex;
6447 }
6448 /* Just decrements count, does not deallocate */
6449 vma_end_reservation(h, vma, vmf.address);
6450
6451 pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
6452 vmf.pgoff);
6453 if (IS_ERR(pagecache_folio))
6454 pagecache_folio = NULL;
6455 }
6456
6457 vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
6458
6459 /* Check for a racing update before calling hugetlb_wp() */
6460 if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte))))
6461 goto out_ptl;
6462
6463 /* Handle userfault-wp first, before trying to lock more pages */
6464 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) &&
6465 (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
6466 if (!userfaultfd_wp_async(vma)) {
6467 spin_unlock(vmf.ptl);
6468 if (pagecache_folio) {
6469 folio_unlock(pagecache_folio);
6470 folio_put(pagecache_folio);
6471 }
6472 hugetlb_vma_unlock_read(vma);
6473 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6474 return handle_userfault(&vmf, VM_UFFD_WP);
6475 }
6476
6477 vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
6478 set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
6479 huge_page_size(hstate_vma(vma)));
6480 /* Fallthrough to CoW */
6481 }
6482
6483 /*
6484 * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
6485	 * pagecache_folio, so here we need to take the former one
6486 * when folio != pagecache_folio or !pagecache_folio.
6487 */
6488 folio = page_folio(pte_page(vmf.orig_pte));
6489 if (folio != pagecache_folio)
6490 if (!folio_trylock(folio)) {
6491 need_wait_lock = 1;
6492 goto out_ptl;
6493 }
6494
6495 folio_get(folio);
6496
6497 if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
6498 if (!huge_pte_write(vmf.orig_pte)) {
6499 ret = hugetlb_wp(pagecache_folio, &vmf);
6500 goto out_put_page;
6501 } else if (likely(flags & FAULT_FLAG_WRITE)) {
6502 vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
6503 }
6504 }
6505 vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
6506 if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
6507 flags & FAULT_FLAG_WRITE))
6508 update_mmu_cache(vma, vmf.address, vmf.pte);
6509out_put_page:
6510 if (folio != pagecache_folio)
6511 folio_unlock(folio);
6512 folio_put(folio);
6513out_ptl:
6514 spin_unlock(vmf.ptl);
6515
6516 if (pagecache_folio) {
6517 folio_unlock(pagecache_folio);
6518 folio_put(pagecache_folio);
6519 }
6520out_mutex:
6521 hugetlb_vma_unlock_read(vma);
6522
6523 /*
6524	 * We must check whether to release the per-VMA lock. __vmf_anon_prepare() in
6525 * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY.
6526 */
6527 if (unlikely(ret & VM_FAULT_RETRY))
6528 vma_end_read(vma);
6529
6530 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6531 /*
6532	 * Generally it's safe to hold a refcount while waiting for the page lock.
6533	 * But here we only wait to defer the next page fault and avoid a busy
6534	 * loop; the page is not used after it is unlocked and before we return
6535	 * from the current page fault. So we are safe from accessing a freed
6536	 * page, even though we wait here without taking a refcount.
6537 */
6538 if (need_wait_lock)
6539 folio_wait_locked(folio);
6540 return ret;
6541}
6542
6543#ifdef CONFIG_USERFAULTFD
6544/*
6545 * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
6546 */
6547static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
6548 struct vm_area_struct *vma, unsigned long address)
6549{
6550 struct mempolicy *mpol;
6551 nodemask_t *nodemask;
6552 struct folio *folio;
6553 gfp_t gfp_mask;
6554 int node;
6555
6556 gfp_mask = htlb_alloc_mask(h);
6557 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
6558 /*
6559 * This is used to allocate a temporary hugetlb to hold the copied
6560 * content, which will then be copied again to the final hugetlb
6561 * consuming a reservation. Set the alloc_fallback to false to indicate
6562 * that breaking the per-node hugetlb pool is not allowed in this case.
6563 */
6564 folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);
6565 mpol_cond_put(mpol);
6566
6567 return folio;
6568}
6569
6570/*
6571 * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
6572 * with modifications for hugetlb pages.
6573 */
6574int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
6575 struct vm_area_struct *dst_vma,
6576 unsigned long dst_addr,
6577 unsigned long src_addr,
6578 uffd_flags_t flags,
6579 struct folio **foliop)
6580{
6581 struct mm_struct *dst_mm = dst_vma->vm_mm;
6582 bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
6583 bool wp_enabled = (flags & MFILL_ATOMIC_WP);
6584 struct hstate *h = hstate_vma(dst_vma);
6585 struct address_space *mapping = dst_vma->vm_file->f_mapping;
6586 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
6587 unsigned long size = huge_page_size(h);
6588 int vm_shared = dst_vma->vm_flags & VM_SHARED;
6589 pte_t _dst_pte;
6590 spinlock_t *ptl;
6591 int ret = -ENOMEM;
6592 struct folio *folio;
6593 bool folio_in_pagecache = false;
6594
6595 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
6596 ptl = huge_pte_lock(h, dst_mm, dst_pte);
6597
6598 /* Don't overwrite any existing PTEs (even markers) */
6599 if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
6600 spin_unlock(ptl);
6601 return -EEXIST;
6602 }
6603
6604 _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
6605 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);
6606
6607 /* No need to invalidate - it was non-present before */
6608 update_mmu_cache(dst_vma, dst_addr, dst_pte);
6609
6610 spin_unlock(ptl);
6611 return 0;
6612 }
6613
6614 if (is_continue) {
6615 ret = -EFAULT;
6616 folio = filemap_lock_hugetlb_folio(h, mapping, idx);
6617 if (IS_ERR(folio))
6618 goto out;
6619 folio_in_pagecache = true;
6620 } else if (!*foliop) {
6621		/* If a folio already exists in the page cache, then this is
6622		 * UFFDIO_COPY for a non-missing case; return -EEXIST.
6623 */
6624 if (vm_shared &&
6625 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
6626 ret = -EEXIST;
6627 goto out;
6628 }
6629
6630 folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
6631 if (IS_ERR(folio)) {
6632 ret = -ENOMEM;
6633 goto out;
6634 }
6635
6636 ret = copy_folio_from_user(folio, (const void __user *) src_addr,
6637 false);
6638
6639 /* fallback to copy_from_user outside mmap_lock */
6640 if (unlikely(ret)) {
6641 ret = -ENOENT;
6642 /* Free the allocated folio which may have
6643 * consumed a reservation.
6644 */
6645 restore_reserve_on_error(h, dst_vma, dst_addr, folio);
6646 folio_put(folio);
6647
6648 /* Allocate a temporary folio to hold the copied
6649 * contents.
6650 */
6651 folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
6652 if (!folio) {
6653 ret = -ENOMEM;
6654 goto out;
6655 }
6656 *foliop = folio;
6657 /* Set the outparam foliop and return to the caller to
6658 * copy the contents outside the lock. Don't free the
6659 * folio.
6660 */
6661 goto out;
6662 }
6663 } else {
6664 if (vm_shared &&
6665 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
6666 folio_put(*foliop);
6667 ret = -EEXIST;
6668 *foliop = NULL;
6669 goto out;
6670 }
6671
6672 folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
6673 if (IS_ERR(folio)) {
6674 folio_put(*foliop);
6675 ret = -ENOMEM;
6676 *foliop = NULL;
6677 goto out;
6678 }
6679 ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
6680 folio_put(*foliop);
6681 *foliop = NULL;
6682 if (ret) {
6683 folio_put(folio);
6684 goto out;
6685 }
6686 }
6687
6688 /*
6689 * If we just allocated a new page, we need a memory barrier to ensure
6690 * that preceding stores to the page become visible before the
6691 * set_pte_at() write. The memory barrier inside __folio_mark_uptodate
6692 * is what we need.
6693 *
6694 * In the case where we have not allocated a new page (is_continue),
6695 * the page must already be uptodate. UFFDIO_CONTINUE already includes
6696 * an earlier smp_wmb() to ensure that prior stores will be visible
6697 * before the set_pte_at() write.
6698 */
6699 if (!is_continue)
6700 __folio_mark_uptodate(folio);
6701 else
6702 WARN_ON_ONCE(!folio_test_uptodate(folio));
6703
6704 /* Add shared, newly allocated pages to the page cache. */
6705 if (vm_shared && !is_continue) {
6706 ret = -EFAULT;
6707 if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h)))
6708 goto out_release_nounlock;
6709
6710 /*
6711 * Serialization between remove_inode_hugepages() and
6712 * hugetlb_add_to_page_cache() below happens through the
6713		 * hugetlb_fault_mutex_table, which here must be held by
6714 * the caller.
6715 */
6716 ret = hugetlb_add_to_page_cache(folio, mapping, idx);
6717 if (ret)
6718 goto out_release_nounlock;
6719 folio_in_pagecache = true;
6720 }
6721
6722 ptl = huge_pte_lock(h, dst_mm, dst_pte);
6723
6724 ret = -EIO;
6725 if (folio_test_hwpoison(folio))
6726 goto out_release_unlock;
6727
6728 /*
6729	 * We allow overwriting a pte marker: consider when both MISSING|WP are
6730	 * registered, we first wr-protect a none pte which has no page cache
6731 * page backing it, then access the page.
6732 */
6733 ret = -EEXIST;
6734 if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
6735 goto out_release_unlock;
6736
6737 if (folio_in_pagecache)
6738 hugetlb_add_file_rmap(folio);
6739 else
6740 hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr);
6741
6742 /*
6743 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
6744 * with wp flag set, don't set pte write bit.
6745 */
6746 _dst_pte = make_huge_pte(dst_vma, &folio->page,
6747 !wp_enabled && !(is_continue && !vm_shared));
6748 /*
6749 * Always mark UFFDIO_COPY page dirty; note that this may not be
6750 * extremely important for hugetlbfs for now since swapping is not
6751	 * supported, but we should still be clear that this page cannot be
6752	 * thrown away at will, even if the write bit is not set.
6753 */
6754 _dst_pte = huge_pte_mkdirty(_dst_pte);
6755 _dst_pte = pte_mkyoung(_dst_pte);
6756
6757 if (wp_enabled)
6758 _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
6759
6760 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);
6761
6762 hugetlb_count_add(pages_per_huge_page(h), dst_mm);
6763
6764 /* No need to invalidate - it was non-present before */
6765 update_mmu_cache(dst_vma, dst_addr, dst_pte);
6766
6767 spin_unlock(ptl);
6768 if (!is_continue)
6769 folio_set_hugetlb_migratable(folio);
6770 if (vm_shared || is_continue)
6771 folio_unlock(folio);
6772 ret = 0;
6773out:
6774 return ret;
6775out_release_unlock:
6776 spin_unlock(ptl);
6777 if (vm_shared || is_continue)
6778 folio_unlock(folio);
6779out_release_nounlock:
6780 if (!folio_in_pagecache)
6781 restore_reserve_on_error(h, dst_vma, dst_addr, folio);
6782 folio_put(folio);
6783 goto out;
6784}
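/*
 * Sketch of the -ENOENT retry protocol implied by the comments above
 * (illustrative; the real caller is the userfaultfd mfill path, and the
 * details here are assumptions, not a contract):
 *
 *	ret = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
 *				       src_addr, flags, &folio);
 *	if (ret == -ENOENT) {
 *		... drop mmap_lock ...
 *		copy_folio_from_user(folio, (const void __user *)src_addr, true);
 *		... retake mmap_lock, revalidate, retry with *foliop set ...
 *	}
 */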
6785#endif /* CONFIG_USERFAULTFD */
6786
6787long hugetlb_change_protection(struct vm_area_struct *vma,
6788 unsigned long address, unsigned long end,
6789 pgprot_t newprot, unsigned long cp_flags)
6790{
6791 struct mm_struct *mm = vma->vm_mm;
6792 unsigned long start = address;
6793 pte_t *ptep;
6794 pte_t pte;
6795 struct hstate *h = hstate_vma(vma);
6796 long pages = 0, psize = huge_page_size(h);
6797 bool shared_pmd = false;
6798 struct mmu_notifier_range range;
6799 unsigned long last_addr_mask;
6800 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
6801 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
6802
6803 /*
6804 * In the case of shared PMDs, the area to flush could be beyond
6805 * start/end. Set range.start/range.end to cover the maximum possible
6806 * range if PMD sharing is possible.
6807 */
6808 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
6809 0, mm, start, end);
6810 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
6811
6812 BUG_ON(address >= end);
6813 flush_cache_range(vma, range.start, range.end);
6814
6815 mmu_notifier_invalidate_range_start(&range);
6816 hugetlb_vma_lock_write(vma);
6817 i_mmap_lock_write(vma->vm_file->f_mapping);
6818 last_addr_mask = hugetlb_mask_last_page(h);
6819 for (; address < end; address += psize) {
6820 spinlock_t *ptl;
6821 ptep = hugetlb_walk(vma, address, psize);
6822 if (!ptep) {
6823 if (!uffd_wp) {
6824 address |= last_addr_mask;
6825 continue;
6826 }
6827 /*
6828 * Userfaultfd wr-protect requires pgtable
6829 * pre-allocations to install pte markers.
6830 */
6831 ptep = huge_pte_alloc(mm, vma, address, psize);
6832 if (!ptep) {
6833 pages = -ENOMEM;
6834 break;
6835 }
6836 }
6837 ptl = huge_pte_lock(h, mm, ptep);
6838 if (huge_pmd_unshare(mm, vma, address, ptep)) {
6839 /*
6840 * When uffd-wp is enabled on the vma, unshare
6841			 * shouldn't happen at all. Warn if it happens
6842			 * for any reason.
6843 */
6844 WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
6845 pages++;
6846 spin_unlock(ptl);
6847 shared_pmd = true;
6848 address |= last_addr_mask;
6849 continue;
6850 }
6851 pte = huge_ptep_get(mm, address, ptep);
6852 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
6853 /* Nothing to do. */
6854 } else if (unlikely(is_hugetlb_entry_migration(pte))) {
6855 swp_entry_t entry = pte_to_swp_entry(pte);
6856 struct page *page = pfn_swap_entry_to_page(entry);
6857 pte_t newpte = pte;
6858
6859 if (is_writable_migration_entry(entry)) {
6860 if (PageAnon(page))
6861 entry = make_readable_exclusive_migration_entry(
6862 swp_offset(entry));
6863 else
6864 entry = make_readable_migration_entry(
6865 swp_offset(entry));
6866 newpte = swp_entry_to_pte(entry);
6867 pages++;
6868 }
6869
6870 if (uffd_wp)
6871 newpte = pte_swp_mkuffd_wp(newpte);
6872 else if (uffd_wp_resolve)
6873 newpte = pte_swp_clear_uffd_wp(newpte);
6874 if (!pte_same(pte, newpte))
6875 set_huge_pte_at(mm, address, ptep, newpte, psize);
6876 } else if (unlikely(is_pte_marker(pte))) {
6877 /*
6878 * Do nothing on a poison marker; page is
6879			 * corrupted, permissions do not apply. Here
6880			 * pte_marker_uffd_wp()==true implies !poison
6881			 * because they're mutually exclusive.
6882 */
6883 if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
6884 /* Safe to modify directly (non-present->none). */
6885 huge_pte_clear(mm, address, ptep, psize);
6886 } else if (!huge_pte_none(pte)) {
6887 pte_t old_pte;
6888 unsigned int shift = huge_page_shift(hstate_vma(vma));
6889
6890 old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
6891 pte = huge_pte_modify(old_pte, newprot);
6892 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
6893 if (uffd_wp)
6894 pte = huge_pte_mkuffd_wp(pte);
6895 else if (uffd_wp_resolve)
6896 pte = huge_pte_clear_uffd_wp(pte);
6897 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
6898 pages++;
6899 } else {
6900 /* None pte */
6901 if (unlikely(uffd_wp))
6902 /* Safe to modify directly (none->non-present). */
6903 set_huge_pte_at(mm, address, ptep,
6904 make_pte_marker(PTE_MARKER_UFFD_WP),
6905 psize);
6906 }
6907 spin_unlock(ptl);
6908 }
6909 /*
6910 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
6911 * may have cleared our pud entry and done put_page on the page table:
6912 * once we release i_mmap_rwsem, another task can do the final put_page
6913	 * and that page table may be reused and filled with junk. If we actually
6914 * did unshare a page of pmds, flush the range corresponding to the pud.
6915 */
6916 if (shared_pmd)
6917 flush_hugetlb_tlb_range(vma, range.start, range.end);
6918 else
6919 flush_hugetlb_tlb_range(vma, start, end);
6920 /*
6921	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), as we are
6922	 * downgrading page table protection, not changing it to point to a new
6923 * page.
6924 *
6925 * See Documentation/mm/mmu_notifier.rst
6926 */
6927 i_mmap_unlock_write(vma->vm_file->f_mapping);
6928 hugetlb_vma_unlock_write(vma);
6929 mmu_notifier_invalidate_range_end(&range);
6930
6931 return pages > 0 ? (pages << h->order) : pages;
6932}
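/*
 * Return value worked example (illustrative; assumes 2MB huge pages on a
 * 4KB base page, i.e. h->order == 9): if three huge ptes were changed,
 * pages == 3 and the function returns 3 << 9 == 1536, i.e. the count is
 * reported in base pages. Errors such as -ENOMEM from the pte marker
 * pre-allocation are returned unshifted.
 */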
6933
6934/* Return true if reservation was successful, false otherwise. */
6935bool hugetlb_reserve_pages(struct inode *inode,
6936 long from, long to,
6937 struct vm_area_struct *vma,
6938 vm_flags_t vm_flags)
6939{
6940 long chg = -1, add = -1;
6941 struct hstate *h = hstate_inode(inode);
6942 struct hugepage_subpool *spool = subpool_inode(inode);
6943 struct resv_map *resv_map;
6944 struct hugetlb_cgroup *h_cg = NULL;
6945 long gbl_reserve, regions_needed = 0;
6946
6947 /* This should never happen */
6948 if (from > to) {
6949 VM_WARN(1, "%s called with a negative range\n", __func__);
6950 return false;
6951 }
6952
6953 /*
6954 * vma specific semaphore used for pmd sharing and fault/truncation
6955 * synchronization
6956 */
6957 hugetlb_vma_lock_alloc(vma);
6958
6959 /*
6960 * Only apply hugepage reservation if asked. At fault time, an
6961 * attempt will be made for VM_NORESERVE to allocate a page
6962 * without using reserves
6963 */
6964 if (vm_flags & VM_NORESERVE)
6965 return true;
6966
6967 /*
6968 * Shared mappings base their reservation on the number of pages that
6969 * are already allocated on behalf of the file. Private mappings need
6970 * to reserve the full area even if read-only as mprotect() may be
6971 * called to make the mapping read-write. Assume !vma is a shm mapping
6972 */
6973 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6974 /*
6975 * resv_map can not be NULL as hugetlb_reserve_pages is only
6976 * called for inodes for which resv_maps were created (see
6977 * hugetlbfs_get_inode).
6978 */
6979 resv_map = inode_resv_map(inode);
6980
6981 chg = region_chg(resv_map, from, to, ®ions_needed);
6982 } else {
6983 /* Private mapping. */
6984 resv_map = resv_map_alloc();
6985 if (!resv_map)
6986 goto out_err;
6987
6988 chg = to - from;
6989
6990 set_vma_resv_map(vma, resv_map);
6991 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
6992 }
6993
6994 if (chg < 0)
6995 goto out_err;
6996
6997 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
6998 chg * pages_per_huge_page(h), &h_cg) < 0)
6999 goto out_err;
7000
7001 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
7002 /* For private mappings, the hugetlb_cgroup uncharge info hangs
7003		 * off the resv_map.
7004 */
7005 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
7006 }
7007
7008 /*
7009 * There must be enough pages in the subpool for the mapping. If
7010 * the subpool has a minimum size, there may be some global
7011 * reservations already in place (gbl_reserve).
7012 */
7013 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
7014 if (gbl_reserve < 0)
7015 goto out_uncharge_cgroup;
7016
7017 /*
7018	 * Check that enough hugepages are available for the reservation.
7019	 * Hand the pages back to the subpool if there are not.
7020 */
7021 if (hugetlb_acct_memory(h, gbl_reserve) < 0)
7022 goto out_put_pages;
7023
7024 /*
7025 * Account for the reservations made. Shared mappings record regions
7026 * that have reservations as they are shared by multiple VMAs.
7027 * When the last VMA disappears, the region map says how much
7028 * the reservation was and the page cache tells how much of
7029 * the reservation was consumed. Private mappings are per-VMA and
7030 * only the consumed reservations are tracked. When the VMA
7031 * disappears, the original reservation is the VMA size and the
7032 * consumed reservations are stored in the map. Hence, nothing
7033 * else has to be done for private mappings here
7034 */
7035 if (!vma || vma->vm_flags & VM_MAYSHARE) {
7036 add = region_add(resv_map, from, to, regions_needed, h, h_cg);
7037
7038 if (unlikely(add < 0)) {
7039 hugetlb_acct_memory(h, -gbl_reserve);
7040 goto out_put_pages;
7041 } else if (unlikely(chg > add)) {
7042 /*
7043 * pages in this range were added to the reserve
7044 * map between region_chg and region_add. This
7045 * indicates a race with alloc_hugetlb_folio. Adjust
7046 * the subpool and reserve counts modified above
7047 * based on the difference.
7048 */
7049 long rsv_adjust;
7050
7051 /*
7052 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
7053 * reference to h_cg->css. See comment below for detail.
7054 */
7055 hugetlb_cgroup_uncharge_cgroup_rsvd(
7056 hstate_index(h),
7057 (chg - add) * pages_per_huge_page(h), h_cg);
7058
7059 rsv_adjust = hugepage_subpool_put_pages(spool,
7060 chg - add);
7061 hugetlb_acct_memory(h, -rsv_adjust);
7062 } else if (h_cg) {
7063 /*
7064 * The file_regions will hold their own reference to
7065 * h_cg->css. So we should release the reference held
7066 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
7067 * done.
7068 */
7069 hugetlb_cgroup_put_rsvd_cgroup(h_cg);
7070 }
7071 }
7072 return true;
7073
7074out_put_pages:
7075 /* put back original number of pages, chg */
7076 (void)hugepage_subpool_put_pages(spool, chg);
7077out_uncharge_cgroup:
7078 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
7079 chg * pages_per_huge_page(h), h_cg);
7080out_err:
7081 hugetlb_vma_lock_free(vma);
7082 if (!vma || vma->vm_flags & VM_MAYSHARE)
7083 /* Only call region_abort if the region_chg succeeded but the
7084 * region_add failed or didn't run.
7085 */
7086 if (chg >= 0 && add < 0)
7087 region_abort(resv_map, from, to, regions_needed);
7088 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
7089 kref_put(&resv_map->refs, resv_map_release);
7090 set_vma_resv_map(vma, NULL);
7091 }
7092 return false;
7093}
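/*
 * Worked example for the chg > add race handled above (illustrative
 * numbers): region_chg() estimated chg == 10 huge pages, but two pages in
 * the range were reserved by a racing alloc_hugetlb_folio() before
 * region_add() ran, so add == 8. The excess of chg - add == 2 is then
 * uncharged from the rsvd cgroup counter, handed back to the subpool via
 * hugepage_subpool_put_pages(spool, 2), and subtracted from the global
 * reserve with hugetlb_acct_memory().
 */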
7094
7095long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
7096 long freed)
7097{
7098 struct hstate *h = hstate_inode(inode);
7099 struct resv_map *resv_map = inode_resv_map(inode);
7100 long chg = 0;
7101 struct hugepage_subpool *spool = subpool_inode(inode);
7102 long gbl_reserve;
7103
7104 /*
7105 * Since this routine can be called in the evict inode path for all
7106 * hugetlbfs inodes, resv_map could be NULL.
7107 */
7108 if (resv_map) {
7109 chg = region_del(resv_map, start, end);
7110 /*
7111 * region_del() can fail in the rare case where a region
7112 * must be split and another region descriptor can not be
7113 * allocated. If end == LONG_MAX, it will not fail.
7114 */
7115 if (chg < 0)
7116 return chg;
7117 }
7118
7119 spin_lock(&inode->i_lock);
7120 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
7121 spin_unlock(&inode->i_lock);
7122
7123 /*
7124 * If the subpool has a minimum size, the number of global
7125 * reservations to be released may be adjusted.
7126 *
7127 * Note that !resv_map implies freed == 0. So (chg - freed)
7128 * won't go negative.
7129 */
7130 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
7131 hugetlb_acct_memory(h, -gbl_reserve);
7132
7133 return 0;
7134}
7135
7136#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
7137static unsigned long page_table_shareable(struct vm_area_struct *svma,
7138 struct vm_area_struct *vma,
7139 unsigned long addr, pgoff_t idx)
7140{
7141 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
7142 svma->vm_start;
7143 unsigned long sbase = saddr & PUD_MASK;
7144 unsigned long s_end = sbase + PUD_SIZE;
7145
7146 /* Allow segments to share if only one is marked locked */
7147 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
7148 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
7149
7150 /*
7151	 * match the virtual addresses, permissions and the alignment of the
7152 * page table page.
7153 *
7154 * Also, vma_lock (vm_private_data) is required for sharing.
7155 */
7156 if (pmd_index(addr) != pmd_index(saddr) ||
7157 vm_flags != svm_flags ||
7158 !range_in_vma(svma, sbase, s_end) ||
7159 !svma->vm_private_data)
7160 return 0;
7161
7162 return saddr;
7163}
7164
7165bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
7166{
7167 unsigned long start = addr & PUD_MASK;
7168 unsigned long end = start + PUD_SIZE;
7169
7170#ifdef CONFIG_USERFAULTFD
7171 if (uffd_disable_huge_pmd_share(vma))
7172 return false;
7173#endif
7174 /*
7175 * check on proper vm_flags and page table alignment
7176 */
7177 if (!(vma->vm_flags & VM_MAYSHARE))
7178 return false;
7179 if (!vma->vm_private_data) /* vma lock required for sharing */
7180 return false;
7181 if (!range_in_vma(vma, start, end))
7182 return false;
7183 return true;
7184}
7185
7186/*
7187 * Determine if start,end range within vma could be mapped by shared pmd.
7188 * If yes, adjust start and end to cover range associated with possible
7189 * shared pmd mappings.
7190 */
7191void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
7192 unsigned long *start, unsigned long *end)
7193{
7194 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
7195 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
7196
7197 /*
7198 * vma needs to span at least one aligned PUD size, and the range
7199	 * must be at least partially within it.
7200 */
7201 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
7202 (*end <= v_start) || (*start >= v_end))
7203 return;
7204
7205 /* Extend the range to be PUD aligned for a worst case scenario */
7206 if (*start > v_start)
7207 *start = ALIGN_DOWN(*start, PUD_SIZE);
7208
7209 if (*end < v_end)
7210 *end = ALIGN(*end, PUD_SIZE);
7211}
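/*
 * Worked example (illustrative; assumes PUD_SIZE == 1GB as on x86-64): for
 * a VM_MAYSHARE vma spanning [1GB, 4GB), a request covering
 * [2GB + 4MB, 2GB + 8MB) is widened to
 *
 *	*start = ALIGN_DOWN(2GB + 4MB, 1GB) == 2GB
 *	*end   = ALIGN(2GB + 8MB, 1GB)      == 3GB
 *
 * because a shared PMD page maps a whole PUD-aligned region, and any flush
 * or unshare must cover all of it.
 */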
7212
7213/*
7214 * Search for a shareable pmd page for hugetlb. In any case, it calls pmd_alloc()
7215 * and returns the corresponding pte. While this is not necessary for the
7216 * !shared pmd case because we can allocate the pmd later as well, it makes the
7217 * code much cleaner. pmd allocation is essential for the shared case because
7218 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
7219 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
7220 * bad pmd for sharing.
7221 */
7222pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
7223 unsigned long addr, pud_t *pud)
7224{
7225 struct address_space *mapping = vma->vm_file->f_mapping;
7226 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
7227 vma->vm_pgoff;
7228 struct vm_area_struct *svma;
7229 unsigned long saddr;
7230 pte_t *spte = NULL;
7231 pte_t *pte;
7232
7233 i_mmap_lock_read(mapping);
7234 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
7235 if (svma == vma)
7236 continue;
7237
7238 saddr = page_table_shareable(svma, vma, addr, idx);
7239 if (saddr) {
7240 spte = hugetlb_walk(svma, saddr,
7241 vma_mmu_pagesize(svma));
7242 if (spte) {
7243 ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
7244 break;
7245 }
7246 }
7247 }
7248
7249 if (!spte)
7250 goto out;
7251
7252 spin_lock(&mm->page_table_lock);
7253 if (pud_none(*pud)) {
7254 pud_populate(mm, pud,
7255 (pmd_t *)((unsigned long)spte & PAGE_MASK));
7256 mm_inc_nr_pmds(mm);
7257 } else {
7258 ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
7259 }
7260 spin_unlock(&mm->page_table_lock);
7261out:
7262 pte = (pte_t *)pmd_alloc(mm, pud, addr);
7263 i_mmap_unlock_read(mapping);
7264 return pte;
7265}
7266
7267/*
7268 * unmap huge page backed by shared pte.
7269 *
7270 * Called with page table lock held.
7271 *
7272 * returns: 1 successfully unmapped a shared pte page
7273 * 0 the underlying pte page is not shared, or it is the last user
7274 */
7275int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
7276 unsigned long addr, pte_t *ptep)
7277{
7278 unsigned long sz = huge_page_size(hstate_vma(vma));
7279 pgd_t *pgd = pgd_offset(mm, addr);
7280 p4d_t *p4d = p4d_offset(pgd, addr);
7281 pud_t *pud = pud_offset(p4d, addr);
7282
7283 i_mmap_assert_write_locked(vma->vm_file->f_mapping);
7284 hugetlb_vma_assert_locked(vma);
7285 if (sz != PMD_SIZE)
7286 return 0;
7287 if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
7288 return 0;
7289
7290 pud_clear(pud);
7291 ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
7292 mm_dec_nr_pmds(mm);
7293 return 1;
7294}
7295
7296#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
7297
7298pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
7299 unsigned long addr, pud_t *pud)
7300{
7301 return NULL;
7302}
7303
7304int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
7305 unsigned long addr, pte_t *ptep)
7306{
7307 return 0;
7308}
7309
7310void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
7311 unsigned long *start, unsigned long *end)
7312{
7313}
7314
7315bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
7316{
7317 return false;
7318}
7319#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
7320
7321#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
7322pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
7323 unsigned long addr, unsigned long sz)
7324{
7325 pgd_t *pgd;
7326 p4d_t *p4d;
7327 pud_t *pud;
7328 pte_t *pte = NULL;
7329
7330 pgd = pgd_offset(mm, addr);
7331 p4d = p4d_alloc(mm, pgd, addr);
7332 if (!p4d)
7333 return NULL;
7334 pud = pud_alloc(mm, p4d, addr);
7335 if (pud) {
7336 if (sz == PUD_SIZE) {
7337 pte = (pte_t *)pud;
7338 } else {
7339 BUG_ON(sz != PMD_SIZE);
7340 if (want_pmd_share(vma, addr) && pud_none(*pud))
7341 pte = huge_pmd_share(mm, vma, addr, pud);
7342 else
7343 pte = (pte_t *)pmd_alloc(mm, pud, addr);
7344 }
7345 }
7346
7347 if (pte) {
7348 pte_t pteval = ptep_get_lockless(pte);
7349
7350 BUG_ON(pte_present(pteval) && !pte_huge(pteval));
7351 }
7352
7353 return pte;
7354}
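
/*
 * Note on the value returned above: for a PUD_SIZE huge page the pte_t *
 * points at the PUD entry itself, while for a PMD_SIZE huge page it points
 * at a PMD entry, possibly located in a PMD page shared with another
 * mapping of the same file (see huge_pmd_share()).
 */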
7355
7356/*
7357 * huge_pte_offset() - Walk the page table to resolve the hugepage
7358 * entry at address @addr
7359 *
7360 * Return: Pointer to page table entry (PUD or PMD) for
7361 * address @addr, or NULL if a !p*d_present() entry is encountered and the
7362 * size @sz doesn't match the hugepage size at this level of the page
7363 * table.
7364 */
7365pte_t *huge_pte_offset(struct mm_struct *mm,
7366 unsigned long addr, unsigned long sz)
7367{
7368 pgd_t *pgd;
7369 p4d_t *p4d;
7370 pud_t *pud;
7371 pmd_t *pmd;
7372
7373 pgd = pgd_offset(mm, addr);
7374 if (!pgd_present(*pgd))
7375 return NULL;
7376 p4d = p4d_offset(pgd, addr);
7377 if (!p4d_present(*p4d))
7378 return NULL;
7379
7380 pud = pud_offset(p4d, addr);
7381 if (sz == PUD_SIZE)
7382 /* must be pud huge, non-present or none */
7383 return (pte_t *)pud;
7384 if (!pud_present(*pud))
7385 return NULL;
7386 /* must have a valid entry and size to go further */
7387
7388 pmd = pmd_offset(pud, addr);
7389 /* must be pmd huge, non-present or none */
7390 return (pte_t *)pmd;
7391}
7392
7393/*
7394 * Return a mask that can be used to advance an address to the last huge
7395 * page mapped by the same page table page. Used to skip non-present
7396 * page table entries when linearly scanning address ranges. Architectures
7397 * with unique huge page to page table relationships can define their own
7398 * version of this routine.
7399 */
7400unsigned long hugetlb_mask_last_page(struct hstate *h)
7401{
7402 unsigned long hp_size = huge_page_size(h);
7403
7404 if (hp_size == PUD_SIZE)
7405 return P4D_SIZE - PUD_SIZE;
7406 else if (hp_size == PMD_SIZE)
7407 return PUD_SIZE - PMD_SIZE;
7408 else
7409 return 0UL;
7410}
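
/*
 * Usage sketch (assuming x86-64 sizes, PMD_SIZE == 2 MiB, PUD_SIZE == 1 GiB):
 * for a 2 MiB hstate the mask is PUD_SIZE - PMD_SIZE, so a scan loop that
 * finds no page table at @address can do roughly
 *
 *	address |= hugetlb_mask_last_page(h);
 *
 * and its next "address += huge_page_size(h)" step then lands on the
 * following PUD boundary instead of probing every missing PMD slot.
 */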
7411
7412#else
7413
7414/* See description above. Architectures can provide their own version. */
7415__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
7416{
7417#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
7418 if (huge_page_size(h) == PMD_SIZE)
7419 return PUD_SIZE - PMD_SIZE;
7420#endif
7421 return 0UL;
7422}
7423
7424#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
7425
7426/**
7427 * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
7428 * @folio: the folio to isolate
7429 * @list: the list to add the folio to on success
7430 *
7431 * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
7432 * isolated/non-migratable, and moving it from the active list to the
7433 * given list.
7434 *
7435 * Isolation will fail if @folio is not an allocated hugetlb folio, or if
7436 * it is already isolated/non-migratable.
7437 *
7438 * On success, an additional folio reference is taken that must be dropped
7439 * using folio_putback_hugetlb() to undo the isolation.
7440 *
7441 * Return: True if isolation worked, otherwise False.
7442 */
7443bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
7444{
7445 bool ret = true;
7446
7447 spin_lock_irq(&hugetlb_lock);
7448 if (!folio_test_hugetlb(folio) ||
7449 !folio_test_hugetlb_migratable(folio) ||
7450 !folio_try_get(folio)) {
7451 ret = false;
7452 goto unlock;
7453 }
7454 folio_clear_hugetlb_migratable(folio);
7455 list_move_tail(&folio->lru, list);
7456unlock:
7457 spin_unlock_irq(&hugetlb_lock);
7458 return ret;
7459}
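
/*
 * Sketch of the expected calling pattern: a caller builds a private list,
 * calls folio_isolate_hugetlb(folio, &list) and, if the folio ends up not
 * being migrated or otherwise consumed, undoes the isolation with
 * folio_putback_hugetlb(folio), which also drops the reference taken here.
 */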
7460
7461int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
7462{
7463 int ret = 0;
7464
7465 *hugetlb = false;
7466 spin_lock_irq(&hugetlb_lock);
7467 if (folio_test_hugetlb(folio)) {
7468 *hugetlb = true;
7469 if (folio_test_hugetlb_freed(folio))
7470 ret = 0;
7471 else if (folio_test_hugetlb_migratable(folio) || unpoison)
7472 ret = folio_try_get(folio);
7473 else
7474 ret = -EBUSY;
7475 }
7476 spin_unlock_irq(&hugetlb_lock);
7477 return ret;
7478}
7479
7480int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
7481 bool *migratable_cleared)
7482{
7483 int ret;
7484
7485 spin_lock_irq(&hugetlb_lock);
7486 ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
7487 spin_unlock_irq(&hugetlb_lock);
7488 return ret;
7489}
7490
7491/**
7492 * folio_putback_hugetlb - unisolate a hugetlb folio
7493 * @folio: the isolated hugetlb folio
7494 *
7495 * Putback/un-isolate the hugetlb folio that was previously isolated using
7496 * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
7497 * back onto the active list.
7498 *
7499 * Will drop the additional folio reference obtained through
7500 * folio_isolate_hugetlb().
7501 */
7502void folio_putback_hugetlb(struct folio *folio)
7503{
7504 spin_lock_irq(&hugetlb_lock);
7505 folio_set_hugetlb_migratable(folio);
7506 list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
7507 spin_unlock_irq(&hugetlb_lock);
7508 folio_put(folio);
7509}
7510
7511void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
7512{
7513 struct hstate *h = folio_hstate(old_folio);
7514
7515 hugetlb_cgroup_migrate(old_folio, new_folio);
7516 set_page_owner_migrate_reason(&new_folio->page, reason);
7517
7518 /*
7519	 * Transfer the temporary state of the new hugetlb folio. This is
7520	 * the reverse of other transitions because the new folio is going
7521	 * to be final while the old one will be freed, so the old folio
7522	 * takes over the temporary status.
7523	 *
7524	 * Also note that we have to transfer the per-node surplus state
7525	 * here as well, otherwise the global surplus count will not match
7526	 * the per-node counts.
7527 */
7528 if (folio_test_hugetlb_temporary(new_folio)) {
7529 int old_nid = folio_nid(old_folio);
7530 int new_nid = folio_nid(new_folio);
7531
7532 folio_set_hugetlb_temporary(old_folio);
7533 folio_clear_hugetlb_temporary(new_folio);
7534
7535
7536 /*
7537 * There is no need to transfer the per-node surplus state
7538 * when we do not cross the node.
7539 */
7540 if (new_nid == old_nid)
7541 return;
7542 spin_lock_irq(&hugetlb_lock);
7543 if (h->surplus_huge_pages_node[old_nid]) {
7544 h->surplus_huge_pages_node[old_nid]--;
7545 h->surplus_huge_pages_node[new_nid]++;
7546 }
7547 spin_unlock_irq(&hugetlb_lock);
7548 }
7549
7550 /*
7551 * Our old folio is isolated and has "migratable" cleared until it
7552	 * is put back. As migration succeeded, set the new folio "migratable"
7553 * and add it to the active list.
7554 */
7555 spin_lock_irq(&hugetlb_lock);
7556 folio_set_hugetlb_migratable(new_folio);
7557 list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
7558 spin_unlock_irq(&hugetlb_lock);
7559}
7560
7561static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
7562 unsigned long start,
7563 unsigned long end)
7564{
7565 struct hstate *h = hstate_vma(vma);
7566 unsigned long sz = huge_page_size(h);
7567 struct mm_struct *mm = vma->vm_mm;
7568 struct mmu_notifier_range range;
7569 unsigned long address;
7570 spinlock_t *ptl;
7571 pte_t *ptep;
7572
7573 if (!(vma->vm_flags & VM_MAYSHARE))
7574 return;
7575
7576 if (start >= end)
7577 return;
7578
7579 flush_cache_range(vma, start, end);
7580 /*
7581 * No need to call adjust_range_if_pmd_sharing_possible(), because
7582 * we have already done the PUD_SIZE alignment.
7583 */
7584 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
7585 start, end);
7586 mmu_notifier_invalidate_range_start(&range);
7587 hugetlb_vma_lock_write(vma);
7588 i_mmap_lock_write(vma->vm_file->f_mapping);
7589 for (address = start; address < end; address += PUD_SIZE) {
7590 ptep = hugetlb_walk(vma, address, sz);
7591 if (!ptep)
7592 continue;
7593 ptl = huge_pte_lock(h, mm, ptep);
7594 huge_pmd_unshare(mm, vma, address, ptep);
7595 spin_unlock(ptl);
7596 }
7597 flush_hugetlb_tlb_range(vma, start, end);
7598 i_mmap_unlock_write(vma->vm_file->f_mapping);
7599 hugetlb_vma_unlock_write(vma);
7600 /*
7601 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
7602 * Documentation/mm/mmu_notifier.rst.
7603 */
7604 mmu_notifier_invalidate_range_end(&range);
7605}
7606
7607/*
7608 * This function will unconditionally remove all the shared pmd pgtable entries
7609 * within the given vma over its hugetlbfs memory range.
7610 */
7611void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
7612{
7613 hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
7614 ALIGN_DOWN(vma->vm_end, PUD_SIZE));
7615}
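
/*
 * The ALIGN()/ALIGN_DOWN() above restrict the walk to PUD regions fully
 * contained in the vma; partial head/tail regions cannot hold shared PMD
 * pages (see the range_in_vma() checks in page_table_shareable() and
 * want_pmd_share()), so they are skipped.
 */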
7616
7617#ifdef CONFIG_CMA
7618static bool cma_reserve_called __initdata;
7619
7620static int __init cmdline_parse_hugetlb_cma(char *p)
7621{
7622 int nid, count = 0;
7623 unsigned long tmp;
7624 char *s = p;
7625
7626 while (*s) {
7627 if (sscanf(s, "%lu%n", &tmp, &count) != 1)
7628 break;
7629
7630 if (s[count] == ':') {
7631 if (tmp >= MAX_NUMNODES)
7632 break;
7633 nid = array_index_nospec(tmp, MAX_NUMNODES);
7634
7635 s += count + 1;
7636 tmp = memparse(s, &s);
7637 hugetlb_cma_size_in_node[nid] = tmp;
7638 hugetlb_cma_size += tmp;
7639
7640 /*
7641			 * Skip the separator if we have one; otherwise
7642			 * stop parsing.
7643 */
7644 if (*s == ',')
7645 s++;
7646 else
7647 break;
7648 } else {
7649 hugetlb_cma_size = memparse(p, &p);
7650 break;
7651 }
7652 }
7653
7654 return 0;
7655}
7656
7657early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
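
/*
 * Accepted forms, as parsed above. For example:
 *	hugetlb_cma=4G		reserve 4 GiB spread across online nodes
 *	hugetlb_cma=0:2G,1:2G	reserve 2 GiB on node 0 and 2 GiB on node 1
 */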
7658
7659void __init hugetlb_cma_reserve(int order)
7660{
7661 unsigned long size, reserved, per_node;
7662 bool node_specific_cma_alloc = false;
7663 int nid;
7664
7665 /*
7666 * HugeTLB CMA reservation is required for gigantic
7667	 * huge pages which cannot be allocated via the
7668	 * page allocator. Just warn if there is any change
7669 * breaking this assumption.
7670 */
7671 VM_WARN_ON(order <= MAX_PAGE_ORDER);
7672 cma_reserve_called = true;
7673
7674 if (!hugetlb_cma_size)
7675 return;
7676
7677 for (nid = 0; nid < MAX_NUMNODES; nid++) {
7678 if (hugetlb_cma_size_in_node[nid] == 0)
7679 continue;
7680
7681 if (!node_online(nid)) {
7682 pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
7683 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7684 hugetlb_cma_size_in_node[nid] = 0;
7685 continue;
7686 }
7687
7688 if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
7689 pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
7690 nid, (PAGE_SIZE << order) / SZ_1M);
7691 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7692 hugetlb_cma_size_in_node[nid] = 0;
7693 } else {
7694 node_specific_cma_alloc = true;
7695 }
7696 }
7697
7698	/* Validate the CMA size again in case invalid nodes were specified. */
7699 if (!hugetlb_cma_size)
7700 return;
7701
7702 if (hugetlb_cma_size < (PAGE_SIZE << order)) {
7703 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
7704 (PAGE_SIZE << order) / SZ_1M);
7705 hugetlb_cma_size = 0;
7706 return;
7707 }
7708
7709 if (!node_specific_cma_alloc) {
7710 /*
7711		 * If a 3 GB area is requested on a machine with 4 NUMA nodes,
7712		 * allocate 1 GB on each of the first three nodes and ignore the last one.
7713 */
7714 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
7715 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
7716 hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
7717 }
7718
7719 reserved = 0;
7720 for_each_online_node(nid) {
7721 int res;
7722 char name[CMA_MAX_NAME];
7723
7724 if (node_specific_cma_alloc) {
7725 if (hugetlb_cma_size_in_node[nid] == 0)
7726 continue;
7727
7728 size = hugetlb_cma_size_in_node[nid];
7729 } else {
7730 size = min(per_node, hugetlb_cma_size - reserved);
7731 }
7732
7733 size = round_up(size, PAGE_SIZE << order);
7734
7735 snprintf(name, sizeof(name), "hugetlb%d", nid);
7736 /*
7737		 * Note that 'order per bit' is based on the smallest size that
7738		 * may be returned to the CMA allocator in the case of
7739 * huge page demotion.
7740 */
7741 res = cma_declare_contiguous_nid(0, size, 0,
7742 PAGE_SIZE << order,
7743 HUGETLB_PAGE_ORDER, false, name,
7744 &hugetlb_cma[nid], nid);
7745 if (res) {
7746			pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
7747 res, nid);
7748 continue;
7749 }
7750
7751 reserved += size;
7752 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7753 size / SZ_1M, nid);
7754
7755 if (reserved >= hugetlb_cma_size)
7756 break;
7757 }
7758
7759 if (!reserved)
7760 /*
7761 * hugetlb_cma_size is used to determine if allocations from
7762 * cma are possible. Set to zero if no cma regions are set up.
7763 */
7764 hugetlb_cma_size = 0;
7765}
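
/*
 * Worked example of the per-node split above, assuming 1 GiB gigantic pages
 * (PAGE_SIZE << order == 1 GiB): with hugetlb_cma=3G on a machine with four
 * online nodes, per_node = DIV_ROUND_UP(3 GiB, 4) = 768 MiB, which round_up()
 * bumps to 1 GiB per node. After nodes 0-2 have reserved 3 GiB in total the
 * loop breaks, so the last node gets no reservation.
 */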
7766
7767static void __init hugetlb_cma_check(void)
7768{
7769 if (!hugetlb_cma_size || cma_reserve_called)
7770 return;
7771
7772	pr_warn("hugetlb_cma: the option isn't supported by the current architecture\n");
7773}
7774
7775#endif /* CONFIG_CMA */