mm/internal.h at v6.6-rc5 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / internal.h
at v6.6-rc5 1157 lines 36 kB view raw
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/* internal.h: mm/ internal definitions
   3 *
   4 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
   5 * Written by David Howells (dhowells@redhat.com)
   6 */
   7#ifndef __MM_INTERNAL_H
   8#define __MM_INTERNAL_H
   9
  10#include <linux/fs.h>
  11#include <linux/mm.h>
  12#include <linux/pagemap.h>
  13#include <linux/rmap.h>
  14#include <linux/tracepoint-defs.h>
  15
  16struct folio_batch;
  17
  18/*
  19 * The set of flags that only affect watermark checking and reclaim
  20 * behaviour. This is used by the MM to obey the caller constraints
  21 * about IO, FS and watermark checking while ignoring placement
  22 * hints such as HIGHMEM usage.
  23 */
  24#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
  25			__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
  26			__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
  27			__GFP_NOLOCKDEP)
  28
  29/* The GFP flags allowed during early boot */
  30#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
  31
  32/* Control allocation cpuset and node placement constraints */
  33#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
  34
  35/* Do not use these with a slab allocator */
  36#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
  37
  38/*
  39 * Different from WARN_ON_ONCE(), no warning will be issued
  40 * when we specify __GFP_NOWARN.
  41 */
  42#define WARN_ON_ONCE_GFP(cond, gfp)	({				\
  43	static bool __section(".data.once") __warned;			\
  44	int __ret_warn_once = !!(cond);					\
  45									\
  46	if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
  47		__warned = true;					\
  48		WARN_ON(1);						\
  49	}								\
  50	unlikely(__ret_warn_once);					\
  51})
  52
  53void page_writeback_init(void);
  54
  55/*
  56 * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
  57 * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit
  58 * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
  59 * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
  60 */
  61#define COMPOUND_MAPPED		0x800000
  62#define FOLIO_PAGES_MAPPED	(COMPOUND_MAPPED - 1)
  63
  64/*
  65 * Flags passed to __show_mem() and show_free_areas() to suppress output in
  66 * various contexts.
  67 */
  68#define SHOW_MEM_FILTER_NODES		(0x0001u)	/* disallowed nodes */
  69
  70/*
  71 * How many individual pages have an elevated _mapcount.  Excludes
  72 * the folio's entire_mapcount.
  73 */
  74static inline int folio_nr_pages_mapped(struct folio *folio)
  75{
  76	return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
  77}
  78
  79static inline void *folio_raw_mapping(struct folio *folio)
  80{
  81	unsigned long mapping = (unsigned long)folio->mapping;
  82
  83	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
  84}
  85
  86void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
  87						int nr_throttled);
  88static inline void acct_reclaim_writeback(struct folio *folio)
  89{
  90	pg_data_t *pgdat = folio_pgdat(folio);
  91	int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
  92
  93	if (nr_throttled)
  94		__acct_reclaim_writeback(pgdat, folio, nr_throttled);
  95}
  96
  97static inline void wake_throttle_isolated(pg_data_t *pgdat)
  98{
  99	wait_queue_head_t *wqh;
 100
 101	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
 102	if (waitqueue_active(wqh))
 103		wake_up(wqh);
 104}
 105
 106vm_fault_t do_swap_page(struct vm_fault *vmf);
 107void folio_rotate_reclaimable(struct folio *folio);
 108bool __folio_end_writeback(struct folio *folio);
 109void deactivate_file_folio(struct folio *folio);
 110void folio_activate(struct folio *folio);
 111
 112void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 113		   struct vm_area_struct *start_vma, unsigned long floor,
 114		   unsigned long ceiling, bool mm_wr_locked);
 115void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 116
 117struct zap_details;
 118void unmap_page_range(struct mmu_gather *tlb,
 119			     struct vm_area_struct *vma,
 120			     unsigned long addr, unsigned long end,
 121			     struct zap_details *details);
 122
 123void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
 124		unsigned int order);
 125void force_page_cache_ra(struct readahead_control *, unsigned long nr);
 126static inline void force_page_cache_readahead(struct address_space *mapping,
 127		struct file *file, pgoff_t index, unsigned long nr_to_read)
 128{
 129	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
 130	force_page_cache_ra(&ractl, nr_to_read);
 131}
 132
 133unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
 134		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
 135unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
 136		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
 137void filemap_free_folio(struct address_space *mapping, struct folio *folio);
 138int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
 139bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
 140		loff_t end);
 141long invalidate_inode_page(struct page *page);
 142unsigned long mapping_try_invalidate(struct address_space *mapping,
 143		pgoff_t start, pgoff_t end, unsigned long *nr_failed);
 144
 145/**
 146 * folio_evictable - Test whether a folio is evictable.
 147 * @folio: The folio to test.
 148 *
 149 * Test whether @folio is evictable -- i.e., should be placed on
 150 * active/inactive lists vs unevictable list.
 151 *
 152 * Reasons folio might not be evictable:
 153 * 1. folio's mapping marked unevictable
 154 * 2. One of the pages in the folio is part of an mlocked VMA
 155 */
 156static inline bool folio_evictable(struct folio *folio)
 157{
 158	bool ret;
 159
 160	/* Prevent address_space of inode and swap cache from being freed */
 161	rcu_read_lock();
 162	ret = !mapping_unevictable(folio_mapping(folio)) &&
 163			!folio_test_mlocked(folio);
 164	rcu_read_unlock();
 165	return ret;
 166}
 167
 168/*
 169 * Turn a non-refcounted page (->_refcount == 0) into refcounted with
 170 * a count of one.
 171 */
 172static inline void set_page_refcounted(struct page *page)
 173{
 174	VM_BUG_ON_PAGE(PageTail(page), page);
 175	VM_BUG_ON_PAGE(page_ref_count(page), page);
 176	set_page_count(page, 1);
 177}
 178
 179/*
 180 * Return true if a folio needs ->release_folio() calling upon it.
 181 */
 182static inline bool folio_needs_release(struct folio *folio)
 183{
 184	struct address_space *mapping = folio_mapping(folio);
 185
 186	return folio_has_private(folio) ||
 187		(mapping && mapping_release_always(mapping));
 188}
 189
 190extern unsigned long highest_memmap_pfn;
 191
 192/*
 193 * Maximum number of reclaim retries without progress before the OOM
  194 * killer is considered the only way forward.
 195 */
 196#define MAX_RECLAIM_RETRIES 16
 197
 198/*
 199 * in mm/vmscan.c:
 200 */
 201bool isolate_lru_page(struct page *page);
 202bool folio_isolate_lru(struct folio *folio);
 203void putback_lru_page(struct page *page);
 204void folio_putback_lru(struct folio *folio);
 205extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
 206
 207/*
 208 * in mm/rmap.c:
 209 */
 210pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 211
 212/*
 213 * in mm/page_alloc.c
 214 */
 215#define K(x) ((x) << (PAGE_SHIFT-10))
 216
 217extern char * const zone_names[MAX_NR_ZONES];
 218
 219/* perform sanity checks on struct pages being allocated or freed */
 220DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
 221
 222extern int min_free_kbytes;
 223
 224void setup_per_zone_wmarks(void);
 225void calculate_min_free_kbytes(void);
 226int __meminit init_per_zone_wmark_min(void);
 227void page_alloc_sysctl_init(void);
 228
 229/*
 230 * Structure for holding the mostly immutable allocation parameters passed
 231 * between functions involved in allocations, including the alloc_pages*
 232 * family of functions.
 233 *
 234 * nodemask, migratetype and highest_zoneidx are initialized only once in
 235 * __alloc_pages() and then never change.
 236 *
 237 * zonelist, preferred_zone and highest_zoneidx are set first in
 238 * __alloc_pages() for the fast path, and might be later changed
 239 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 240 * by a const pointer.
 241 */
 242struct alloc_context {
 243	struct zonelist *zonelist;
 244	nodemask_t *nodemask;
 245	struct zoneref *preferred_zoneref;
 246	int migratetype;
 247
 248	/*
 249	 * highest_zoneidx represents highest usable zone index of
 250	 * the allocation request. Due to the nature of the zone,
 251	 * memory on lower zone than the highest_zoneidx will be
 252	 * protected by lowmem_reserve[highest_zoneidx].
 253	 *
 254	 * highest_zoneidx is also used by reclaim/compaction to limit
 255	 * the target zone since higher zone than this index cannot be
 256	 * usable for this allocation request.
 257	 */
 258	enum zone_type highest_zoneidx;
 259	bool spread_dirty_pages;
 260};
 261
 262/*
 263 * This function returns the order of a free page in the buddy system. In
 264 * general, page_zone(page)->lock must be held by the caller to prevent the
 265 * page from being allocated in parallel and returning garbage as the order.
 266 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 267 * page cannot be allocated or merged in parallel. Alternatively, it must
 268 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 269 */
 270static inline unsigned int buddy_order(struct page *page)
 271{
 272	/* PageBuddy() must be checked by the caller */
 273	return page_private(page);
 274}
 275
 276/*
 277 * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
 278 * PageBuddy() should be checked first by the caller to minimize race window,
 279 * and invalid values must be handled gracefully.
 280 *
 281 * READ_ONCE is used so that if the caller assigns the result into a local
 282 * variable and e.g. tests it for valid range before using, the compiler cannot
 283 * decide to remove the variable and inline the page_private(page) multiple
 284 * times, potentially observing different values in the tests and the actual
 285 * use of the result.
 286 */
 287#define buddy_order_unsafe(page)	READ_ONCE(page_private(page))
 288
 289/*
 290 * This function checks whether a page is free && is the buddy
 291 * we can coalesce a page and its buddy if
 292 * (a) the buddy is not in a hole (check before calling!) &&
 293 * (b) the buddy is in the buddy system &&
 294 * (c) a page and its buddy have the same order &&
 295 * (d) a page and its buddy are in the same zone.
 296 *
 297 * For recording whether a page is in the buddy system, we set PageBuddy.
 298 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 299 *
 300 * For recording page's order, we use page_private(page).
 301 */
 302static inline bool page_is_buddy(struct page *page, struct page *buddy,
 303				 unsigned int order)
 304{
 305	if (!page_is_guard(buddy) && !PageBuddy(buddy))
 306		return false;
 307
 308	if (buddy_order(buddy) != order)
 309		return false;
 310
 311	/*
 312	 * zone check is done late to avoid uselessly calculating
 313	 * zone/node ids for pages that could never merge.
 314	 */
 315	if (page_zone_id(page) != page_zone_id(buddy))
 316		return false;
 317
 318	VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
 319
 320	return true;
 321}
 322
 323/*
 324 * Locate the struct page for both the matching buddy in our
 325 * pair (buddy1) and the combined O(n+1) page they form (page).
 326 *
 327 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 328 * the following equation:
 329 *     B2 = B1 ^ (1 << O)
 330 * For example, if the starting buddy (buddy2) is #8 its order
 331 * 1 buddy is #10:
 332 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 333 *
 334 * 2) Any buddy B will have an order O+1 parent P which
 335 * satisfies the following equation:
 336 *     P = B & ~(1 << O)
 337 *
 338 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 339 */
 340static inline unsigned long
 341__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
 342{
 343	return page_pfn ^ (1 << order);
 344}
 345
 346/*
 347 * Find the buddy of @page and validate it.
 348 * @page: The input page
 349 * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
 350 *       function is used in the performance-critical __free_one_page().
 351 * @order: The order of the page
 352 * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
 353 *             page_to_pfn().
 354 *
 355 * The found buddy can be a non PageBuddy, out of @page's zone, or its order is
 356 * not the same as @page. The validation is necessary before use it.
 357 *
 358 * Return: the found buddy page or NULL if not found.
 359 */
 360static inline struct page *find_buddy_page_pfn(struct page *page,
 361			unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
 362{
 363	unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
 364	struct page *buddy;
 365
 366	buddy = page + (__buddy_pfn - pfn);
 367	if (buddy_pfn)
 368		*buddy_pfn = __buddy_pfn;
 369
 370	if (page_is_buddy(page, buddy, order))
 371		return buddy;
 372	return NULL;
 373}
 374
 375extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
 376				unsigned long end_pfn, struct zone *zone);
 377
 378static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
 379				unsigned long end_pfn, struct zone *zone)
 380{
 381	if (zone->contiguous)
 382		return pfn_to_page(start_pfn);
 383
 384	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
 385}
 386
 387void set_zone_contiguous(struct zone *zone);
 388
 389static inline void clear_zone_contiguous(struct zone *zone)
 390{
 391	zone->contiguous = false;
 392}
 393
 394extern int __isolate_free_page(struct page *page, unsigned int order);
 395extern void __putback_isolated_page(struct page *page, unsigned int order,
 396				    int mt);
 397extern void memblock_free_pages(struct page *page, unsigned long pfn,
 398					unsigned int order);
 399extern void __free_pages_core(struct page *page, unsigned int order);
 400
 401/*
 402 * This will have no effect, other than possibly generating a warning, if the
 403 * caller passes in a non-large folio.
 404 */
 405static inline void folio_set_order(struct folio *folio, unsigned int order)
 406{
 407	if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
 408		return;
 409
 410	folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
 411#ifdef CONFIG_64BIT
 412	folio->_folio_nr_pages = 1U << order;
 413#endif
 414}
 415
 416void folio_undo_large_rmappable(struct folio *folio);
 417
 418static inline void prep_compound_head(struct page *page, unsigned int order)
 419{
 420	struct folio *folio = (struct folio *)page;
 421
 422	folio_set_order(folio, order);
 423	atomic_set(&folio->_entire_mapcount, -1);
 424	atomic_set(&folio->_nr_pages_mapped, 0);
 425	atomic_set(&folio->_pincount, 0);
 426}
 427
 428static inline void prep_compound_tail(struct page *head, int tail_idx)
 429{
 430	struct page *p = head + tail_idx;
 431
 432	p->mapping = TAIL_MAPPING;
 433	set_compound_head(p, head);
 434	set_page_private(p, 0);
 435}
 436
 437extern void prep_compound_page(struct page *page, unsigned int order);
 438
 439extern void post_alloc_hook(struct page *page, unsigned int order,
 440					gfp_t gfp_flags);
 441extern int user_min_free_kbytes;
 442
 443extern void free_unref_page(struct page *page, unsigned int order);
 444extern void free_unref_page_list(struct list_head *list);
 445
 446extern void zone_pcp_reset(struct zone *zone);
 447extern void zone_pcp_disable(struct zone *zone);
 448extern void zone_pcp_enable(struct zone *zone);
 449extern void zone_pcp_init(struct zone *zone);
 450
 451extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
 452			  phys_addr_t min_addr,
 453			  int nid, bool exact_nid);
 454
 455void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
 456		unsigned long, enum meminit_context, struct vmem_altmap *, int);
 457
 458
 459int split_free_page(struct page *free_page,
 460			unsigned int order, unsigned long split_pfn_offset);
 461
 462#if defined CONFIG_COMPACTION || defined CONFIG_CMA
 463
 464/*
 465 * in mm/compaction.c
 466 */
 467/*
 468 * compact_control is used to track pages being migrated and the free pages
 469 * they are being migrated to during memory compaction. The free_pfn starts
 470 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 471 * are moved to the end of a zone during a compaction run and the run
 472 * completes when free_pfn <= migrate_pfn
 473 */
 474struct compact_control {
 475	struct list_head freepages;	/* List of free pages to migrate to */
 476	struct list_head migratepages;	/* List of pages being migrated */
 477	unsigned int nr_freepages;	/* Number of isolated free pages */
 478	unsigned int nr_migratepages;	/* Number of pages to migrate */
 479	unsigned long free_pfn;		/* isolate_freepages search base */
 480	/*
 481	 * Acts as an in/out parameter to page isolation for migration.
 482	 * isolate_migratepages uses it as a search base.
 483	 * isolate_migratepages_block will update the value to the next pfn
 484	 * after the last isolated one.
 485	 */
 486	unsigned long migrate_pfn;
 487	unsigned long fast_start_pfn;	/* a pfn to start linear scan from */
 488	struct zone *zone;
 489	unsigned long total_migrate_scanned;
 490	unsigned long total_free_scanned;
 491	unsigned short fast_search_fail;/* failures to use free list searches */
 492	short search_order;		/* order to start a fast search at */
 493	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 494	int order;			/* order a direct compactor needs */
 495	int migratetype;		/* migratetype of direct compactor */
 496	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
 497	const int highest_zoneidx;	/* zone index of a direct compactor */
 498	enum migrate_mode mode;		/* Async or sync migration mode */
 499	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 500	bool no_set_skip_hint;		/* Don't mark blocks for skipping */
 501	bool ignore_block_suitable;	/* Scan blocks considered unsuitable */
 502	bool direct_compaction;		/* False from kcompactd or /proc/... */
 503	bool proactive_compaction;	/* kcompactd proactive compaction */
 504	bool whole_zone;		/* Whole zone should/has been scanned */
 505	bool contended;			/* Signal lock contention */
 506	bool finish_pageblock;		/* Scan the remainder of a pageblock. Used
 507					 * when there are potentially transient
 508					 * isolation or migration failures to
 509					 * ensure forward progress.
 510					 */
 511	bool alloc_contig;		/* alloc_contig_range allocation */
 512};
 513
 514/*
 515 * Used in direct compaction when a page should be taken from the freelists
 516 * immediately when one is created during the free path.
 517 */
 518struct capture_control {
 519	struct compact_control *cc;
 520	struct page *page;
 521};
 522
 523unsigned long
 524isolate_freepages_range(struct compact_control *cc,
 525			unsigned long start_pfn, unsigned long end_pfn);
 526int
 527isolate_migratepages_range(struct compact_control *cc,
 528			   unsigned long low_pfn, unsigned long end_pfn);
 529
 530int __alloc_contig_migrate_range(struct compact_control *cc,
 531					unsigned long start, unsigned long end);
 532
 533/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 534void init_cma_reserved_pageblock(struct page *page);
 535
 536#endif /* CONFIG_COMPACTION || CONFIG_CMA */
 537
 538int find_suitable_fallback(struct free_area *area, unsigned int order,
 539			int migratetype, bool only_stealable, bool *can_steal);
 540
 541static inline bool free_area_empty(struct free_area *area, int migratetype)
 542{
 543	return list_empty(&area->free_list[migratetype]);
 544}
 545
 546/*
  547 * These three helpers classify VMAs for virtual memory accounting.
 548 */
 549
 550/*
 551 * Executable code area - executable, not writable, not stack
 552 */
 553static inline bool is_exec_mapping(vm_flags_t flags)
 554{
 555	return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
 556}
 557
 558/*
 559 * Stack area (including shadow stacks)
 560 *
 561 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 562 * do_mmap() forbids all other combinations.
 563 */
 564static inline bool is_stack_mapping(vm_flags_t flags)
 565{
 566	return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
 567}
 568
 569/*
 570 * Data area - private, writable, not stack
 571 */
 572static inline bool is_data_mapping(vm_flags_t flags)
 573{
 574	return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
 575}
 576
 577/* mm/util.c */
 578struct anon_vma *folio_anon_vma(struct folio *folio);
 579
 580#ifdef CONFIG_MMU
 581void unmap_mapping_folio(struct folio *folio);
 582extern long populate_vma_page_range(struct vm_area_struct *vma,
 583		unsigned long start, unsigned long end, int *locked);
 584extern long faultin_vma_page_range(struct vm_area_struct *vma,
 585				   unsigned long start, unsigned long end,
 586				   bool write, int *locked);
 587extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
 588			       unsigned long bytes);
 589/*
 590 * mlock_vma_folio() and munlock_vma_folio():
 591 * should be called with vma's mmap_lock held for read or write,
 592 * under page table lock for the pte/pmd being added or removed.
 593 *
 594 * mlock is usually called at the end of page_add_*_rmap(), munlock at
 595 * the end of page_remove_rmap(); but new anon folios are managed by
 596 * folio_add_lru_vma() calling mlock_new_folio().
 597 *
 598 * @compound is used to include pmd mappings of THPs, but filter out
 599 * pte mappings of THPs, which cannot be consistently counted: a pte
 600 * mapping of the THP head cannot be distinguished by the page alone.
 601 */
 602void mlock_folio(struct folio *folio);
 603static inline void mlock_vma_folio(struct folio *folio,
 604			struct vm_area_struct *vma, bool compound)
 605{
 606	/*
 607	 * The VM_SPECIAL check here serves two purposes.
 608	 * 1) VM_IO check prevents migration from double-counting during mlock.
 609	 * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
 610	 *    is never left set on a VM_SPECIAL vma, there is an interval while
 611	 *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
 612	 *    still be set while VM_SPECIAL bits are added: so ignore it then.
 613	 */
 614	if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
 615	    (compound || !folio_test_large(folio)))
 616		mlock_folio(folio);
 617}
 618
 619void munlock_folio(struct folio *folio);
 620static inline void munlock_vma_folio(struct folio *folio,
 621			struct vm_area_struct *vma, bool compound)
 622{
 623	if (unlikely(vma->vm_flags & VM_LOCKED) &&
 624	    (compound || !folio_test_large(folio)))
 625		munlock_folio(folio);
 626}
 627
 628void mlock_new_folio(struct folio *folio);
 629bool need_mlock_drain(int cpu);
 630void mlock_drain_local(void);
 631void mlock_drain_remote(int cpu);
 632
 633extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 634
 635/*
 636 * Return the start of user virtual address at the specific offset within
 637 * a vma.
 638 */
 639static inline unsigned long
 640vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
 641		  struct vm_area_struct *vma)
 642{
 643	unsigned long address;
 644
 645	if (pgoff >= vma->vm_pgoff) {
 646		address = vma->vm_start +
 647			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 648		/* Check for address beyond vma (or wrapped through 0?) */
 649		if (address < vma->vm_start || address >= vma->vm_end)
 650			address = -EFAULT;
 651	} else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
 652		/* Test above avoids possibility of wrap to 0 on 32-bit */
 653		address = vma->vm_start;
 654	} else {
 655		address = -EFAULT;
 656	}
 657	return address;
 658}
 659
 660/*
 661 * Return the start of user virtual address of a page within a vma.
 662 * Returns -EFAULT if all of the page is outside the range of vma.
 663 * If page is a compound head, the entire compound page is considered.
 664 */
 665static inline unsigned long
 666vma_address(struct page *page, struct vm_area_struct *vma)
 667{
 668	VM_BUG_ON_PAGE(PageKsm(page), page);	/* KSM page->index unusable */
 669	return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
 670}
 671
 672/*
 673 * Then at what user virtual address will none of the range be found in vma?
 674 * Assumes that vma_address() already returned a good starting address.
 675 */
 676static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
 677{
 678	struct vm_area_struct *vma = pvmw->vma;
 679	pgoff_t pgoff;
 680	unsigned long address;
 681
 682	/* Common case, plus ->pgoff is invalid for KSM */
 683	if (pvmw->nr_pages == 1)
 684		return pvmw->address + PAGE_SIZE;
 685
 686	pgoff = pvmw->pgoff + pvmw->nr_pages;
 687	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 688	/* Check for address beyond vma (or wrapped through 0?) */
 689	if (address < vma->vm_start || address > vma->vm_end)
 690		address = vma->vm_end;
 691	return address;
 692}
 693
 694static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
 695						    struct file *fpin)
 696{
 697	int flags = vmf->flags;
 698
 699	if (fpin)
 700		return fpin;
 701
 702	/*
 703	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
 704	 * anything, so we only pin the file and drop the mmap_lock if only
 705	 * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
 706	 */
 707	if (fault_flag_allow_retry_first(flags) &&
 708	    !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
 709		fpin = get_file(vmf->vma->vm_file);
 710		release_fault_lock(vmf);
 711	}
 712	return fpin;
 713}
 714#else /* !CONFIG_MMU */
 715static inline void unmap_mapping_folio(struct folio *folio) { }
 716static inline void mlock_new_folio(struct folio *folio) { }
 717static inline bool need_mlock_drain(int cpu) { return false; }
 718static inline void mlock_drain_local(void) { }
 719static inline void mlock_drain_remote(int cpu) { }
 720static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
 721{
 722}
 723#endif /* !CONFIG_MMU */
 724
 725/* Memory initialisation debug and verification */
 726#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 727DECLARE_STATIC_KEY_TRUE(deferred_pages);
 728
 729bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
 730#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 731
 732enum mminit_level {
 733	MMINIT_WARNING,
 734	MMINIT_VERIFY,
 735	MMINIT_TRACE
 736};
 737
 738#ifdef CONFIG_DEBUG_MEMORY_INIT
 739
 740extern int mminit_loglevel;
 741
 742#define mminit_dprintk(level, prefix, fmt, arg...) \
 743do { \
 744	if (level < mminit_loglevel) { \
 745		if (level <= MMINIT_WARNING) \
 746			pr_warn("mminit::" prefix " " fmt, ##arg);	\
 747		else \
 748			printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
 749	} \
 750} while (0)
 751
 752extern void mminit_verify_pageflags_layout(void);
 753extern void mminit_verify_zonelist(void);
 754#else
 755
 756static inline void mminit_dprintk(enum mminit_level level,
 757				const char *prefix, const char *fmt, ...)
 758{
 759}
 760
 761static inline void mminit_verify_pageflags_layout(void)
 762{
 763}
 764
 765static inline void mminit_verify_zonelist(void)
 766{
 767}
 768#endif /* CONFIG_DEBUG_MEMORY_INIT */
 769
 770#define NODE_RECLAIM_NOSCAN	-2
 771#define NODE_RECLAIM_FULL	-1
 772#define NODE_RECLAIM_SOME	0
 773#define NODE_RECLAIM_SUCCESS	1
 774
 775#ifdef CONFIG_NUMA
 776extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
 777extern int find_next_best_node(int node, nodemask_t *used_node_mask);
 778#else
 779static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
 780				unsigned int order)
 781{
 782	return NODE_RECLAIM_NOSCAN;
 783}
 784static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
 785{
 786	return NUMA_NO_NODE;
 787}
 788#endif
 789
 790/*
 791 * mm/memory-failure.c
 792 */
 793extern int hwpoison_filter(struct page *p);
 794
 795extern u32 hwpoison_filter_dev_major;
 796extern u32 hwpoison_filter_dev_minor;
 797extern u64 hwpoison_filter_flags_mask;
 798extern u64 hwpoison_filter_flags_value;
 799extern u64 hwpoison_filter_memcg;
 800extern u32 hwpoison_filter_enable;
 801
 802extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
 803        unsigned long, unsigned long,
 804        unsigned long, unsigned long);
 805
 806extern void set_pageblock_order(void);
 807unsigned long reclaim_pages(struct list_head *folio_list);
 808unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 809					    struct list_head *folio_list);
 810/* The ALLOC_WMARK bits are used as an index to zone->watermark */
 811#define ALLOC_WMARK_MIN		WMARK_MIN
 812#define ALLOC_WMARK_LOW		WMARK_LOW
 813#define ALLOC_WMARK_HIGH	WMARK_HIGH
 814#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */
 815
 816/* Mask to get the watermark bits */
 817#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
 818
 819/*
 820 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 821 * cannot assume a reduced access to memory reserves is sufficient for
 822 * !MMU
 823 */
 824#ifdef CONFIG_MMU
 825#define ALLOC_OOM		0x08
 826#else
 827#define ALLOC_OOM		ALLOC_NO_WATERMARKS
 828#endif
 829
 830#define ALLOC_NON_BLOCK		 0x10 /* Caller cannot block. Allow access
 831				       * to 25% of the min watermark or
 832				       * 62.5% if __GFP_HIGH is set.
 833				       */
 834#define ALLOC_MIN_RESERVE	 0x20 /* __GFP_HIGH set. Allow access to 50%
 835				       * of the min watermark.
 836				       */
 837#define ALLOC_CPUSET		 0x40 /* check for correct cpuset */
 838#define ALLOC_CMA		 0x80 /* allow allocations from CMA areas */
 839#ifdef CONFIG_ZONE_DMA32
 840#define ALLOC_NOFRAGMENT	0x100 /* avoid mixing pageblock types */
 841#else
 842#define ALLOC_NOFRAGMENT	  0x0
 843#endif
 844#define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
 845#define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
 846
 847/* Flags that allow allocations below the min watermark. */
 848#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
 849
 850enum ttu_flags;
 851struct tlbflush_unmap_batch;
 852
 853
 854/*
 855 * only for MM internal work items which do not depend on
 856 * any allocations or locks which might depend on allocations
 857 */
 858extern struct workqueue_struct *mm_percpu_wq;
 859
 860#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 861void try_to_unmap_flush(void);
 862void try_to_unmap_flush_dirty(void);
 863void flush_tlb_batched_pending(struct mm_struct *mm);
 864#else
 865static inline void try_to_unmap_flush(void)
 866{
 867}
 868static inline void try_to_unmap_flush_dirty(void)
 869{
 870}
 871static inline void flush_tlb_batched_pending(struct mm_struct *mm)
 872{
 873}
 874#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 875
 876extern const struct trace_print_flags pageflag_names[];
 877extern const struct trace_print_flags pagetype_names[];
 878extern const struct trace_print_flags vmaflag_names[];
 879extern const struct trace_print_flags gfpflag_names[];
 880
 881static inline bool is_migrate_highatomic(enum migratetype migratetype)
 882{
 883	return migratetype == MIGRATE_HIGHATOMIC;
 884}
 885
 886static inline bool is_migrate_highatomic_page(struct page *page)
 887{
 888	return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
 889}
 890
 891void setup_zone_pageset(struct zone *zone);
 892
 893struct migration_target_control {
 894	int nid;		/* preferred node id */
 895	nodemask_t *nmask;
 896	gfp_t gfp_mask;
 897};
 898
 899/*
 900 * mm/filemap.c
 901 */
 902size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
 903			      struct folio *folio, loff_t fpos, size_t size);
 904
 905/*
 906 * mm/vmalloc.c
 907 */
 908#ifdef CONFIG_MMU
 909void __init vmalloc_init(void);
 910int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 911                pgprot_t prot, struct page **pages, unsigned int page_shift);
 912#else
 913static inline void vmalloc_init(void)
 914{
 915}
 916
 917static inline
 918int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 919                pgprot_t prot, struct page **pages, unsigned int page_shift)
 920{
 921	return -EINVAL;
 922}
 923#endif
 924
 925int __must_check __vmap_pages_range_noflush(unsigned long addr,
 926			       unsigned long end, pgprot_t prot,
 927			       struct page **pages, unsigned int page_shift);
 928
 929void vunmap_range_noflush(unsigned long start, unsigned long end);
 930
 931void __vunmap_range_noflush(unsigned long start, unsigned long end);
 932
 933int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 934		      unsigned long addr, int page_nid, int *flags);
 935
 936void free_zone_device_page(struct page *page);
 937int migrate_device_coherent_page(struct page *page);
 938
 939/*
 940 * mm/gup.c
 941 */
 942struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
 943int __must_check try_grab_page(struct page *page, unsigned int flags);
 944
 945/*
 946 * mm/huge_memory.c
 947 */
 948struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 949				   unsigned long addr, pmd_t *pmd,
 950				   unsigned int flags);
 951
 952enum {
 953	/* mark page accessed */
 954	FOLL_TOUCH = 1 << 16,
 955	/* a retry, previous pass started an IO */
 956	FOLL_TRIED = 1 << 17,
 957	/* we are working on non-current tsk/mm */
 958	FOLL_REMOTE = 1 << 18,
 959	/* pages must be released via unpin_user_page */
 960	FOLL_PIN = 1 << 19,
 961	/* gup_fast: prevent fall-back to slow gup */
 962	FOLL_FAST_ONLY = 1 << 20,
 963	/* allow unlocking the mmap lock */
 964	FOLL_UNLOCKABLE = 1 << 21,
 965};
 966
 967/*
 968 * Indicates for which pages that are write-protected in the page table,
 969 * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
 970 * GUP pin will remain consistent with the pages mapped into the page tables
 971 * of the MM.
 972 *
 973 * Temporary unmapping of PageAnonExclusive() pages or clearing of
 974 * PageAnonExclusive() has to protect against concurrent GUP:
 975 * * Ordinary GUP: Using the PT lock
 976 * * GUP-fast and fork(): mm->write_protect_seq
 977 * * GUP-fast and KSM or temporary unmapping (swap, migration): see
 978 *    page_try_share_anon_rmap()
 979 *
 980 * Must be called with the (sub)page that's actually referenced via the
 981 * page table entry, which might not necessarily be the head page for a
 982 * PTE-mapped THP.
 983 *
 984 * If the vma is NULL, we're coming from the GUP-fast path and might have
 985 * to fallback to the slow path just to lookup the vma.
 986 */
 987static inline bool gup_must_unshare(struct vm_area_struct *vma,
 988				    unsigned int flags, struct page *page)
 989{
 990	/*
 991	 * FOLL_WRITE is implicitly handled correctly as the page table entry
 992	 * has to be writable -- and if it references (part of) an anonymous
 993	 * folio, that part is required to be marked exclusive.
 994	 */
 995	if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
 996		return false;
 997	/*
 998	 * Note: PageAnon(page) is stable until the page is actually getting
 999	 * freed.
1000	 */
1001	if (!PageAnon(page)) {
1002		/*
1003		 * We only care about R/O long-term pining: R/O short-term
1004		 * pinning does not have the semantics to observe successive
1005		 * changes through the process page tables.
1006		 */
1007		if (!(flags & FOLL_LONGTERM))
1008			return false;
1009
1010		/* We really need the vma ... */
1011		if (!vma)
1012			return true;
1013
1014		/*
1015		 * ... because we only care about writable private ("COW")
1016		 * mappings where we have to break COW early.
1017		 */
1018		return is_cow_mapping(vma->vm_flags);
1019	}
1020
1021	/* Paired with a memory barrier in page_try_share_anon_rmap(). */
1022	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
1023		smp_rmb();
1024
1025	/*
1026	 * During GUP-fast we might not get called on the head page for a
1027	 * hugetlb page that is mapped using cont-PTE, because GUP-fast does
1028	 * not work with the abstracted hugetlb PTEs that always point at the
1029	 * head page. For hugetlb, PageAnonExclusive only applies on the head
1030	 * page (as it cannot be partially COW-shared), so lookup the head page.
1031	 */
1032	if (unlikely(!PageHead(page) && PageHuge(page)))
1033		page = compound_head(page);
1034
1035	/*
1036	 * Note that PageKsm() pages cannot be exclusive, and consequently,
1037	 * cannot get pinned.
1038	 */
1039	return !PageAnonExclusive(page);
1040}
1041
/* Declarations only; both are defined outside this header. */
extern bool mirrored_kernelcore;
bool memblock_has_mirror(void);
1044
1045static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
1046{
1047	/*
1048	 * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
1049	 * enablements, because when without soft-dirty being compiled in,
1050	 * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
1051	 * will be constantly true.
1052	 */
1053	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
1054		return false;
1055
1056	/*
1057	 * Soft-dirty is kind of special: its tracking is enabled when the
1058	 * vma flags not set.
1059	 */
1060	return !(vma->vm_flags & VM_SOFTDIRTY);
1061}
1062
1063static inline void vma_iter_config(struct vma_iterator *vmi,
1064		unsigned long index, unsigned long last)
1065{
1066	MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START &&
1067		   (vmi->mas.index > index || vmi->mas.last < index));
1068	__mas_set_range(&vmi->mas, index, last - 1);
1069}
1070
1071/*
1072 * VMA Iterator functions shared between nommu and mmap
1073 */
1074static inline int vma_iter_prealloc(struct vma_iterator *vmi,
1075		struct vm_area_struct *vma)
1076{
1077	return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
1078}
1079
1080static inline void vma_iter_clear(struct vma_iterator *vmi)
1081{
1082	mas_store_prealloc(&vmi->mas, NULL);
1083}
1084
1085static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
1086			unsigned long start, unsigned long end, gfp_t gfp)
1087{
1088	__mas_set_range(&vmi->mas, start, end - 1);
1089	mas_store_gfp(&vmi->mas, NULL, gfp);
1090	if (unlikely(mas_is_err(&vmi->mas)))
1091		return -ENOMEM;
1092
1093	return 0;
1094}
1095
1096static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
1097{
1098	return mas_walk(&vmi->mas);
1099}
1100
1101/* Store a VMA with preallocated memory */
1102static inline void vma_iter_store(struct vma_iterator *vmi,
1103				  struct vm_area_struct *vma)
1104{
1105
1106#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
1107	if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
1108			vmi->mas.index > vma->vm_start)) {
1109		pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
1110			vmi->mas.index, vma->vm_start, vma->vm_start,
1111			vma->vm_end, vmi->mas.index, vmi->mas.last);
1112	}
1113	if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
1114			vmi->mas.last <  vma->vm_start)) {
1115		pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
1116		       vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
1117		       vmi->mas.index, vmi->mas.last);
1118	}
1119#endif
1120
1121	if (vmi->mas.node != MAS_START &&
1122	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
1123		vma_iter_invalidate(vmi);
1124
1125	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
1126	mas_store_prealloc(&vmi->mas, vma);
1127}
1128
1129static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
1130			struct vm_area_struct *vma, gfp_t gfp)
1131{
1132	if (vmi->mas.node != MAS_START &&
1133	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
1134		vma_iter_invalidate(vmi);
1135
1136	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
1137	mas_store_gfp(&vmi->mas, vma, gfp);
1138	if (unlikely(mas_is_err(&vmi->mas)))
1139		return -ENOMEM;
1140
1141	return 0;
1142}
1143
/*
 * VMA lock generalization
 *
 * NOTE(review): the per-field notes below are inferred from the field
 * names and types only -- confirm against the code that fills this in.
 */
struct vma_prepare {
	struct vm_area_struct *vma;		/* presumably the VMA being modified */
	struct vm_area_struct *adj_next;	/* presumably an adjusted neighbour */
	struct file *file;
	struct address_space *mapping;
	struct anon_vma *anon_vma;
	struct vm_area_struct *insert;		/* presumably a VMA to insert */
	struct vm_area_struct *remove;		/* presumably a VMA to remove */
	struct vm_area_struct *remove2;		/* presumably a second VMA to remove */
};
1157#endif	/* __MM_INTERNAL_H */