include/linux/mmzone.h, Linux v5.16
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service. That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists */
	MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
	/*
	 * The MIGRATE_CMA migration type is designed to mimic the way
	 * ZONE_MOVABLE works.  Only movable pages can be allocated
	 * from MIGRATE_CMA pageblocks, and the page allocator never
	 * implicitly changes the migration type of a MIGRATE_CMA pageblock.
	 *
	 * The way to use it is to change the migratetype of a range of
	 * pageblocks to MIGRATE_CMA, which can be done by the
	 * __free_pageblock_cma() function.  What is important, though,
	 * is that a range of pageblocks must be aligned to
	 * MAX_ORDER_NR_PAGES should the biggest page be bigger than
	 * a single pageblock.
61 */ 62 MIGRATE_CMA, 63#endif 64#ifdef CONFIG_MEMORY_ISOLATION 65 MIGRATE_ISOLATE, /* can't allocate from here */ 66#endif 67 MIGRATE_TYPES 68}; 69 70/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ 71extern const char * const migratetype_names[MIGRATE_TYPES]; 72 73#ifdef CONFIG_CMA 74# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) 75# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) 76#else 77# define is_migrate_cma(migratetype) false 78# define is_migrate_cma_page(_page) false 79#endif 80 81static inline bool is_migrate_movable(int mt) 82{ 83 return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; 84} 85 86#define for_each_migratetype_order(order, type) \ 87 for (order = 0; order < MAX_ORDER; order++) \ 88 for (type = 0; type < MIGRATE_TYPES; type++) 89 90extern int page_group_by_mobility_disabled; 91 92#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) 93 94#define get_pageblock_migratetype(page) \ 95 get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) 96 97struct free_area { 98 struct list_head free_list[MIGRATE_TYPES]; 99 unsigned long nr_free; 100}; 101 102static inline struct page *get_page_from_free_area(struct free_area *area, 103 int migratetype) 104{ 105 return list_first_entry_or_null(&area->free_list[migratetype], 106 struct page, lru); 107} 108 109static inline bool free_area_empty(struct free_area *area, int migratetype) 110{ 111 return list_empty(&area->free_list[migratetype]); 112} 113 114struct pglist_data; 115 116/* 117 * Add a wild amount of padding here to ensure data fall into separate 118 * cachelines. There are very few zone structures in the machine, so space 119 * consumption is not a concern here. 120 */ 121#if defined(CONFIG_SMP) 122struct zone_padding { 123 char x[0]; 124} ____cacheline_internodealigned_in_smp; 125#define ZONE_PADDING(name) struct zone_padding name; 126#else 127#define ZONE_PADDING(name) 128#endif 129 130#ifdef CONFIG_NUMA 131enum numa_stat_item { 132 NUMA_HIT, /* allocated in intended node */ 133 NUMA_MISS, /* allocated in non intended node */ 134 NUMA_FOREIGN, /* was intended here, hit elsewhere */ 135 NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ 136 NUMA_LOCAL, /* allocation from local node */ 137 NUMA_OTHER, /* allocation from other node */ 138 NR_VM_NUMA_EVENT_ITEMS 139}; 140#else 141#define NR_VM_NUMA_EVENT_ITEMS 0 142#endif 143 144enum zone_stat_item { 145 /* First 128 byte cacheline (assuming 64 bit words) */ 146 NR_FREE_PAGES, 147 NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ 148 NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, 149 NR_ZONE_ACTIVE_ANON, 150 NR_ZONE_INACTIVE_FILE, 151 NR_ZONE_ACTIVE_FILE, 152 NR_ZONE_UNEVICTABLE, 153 NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ 154 NR_MLOCK, /* mlock()ed pages found and moved off LRU */ 155 /* Second 128 byte cacheline */ 156 NR_BOUNCE, 157#if IS_ENABLED(CONFIG_ZSMALLOC) 158 NR_ZSPAGES, /* allocated in zsmalloc */ 159#endif 160 NR_FREE_CMA_PAGES, 161 NR_VM_ZONE_STAT_ITEMS }; 162 163enum node_stat_item { 164 NR_LRU_BASE, 165 NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ 166 NR_ACTIVE_ANON, /* " " " " " */ 167 NR_INACTIVE_FILE, /* " " " " " */ 168 NR_ACTIVE_FILE, /* " " " " " */ 169 NR_UNEVICTABLE, /* " " " " " */ 170 NR_SLAB_RECLAIMABLE_B, 171 NR_SLAB_UNRECLAIMABLE_B, 172 NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ 173 NR_ISOLATED_FILE, /* Temporary isolated pages from file 
lru */ 174 WORKINGSET_NODES, 175 WORKINGSET_REFAULT_BASE, 176 WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, 177 WORKINGSET_REFAULT_FILE, 178 WORKINGSET_ACTIVATE_BASE, 179 WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, 180 WORKINGSET_ACTIVATE_FILE, 181 WORKINGSET_RESTORE_BASE, 182 WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, 183 WORKINGSET_RESTORE_FILE, 184 WORKINGSET_NODERECLAIM, 185 NR_ANON_MAPPED, /* Mapped anonymous pages */ 186 NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. 187 only modified from process context */ 188 NR_FILE_PAGES, 189 NR_FILE_DIRTY, 190 NR_WRITEBACK, 191 NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ 192 NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ 193 NR_SHMEM_THPS, 194 NR_SHMEM_PMDMAPPED, 195 NR_FILE_THPS, 196 NR_FILE_PMDMAPPED, 197 NR_ANON_THPS, 198 NR_VMSCAN_WRITE, 199 NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ 200 NR_DIRTIED, /* page dirtyings since bootup */ 201 NR_WRITTEN, /* page writings since bootup */ 202 NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ 203 NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ 204 NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ 205 NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ 206 NR_KERNEL_STACK_KB, /* measured in KiB */ 207#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) 208 NR_KERNEL_SCS_KB, /* measured in KiB */ 209#endif 210 NR_PAGETABLE, /* used for pagetables */ 211#ifdef CONFIG_SWAP 212 NR_SWAPCACHE, 213#endif 214 NR_VM_NODE_STAT_ITEMS 215}; 216 217/* 218 * Returns true if the item should be printed in THPs (/proc/vmstat 219 * currently prints number of anon, file and shmem THPs. But the item 220 * is charged in pages). 221 */ 222static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) 223{ 224 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 225 return false; 226 227 return item == NR_ANON_THPS || 228 item == NR_FILE_THPS || 229 item == NR_SHMEM_THPS || 230 item == NR_SHMEM_PMDMAPPED || 231 item == NR_FILE_PMDMAPPED; 232} 233 234/* 235 * Returns true if the value is measured in bytes (most vmstat values are 236 * measured in pages). This defines the API part, the internal representation 237 * might be different. 238 */ 239static __always_inline bool vmstat_item_in_bytes(int idx) 240{ 241 /* 242 * Global and per-node slab counters track slab pages. 243 * It's expected that changes are multiples of PAGE_SIZE. 244 * Internally values are stored in pages. 245 * 246 * Per-memcg and per-lruvec counters track memory, consumed 247 * by individual slab objects. These counters are actually 248 * byte-precise. 249 */ 250 return (idx == NR_SLAB_RECLAIMABLE_B || 251 idx == NR_SLAB_UNRECLAIMABLE_B); 252} 253 254/* 255 * We do arithmetic on the LRU lists in various places in the code, 256 * so it is important to keep the active lists LRU_ACTIVE higher in 257 * the array than the corresponding inactive lists, and to keep 258 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. 
259 * 260 * This has to be kept in sync with the statistics in zone_stat_item 261 * above and the descriptions in vmstat_text in mm/vmstat.c 262 */ 263#define LRU_BASE 0 264#define LRU_ACTIVE 1 265#define LRU_FILE 2 266 267enum lru_list { 268 LRU_INACTIVE_ANON = LRU_BASE, 269 LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, 270 LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, 271 LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, 272 LRU_UNEVICTABLE, 273 NR_LRU_LISTS 274}; 275 276enum vmscan_throttle_state { 277 VMSCAN_THROTTLE_WRITEBACK, 278 VMSCAN_THROTTLE_ISOLATED, 279 VMSCAN_THROTTLE_NOPROGRESS, 280 VMSCAN_THROTTLE_CONGESTED, 281 NR_VMSCAN_THROTTLE, 282}; 283 284#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) 285 286#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) 287 288static inline bool is_file_lru(enum lru_list lru) 289{ 290 return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); 291} 292 293static inline bool is_active_lru(enum lru_list lru) 294{ 295 return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); 296} 297 298#define ANON_AND_FILE 2 299 300enum lruvec_flags { 301 LRUVEC_CONGESTED, /* lruvec has many dirty pages 302 * backed by a congested BDI 303 */ 304}; 305 306struct lruvec { 307 struct list_head lists[NR_LRU_LISTS]; 308 /* per lruvec lru_lock for memcg */ 309 spinlock_t lru_lock; 310 /* 311 * These track the cost of reclaiming one LRU - file or anon - 312 * over the other. As the observed cost of reclaiming one LRU 313 * increases, the reclaim scan balance tips toward the other. 314 */ 315 unsigned long anon_cost; 316 unsigned long file_cost; 317 /* Non-resident age, driven by LRU movement */ 318 atomic_long_t nonresident_age; 319 /* Refaults at the time of last reclaim cycle */ 320 unsigned long refaults[ANON_AND_FILE]; 321 /* Various lruvec state flags (enum lruvec_flags) */ 322 unsigned long flags; 323#ifdef CONFIG_MEMCG 324 struct pglist_data *pgdat; 325#endif 326}; 327 328/* Isolate unmapped pages */ 329#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) 330/* Isolate for asynchronous migration */ 331#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) 332/* Isolate unevictable pages */ 333#define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) 334 335/* LRU Isolation modes. */ 336typedef unsigned __bitwise isolate_mode_t; 337 338enum zone_watermarks { 339 WMARK_MIN, 340 WMARK_LOW, 341 WMARK_HIGH, 342 NR_WMARK 343}; 344 345/* 346 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional 347 * for pageblock size for THP if configured. 348 */ 349#ifdef CONFIG_TRANSPARENT_HUGEPAGE 350#define NR_PCP_THP 1 351#else 352#define NR_PCP_THP 0 353#endif 354#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) 355 356/* 357 * Shift to encode migratetype and order in the same integer, with order 358 * in the least significant bits. 
359 */ 360#define NR_PCP_ORDER_WIDTH 8 361#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1) 362 363#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) 364#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) 365#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) 366#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) 367 368/* Fields and list protected by pagesets local_lock in page_alloc.c */ 369struct per_cpu_pages { 370 int count; /* number of pages in the list */ 371 int high; /* high watermark, emptying needed */ 372 int batch; /* chunk size for buddy add/remove */ 373 short free_factor; /* batch scaling factor during free */ 374#ifdef CONFIG_NUMA 375 short expire; /* When 0, remote pagesets are drained */ 376#endif 377 378 /* Lists of pages, one per migrate type stored on the pcp-lists */ 379 struct list_head lists[NR_PCP_LISTS]; 380}; 381 382struct per_cpu_zonestat { 383#ifdef CONFIG_SMP 384 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; 385 s8 stat_threshold; 386#endif 387#ifdef CONFIG_NUMA 388 /* 389 * Low priority inaccurate counters that are only folded 390 * on demand. Use a large type to avoid the overhead of 391 * folding during refresh_cpu_vm_stats. 392 */ 393 unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; 394#endif 395}; 396 397struct per_cpu_nodestat { 398 s8 stat_threshold; 399 s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; 400}; 401 402#endif /* !__GENERATING_BOUNDS.H */ 403 404enum zone_type { 405 /* 406 * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able 407 * to DMA to all of the addressable memory (ZONE_NORMAL). 408 * On architectures where this area covers the whole 32 bit address 409 * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller 410 * DMA addressing constraints. This distinction is important as a 32bit 411 * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit 412 * platforms may need both zones as they support peripherals with 413 * different DMA addressing limitations. 414 */ 415#ifdef CONFIG_ZONE_DMA 416 ZONE_DMA, 417#endif 418#ifdef CONFIG_ZONE_DMA32 419 ZONE_DMA32, 420#endif 421 /* 422 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be 423 * performed on pages in ZONE_NORMAL if the DMA devices support 424 * transfers to all addressable memory. 425 */ 426 ZONE_NORMAL, 427#ifdef CONFIG_HIGHMEM 428 /* 429 * A memory area that is only addressable by the kernel through 430 * mapping portions into its own address space. This is for example 431 * used by i386 to allow the kernel to address the memory beyond 432 * 900MB. The kernel will set up special mappings (page 433 * table entries on i386) for each page that the kernel needs to 434 * access. 435 */ 436 ZONE_HIGHMEM, 437#endif 438 /* 439 * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains 440 * movable pages with few exceptional cases described below. Main use 441 * cases for ZONE_MOVABLE are to make memory offlining/unplug more 442 * likely to succeed, and to locally limit unmovable allocations - e.g., 443 * to increase the number of THP/huge pages. Notable special cases are: 444 * 445 * 1. Pinned pages: (long-term) pinning of movable pages might 446 * essentially turn such pages unmovable. Therefore, we do not allow 447 * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and 448 * faulted, they come from the right zone right away. 
However, it is 449 * still possible that address space already has pages in 450 * ZONE_MOVABLE at the time when pages are pinned (i.e. user has 451 * touches that memory before pinning). In such case we migrate them 452 * to a different zone. When migration fails - pinning fails. 453 * 2. memblock allocations: kernelcore/movablecore setups might create 454 * situations where ZONE_MOVABLE contains unmovable allocations 455 * after boot. Memory offlining and allocations fail early. 456 * 3. Memory holes: kernelcore/movablecore setups might create very rare 457 * situations where ZONE_MOVABLE contains memory holes after boot, 458 * for example, if we have sections that are only partially 459 * populated. Memory offlining and allocations fail early. 460 * 4. PG_hwpoison pages: while poisoned pages can be skipped during 461 * memory offlining, such pages cannot be allocated. 462 * 5. Unmovable PG_offline pages: in paravirtualized environments, 463 * hotplugged memory blocks might only partially be managed by the 464 * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The 465 * parts not manged by the buddy are unmovable PG_offline pages. In 466 * some cases (virtio-mem), such pages can be skipped during 467 * memory offlining, however, cannot be moved/allocated. These 468 * techniques might use alloc_contig_range() to hide previously 469 * exposed pages from the buddy again (e.g., to implement some sort 470 * of memory unplug in virtio-mem). 471 * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create 472 * situations where ZERO_PAGE(0) which is allocated differently 473 * on different platforms may end up in a movable zone. ZERO_PAGE(0) 474 * cannot be migrated. 475 * 7. Memory-hotplug: when using memmap_on_memory and onlining the 476 * memory to the MOVABLE zone, the vmemmap pages are also placed in 477 * such zone. Such pages cannot be really moved around as they are 478 * self-stored in the range, but they are treated as movable when 479 * the range they describe is about to be offlined. 480 * 481 * In general, no unmovable allocations that degrade memory offlining 482 * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) 483 * have to expect that migrating pages in ZONE_MOVABLE can fail (even 484 * if has_unmovable_pages() states that there are no unmovable pages, 485 * there can be false negatives). 486 */ 487 ZONE_MOVABLE, 488#ifdef CONFIG_ZONE_DEVICE 489 ZONE_DEVICE, 490#endif 491 __MAX_NR_ZONES 492 493}; 494 495#ifndef __GENERATING_BOUNDS_H 496 497#define ASYNC_AND_SYNC 2 498 499struct zone { 500 /* Read-mostly fields */ 501 502 /* zone watermarks, access with *_wmark_pages(zone) macros */ 503 unsigned long _watermark[NR_WMARK]; 504 unsigned long watermark_boost; 505 506 unsigned long nr_reserved_highatomic; 507 508 /* 509 * We don't know if the memory that we're going to allocate will be 510 * freeable or/and it will be released eventually, so to avoid totally 511 * wasting several GB of ram we must reserve some of the lower zone 512 * memory (otherwise we risk to run OOM on the lower zones despite 513 * there being tons of freeable ram on the higher zones). This array is 514 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl 515 * changes. 
516 */ 517 long lowmem_reserve[MAX_NR_ZONES]; 518 519#ifdef CONFIG_NUMA 520 int node; 521#endif 522 struct pglist_data *zone_pgdat; 523 struct per_cpu_pages __percpu *per_cpu_pageset; 524 struct per_cpu_zonestat __percpu *per_cpu_zonestats; 525 /* 526 * the high and batch values are copied to individual pagesets for 527 * faster access 528 */ 529 int pageset_high; 530 int pageset_batch; 531 532#ifndef CONFIG_SPARSEMEM 533 /* 534 * Flags for a pageblock_nr_pages block. See pageblock-flags.h. 535 * In SPARSEMEM, this map is stored in struct mem_section 536 */ 537 unsigned long *pageblock_flags; 538#endif /* CONFIG_SPARSEMEM */ 539 540 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 541 unsigned long zone_start_pfn; 542 543 /* 544 * spanned_pages is the total pages spanned by the zone, including 545 * holes, which is calculated as: 546 * spanned_pages = zone_end_pfn - zone_start_pfn; 547 * 548 * present_pages is physical pages existing within the zone, which 549 * is calculated as: 550 * present_pages = spanned_pages - absent_pages(pages in holes); 551 * 552 * present_early_pages is present pages existing within the zone 553 * located on memory available since early boot, excluding hotplugged 554 * memory. 555 * 556 * managed_pages is present pages managed by the buddy system, which 557 * is calculated as (reserved_pages includes pages allocated by the 558 * bootmem allocator): 559 * managed_pages = present_pages - reserved_pages; 560 * 561 * cma pages is present pages that are assigned for CMA use 562 * (MIGRATE_CMA). 563 * 564 * So present_pages may be used by memory hotplug or memory power 565 * management logic to figure out unmanaged pages by checking 566 * (present_pages - managed_pages). And managed_pages should be used 567 * by page allocator and vm scanner to calculate all kinds of watermarks 568 * and thresholds. 569 * 570 * Locking rules: 571 * 572 * zone_start_pfn and spanned_pages are protected by span_seqlock. 573 * It is a seqlock because it has to be read outside of zone->lock, 574 * and it is done in the main allocator path. But, it is written 575 * quite infrequently. 576 * 577 * The span_seq lock is declared along with zone->lock because it is 578 * frequently read in proximity to zone->lock. It's good to 579 * give them a chance of being in the same cacheline. 580 * 581 * Write access to present_pages at runtime should be protected by 582 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of 583 * present_pages should get_online_mems() to get a stable value. 584 */ 585 atomic_long_t managed_pages; 586 unsigned long spanned_pages; 587 unsigned long present_pages; 588#if defined(CONFIG_MEMORY_HOTPLUG) 589 unsigned long present_early_pages; 590#endif 591#ifdef CONFIG_CMA 592 unsigned long cma_pages; 593#endif 594 595 const char *name; 596 597#ifdef CONFIG_MEMORY_ISOLATION 598 /* 599 * Number of isolated pageblock. It is used to solve incorrect 600 * freepage counting problem due to racy retrieving migratetype 601 * of pageblock. Protected by zone->lock. 
602 */ 603 unsigned long nr_isolate_pageblock; 604#endif 605 606#ifdef CONFIG_MEMORY_HOTPLUG 607 /* see spanned/present_pages for more description */ 608 seqlock_t span_seqlock; 609#endif 610 611 int initialized; 612 613 /* Write-intensive fields used from the page allocator */ 614 ZONE_PADDING(_pad1_) 615 616 /* free areas of different sizes */ 617 struct free_area free_area[MAX_ORDER]; 618 619 /* zone flags, see below */ 620 unsigned long flags; 621 622 /* Primarily protects free_area */ 623 spinlock_t lock; 624 625 /* Write-intensive fields used by compaction and vmstats. */ 626 ZONE_PADDING(_pad2_) 627 628 /* 629 * When free pages are below this point, additional steps are taken 630 * when reading the number of free pages to avoid per-cpu counter 631 * drift allowing watermarks to be breached 632 */ 633 unsigned long percpu_drift_mark; 634 635#if defined CONFIG_COMPACTION || defined CONFIG_CMA 636 /* pfn where compaction free scanner should start */ 637 unsigned long compact_cached_free_pfn; 638 /* pfn where compaction migration scanner should start */ 639 unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; 640 unsigned long compact_init_migrate_pfn; 641 unsigned long compact_init_free_pfn; 642#endif 643 644#ifdef CONFIG_COMPACTION 645 /* 646 * On compaction failure, 1<<compact_defer_shift compactions 647 * are skipped before trying again. The number attempted since 648 * last failure is tracked with compact_considered. 649 * compact_order_failed is the minimum compaction failed order. 650 */ 651 unsigned int compact_considered; 652 unsigned int compact_defer_shift; 653 int compact_order_failed; 654#endif 655 656#if defined CONFIG_COMPACTION || defined CONFIG_CMA 657 /* Set to true when the PG_migrate_skip bits should be cleared */ 658 bool compact_blockskip_flush; 659#endif 660 661 bool contiguous; 662 663 ZONE_PADDING(_pad3_) 664 /* Zone statistics */ 665 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 666 atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; 667} ____cacheline_internodealigned_in_smp; 668 669enum pgdat_flags { 670 PGDAT_DIRTY, /* reclaim scanning has recently found 671 * many dirty file pages at the tail 672 * of the LRU. 673 */ 674 PGDAT_WRITEBACK, /* reclaim scanning has recently found 675 * many pages under writeback 676 */ 677 PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ 678}; 679 680enum zone_flags { 681 ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. 682 * Cleared when kswapd is woken. 683 */ 684 ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. 
*/ 685}; 686 687static inline unsigned long zone_managed_pages(struct zone *zone) 688{ 689 return (unsigned long)atomic_long_read(&zone->managed_pages); 690} 691 692static inline unsigned long zone_cma_pages(struct zone *zone) 693{ 694#ifdef CONFIG_CMA 695 return zone->cma_pages; 696#else 697 return 0; 698#endif 699} 700 701static inline unsigned long zone_end_pfn(const struct zone *zone) 702{ 703 return zone->zone_start_pfn + zone->spanned_pages; 704} 705 706static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) 707{ 708 return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); 709} 710 711static inline bool zone_is_initialized(struct zone *zone) 712{ 713 return zone->initialized; 714} 715 716static inline bool zone_is_empty(struct zone *zone) 717{ 718 return zone->spanned_pages == 0; 719} 720 721/* 722 * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty 723 * intersection with the given zone 724 */ 725static inline bool zone_intersects(struct zone *zone, 726 unsigned long start_pfn, unsigned long nr_pages) 727{ 728 if (zone_is_empty(zone)) 729 return false; 730 if (start_pfn >= zone_end_pfn(zone) || 731 start_pfn + nr_pages <= zone->zone_start_pfn) 732 return false; 733 734 return true; 735} 736 737/* 738 * The "priority" of VM scanning is how much of the queues we will scan in one 739 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 740 * queues ("queue_length >> 12") during an aging round. 741 */ 742#define DEF_PRIORITY 12 743 744/* Maximum number of zones on a zonelist */ 745#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) 746 747enum { 748 ZONELIST_FALLBACK, /* zonelist with fallback */ 749#ifdef CONFIG_NUMA 750 /* 751 * The NUMA zonelists are doubled because we need zonelists that 752 * restrict the allocations to a single node for __GFP_THISNODE. 753 */ 754 ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ 755#endif 756 MAX_ZONELISTS 757}; 758 759/* 760 * This struct contains information about a zone in a zonelist. It is stored 761 * here to avoid dereferences into large structures and lookups of tables 762 */ 763struct zoneref { 764 struct zone *zone; /* Pointer to actual zone */ 765 int zone_idx; /* zone_idx(zoneref->zone) */ 766}; 767 768/* 769 * One allocation request operates on a zonelist. A zonelist 770 * is a list of zones, the first one is the 'goal' of the 771 * allocation, the other zones are fallback zones, in decreasing 772 * priority. 773 * 774 * To speed the reading of the zonelist, the zonerefs contain the zone index 775 * of the entry being read. Helper functions to access information given 776 * a struct zoneref are 777 * 778 * zonelist_zone() - Return the struct zone * for an entry in _zonerefs 779 * zonelist_zone_idx() - Return the index of the zone for an entry 780 * zonelist_node_idx() - Return the index of the node for an entry 781 */ 782struct zonelist { 783 struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; 784}; 785 786/* 787 * The array of struct pages for flatmem. 788 * It must be declared for SPARSEMEM as well because there are configurations 789 * that rely on that. 790 */ 791extern struct page *mem_map; 792 793#ifdef CONFIG_TRANSPARENT_HUGEPAGE 794struct deferred_split { 795 spinlock_t split_queue_lock; 796 struct list_head split_queue; 797 unsigned long split_queue_len; 798}; 799#endif 800 801/* 802 * On NUMA machines, each NUMA node would have a pg_data_t to describe 803 * it's memory layout. 
On UMA machines there is a single pglist_data which 804 * describes the whole memory. 805 * 806 * Memory statistics and page replacement data structures are maintained on a 807 * per-zone basis. 808 */ 809typedef struct pglist_data { 810 /* 811 * node_zones contains just the zones for THIS node. Not all of the 812 * zones may be populated, but it is the full list. It is referenced by 813 * this node's node_zonelists as well as other node's node_zonelists. 814 */ 815 struct zone node_zones[MAX_NR_ZONES]; 816 817 /* 818 * node_zonelists contains references to all zones in all nodes. 819 * Generally the first zones will be references to this node's 820 * node_zones. 821 */ 822 struct zonelist node_zonelists[MAX_ZONELISTS]; 823 824 int nr_zones; /* number of populated zones in this node */ 825#ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ 826 struct page *node_mem_map; 827#ifdef CONFIG_PAGE_EXTENSION 828 struct page_ext *node_page_ext; 829#endif 830#endif 831#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) 832 /* 833 * Must be held any time you expect node_start_pfn, 834 * node_present_pages, node_spanned_pages or nr_zones to stay constant. 835 * Also synchronizes pgdat->first_deferred_pfn during deferred page 836 * init. 837 * 838 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to 839 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG 840 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. 841 * 842 * Nests above zone->lock and zone->span_seqlock 843 */ 844 spinlock_t node_size_lock; 845#endif 846 unsigned long node_start_pfn; 847 unsigned long node_present_pages; /* total number of physical pages */ 848 unsigned long node_spanned_pages; /* total size of physical page 849 range, including holes */ 850 int node_id; 851 wait_queue_head_t kswapd_wait; 852 wait_queue_head_t pfmemalloc_wait; 853 854 /* workqueues for throttling reclaim for different reasons. */ 855 wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; 856 857 atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ 858 unsigned long nr_reclaim_start; /* nr pages written while throttled 859 * when throttling started. */ 860 struct task_struct *kswapd; /* Protected by 861 mem_hotplug_begin/end() */ 862 int kswapd_order; 863 enum zone_type kswapd_highest_zoneidx; 864 865 int kswapd_failures; /* Number of 'reclaimed == 0' runs */ 866 867#ifdef CONFIG_COMPACTION 868 int kcompactd_max_order; 869 enum zone_type kcompactd_highest_zoneidx; 870 wait_queue_head_t kcompactd_wait; 871 struct task_struct *kcompactd; 872 bool proactive_compact_trigger; 873#endif 874 /* 875 * This is a per-node reserve of pages that are not available 876 * to userspace allocations. 877 */ 878 unsigned long totalreserve_pages; 879 880#ifdef CONFIG_NUMA 881 /* 882 * node reclaim becomes active if more unmapped pages exist. 883 */ 884 unsigned long min_unmapped_pages; 885 unsigned long min_slab_pages; 886#endif /* CONFIG_NUMA */ 887 888 /* Write-intensive fields used by page reclaim */ 889 ZONE_PADDING(_pad1_) 890 891#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 892 /* 893 * If memory initialisation on large machines is deferred then this 894 * is the first PFN that needs to be initialised. 895 */ 896 unsigned long first_deferred_pfn; 897#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 898 899#ifdef CONFIG_TRANSPARENT_HUGEPAGE 900 struct deferred_split deferred_split_queue; 901#endif 902 903 /* Fields commonly accessed by the page reclaim scanner */ 904 905 /* 906 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. 
907 * 908 * Use mem_cgroup_lruvec() to look up lruvecs. 909 */ 910 struct lruvec __lruvec; 911 912 unsigned long flags; 913 914 ZONE_PADDING(_pad2_) 915 916 /* Per-node vmstats */ 917 struct per_cpu_nodestat __percpu *per_cpu_nodestats; 918 atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; 919} pg_data_t; 920 921#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 922#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) 923#ifdef CONFIG_FLATMEM 924#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) 925#else 926#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) 927#endif 928#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) 929 930#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 931#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) 932 933static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) 934{ 935 return pgdat->node_start_pfn + pgdat->node_spanned_pages; 936} 937 938static inline bool pgdat_is_empty(pg_data_t *pgdat) 939{ 940 return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; 941} 942 943#include <linux/memory_hotplug.h> 944 945void build_all_zonelists(pg_data_t *pgdat); 946void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, 947 enum zone_type highest_zoneidx); 948bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 949 int highest_zoneidx, unsigned int alloc_flags, 950 long free_pages); 951bool zone_watermark_ok(struct zone *z, unsigned int order, 952 unsigned long mark, int highest_zoneidx, 953 unsigned int alloc_flags); 954bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 955 unsigned long mark, int highest_zoneidx); 956/* 957 * Memory initialization context, use to differentiate memory added by 958 * the platform statically or via memory hotplug interface. 959 */ 960enum meminit_context { 961 MEMINIT_EARLY, 962 MEMINIT_HOTPLUG, 963}; 964 965extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, 966 unsigned long size); 967 968extern void lruvec_init(struct lruvec *lruvec); 969 970static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) 971{ 972#ifdef CONFIG_MEMCG 973 return lruvec->pgdat; 974#else 975 return container_of(lruvec, struct pglist_data, __lruvec); 976#endif 977} 978 979#ifdef CONFIG_HAVE_MEMORYLESS_NODES 980int local_memory_node(int node_id); 981#else 982static inline int local_memory_node(int node_id) { return node_id; }; 983#endif 984 985/* 986 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 987 */ 988#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) 989 990#ifdef CONFIG_ZONE_DEVICE 991static inline bool zone_is_zone_device(struct zone *zone) 992{ 993 return zone_idx(zone) == ZONE_DEVICE; 994} 995#else 996static inline bool zone_is_zone_device(struct zone *zone) 997{ 998 return false; 999} 1000#endif 1001 1002/* 1003 * Returns true if a zone has pages managed by the buddy allocator. 1004 * All the reclaim decisions have to use this function rather than 1005 * populated_zone(). If the whole zone is reserved then we can easily 1006 * end up with populated_zone() && !managed_zone(). 
1007 */ 1008static inline bool managed_zone(struct zone *zone) 1009{ 1010 return zone_managed_pages(zone); 1011} 1012 1013/* Returns true if a zone has memory */ 1014static inline bool populated_zone(struct zone *zone) 1015{ 1016 return zone->present_pages; 1017} 1018 1019#ifdef CONFIG_NUMA 1020static inline int zone_to_nid(struct zone *zone) 1021{ 1022 return zone->node; 1023} 1024 1025static inline void zone_set_nid(struct zone *zone, int nid) 1026{ 1027 zone->node = nid; 1028} 1029#else 1030static inline int zone_to_nid(struct zone *zone) 1031{ 1032 return 0; 1033} 1034 1035static inline void zone_set_nid(struct zone *zone, int nid) {} 1036#endif 1037 1038extern int movable_zone; 1039 1040static inline int is_highmem_idx(enum zone_type idx) 1041{ 1042#ifdef CONFIG_HIGHMEM 1043 return (idx == ZONE_HIGHMEM || 1044 (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); 1045#else 1046 return 0; 1047#endif 1048} 1049 1050/** 1051 * is_highmem - helper function to quickly check if a struct zone is a 1052 * highmem zone or not. This is an attempt to keep references 1053 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 1054 * @zone: pointer to struct zone variable 1055 * Return: 1 for a highmem zone, 0 otherwise 1056 */ 1057static inline int is_highmem(struct zone *zone) 1058{ 1059#ifdef CONFIG_HIGHMEM 1060 return is_highmem_idx(zone_idx(zone)); 1061#else 1062 return 0; 1063#endif 1064} 1065 1066/* These two functions are used to setup the per zone pages min values */ 1067struct ctl_table; 1068 1069int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, 1070 loff_t *); 1071int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, 1072 size_t *, loff_t *); 1073extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; 1074int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, 1075 size_t *, loff_t *); 1076int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int, 1077 void *, size_t *, loff_t *); 1078int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, 1079 void *, size_t *, loff_t *); 1080int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, 1081 void *, size_t *, loff_t *); 1082int numa_zonelist_order_handler(struct ctl_table *, int, 1083 void *, size_t *, loff_t *); 1084extern int percpu_pagelist_high_fraction; 1085extern char numa_zonelist_order[]; 1086#define NUMA_ZONELIST_ORDER_LEN 16 1087 1088#ifndef CONFIG_NUMA 1089 1090extern struct pglist_data contig_page_data; 1091static inline struct pglist_data *NODE_DATA(int nid) 1092{ 1093 return &contig_page_data; 1094} 1095#define NODE_MEM_MAP(nid) mem_map 1096 1097#else /* CONFIG_NUMA */ 1098 1099#include <asm/mmzone.h> 1100 1101#endif /* !CONFIG_NUMA */ 1102 1103extern struct pglist_data *first_online_pgdat(void); 1104extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); 1105extern struct zone *next_zone(struct zone *zone); 1106 1107/** 1108 * for_each_online_pgdat - helper macro to iterate over all online nodes 1109 * @pgdat: pointer to a pg_data_t variable 1110 */ 1111#define for_each_online_pgdat(pgdat) \ 1112 for (pgdat = first_online_pgdat(); \ 1113 pgdat; \ 1114 pgdat = next_online_pgdat(pgdat)) 1115/** 1116 * for_each_zone - helper macro to iterate over all memory zones 1117 * @zone: pointer to struct zone variable 1118 * 1119 * The user only needs to declare the zone variable, for_each_zone 1120 * fills it in. 
1121 */ 1122#define for_each_zone(zone) \ 1123 for (zone = (first_online_pgdat())->node_zones; \ 1124 zone; \ 1125 zone = next_zone(zone)) 1126 1127#define for_each_populated_zone(zone) \ 1128 for (zone = (first_online_pgdat())->node_zones; \ 1129 zone; \ 1130 zone = next_zone(zone)) \ 1131 if (!populated_zone(zone)) \ 1132 ; /* do nothing */ \ 1133 else 1134 1135static inline struct zone *zonelist_zone(struct zoneref *zoneref) 1136{ 1137 return zoneref->zone; 1138} 1139 1140static inline int zonelist_zone_idx(struct zoneref *zoneref) 1141{ 1142 return zoneref->zone_idx; 1143} 1144 1145static inline int zonelist_node_idx(struct zoneref *zoneref) 1146{ 1147 return zone_to_nid(zoneref->zone); 1148} 1149 1150struct zoneref *__next_zones_zonelist(struct zoneref *z, 1151 enum zone_type highest_zoneidx, 1152 nodemask_t *nodes); 1153 1154/** 1155 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point 1156 * @z: The cursor used as a starting point for the search 1157 * @highest_zoneidx: The zone index of the highest zone to return 1158 * @nodes: An optional nodemask to filter the zonelist with 1159 * 1160 * This function returns the next zone at or below a given zone index that is 1161 * within the allowed nodemask using a cursor as the starting point for the 1162 * search. The zoneref returned is a cursor that represents the current zone 1163 * being examined. It should be advanced by one before calling 1164 * next_zones_zonelist again. 1165 * 1166 * Return: the next zone at or below highest_zoneidx within the allowed 1167 * nodemask using a cursor within a zonelist as a starting point 1168 */ 1169static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, 1170 enum zone_type highest_zoneidx, 1171 nodemask_t *nodes) 1172{ 1173 if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) 1174 return z; 1175 return __next_zones_zonelist(z, highest_zoneidx, nodes); 1176} 1177 1178/** 1179 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist 1180 * @zonelist: The zonelist to search for a suitable zone 1181 * @highest_zoneidx: The zone index of the highest zone to return 1182 * @nodes: An optional nodemask to filter the zonelist with 1183 * 1184 * This function returns the first zone at or below a given zone index that is 1185 * within the allowed nodemask. The zoneref returned is a cursor that can be 1186 * used to iterate the zonelist with next_zones_zonelist by advancing it by 1187 * one before calling. 1188 * 1189 * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is 1190 * never NULL). This may happen either genuinely, or due to concurrent nodemask 1191 * update due to cpuset modification. 
 *
 * Return: Zoneref pointer for the first suitable zone found
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	return next_zones_zonelist(zonelist->_zonerefs,
							highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask.
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
	for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask),	\
			zone = zonelist_zone(z))

#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
	for (zone = z->zone;	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask),	\
			zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->zones being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

/* Whether the 'nodes' are all movable nodes */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
	struct zonelist *zonelist;
	struct zoneref *z;
	int nid;

	if (nodes_empty(*nodes))
		return false;

	/*
	 * We can choose an arbitrary node from the nodemask to get a
	 * zonelist, as they are interlinked. We just need to find
	 * at least one zone that can satisfy kernel allocations.
	 */
	nid = first_node(*nodes);
	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
	z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes);
	return (!z->zone) ?
true : false; 1258} 1259 1260 1261#ifdef CONFIG_SPARSEMEM 1262#include <asm/sparsemem.h> 1263#endif 1264 1265#ifdef CONFIG_FLATMEM 1266#define pfn_to_nid(pfn) (0) 1267#endif 1268 1269#ifdef CONFIG_SPARSEMEM 1270 1271/* 1272 * PA_SECTION_SHIFT physical address to/from section number 1273 * PFN_SECTION_SHIFT pfn to/from section number 1274 */ 1275#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) 1276#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) 1277 1278#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) 1279 1280#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) 1281#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) 1282 1283#define SECTION_BLOCKFLAGS_BITS \ 1284 ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) 1285 1286#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS 1287#error Allocator MAX_ORDER exceeds SECTION_SIZE 1288#endif 1289 1290static inline unsigned long pfn_to_section_nr(unsigned long pfn) 1291{ 1292 return pfn >> PFN_SECTION_SHIFT; 1293} 1294static inline unsigned long section_nr_to_pfn(unsigned long sec) 1295{ 1296 return sec << PFN_SECTION_SHIFT; 1297} 1298 1299#define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) 1300#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) 1301 1302#define SUBSECTION_SHIFT 21 1303#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) 1304 1305#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) 1306#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) 1307#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) 1308 1309#if SUBSECTION_SHIFT > SECTION_SIZE_BITS 1310#error Subsection size exceeds section size 1311#else 1312#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) 1313#endif 1314 1315#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) 1316#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) 1317 1318struct mem_section_usage { 1319#ifdef CONFIG_SPARSEMEM_VMEMMAP 1320 DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); 1321#endif 1322 /* See declaration of similar field in struct zone */ 1323 unsigned long pageblock_flags[0]; 1324}; 1325 1326void subsection_map_init(unsigned long pfn, unsigned long nr_pages); 1327 1328struct page; 1329struct page_ext; 1330struct mem_section { 1331 /* 1332 * This is, logically, a pointer to an array of struct 1333 * pages. However, it is stored with some other magic. 1334 * (see sparse.c::sparse_init_one_section()) 1335 * 1336 * Additionally during early boot we encode node id of 1337 * the location of the section here to guide allocation. 1338 * (see sparse.c::memory_present()) 1339 * 1340 * Making it a UL at least makes someone do a cast 1341 * before using it wrong. 1342 */ 1343 unsigned long section_mem_map; 1344 1345 struct mem_section_usage *usage; 1346#ifdef CONFIG_PAGE_EXTENSION 1347 /* 1348 * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use 1349 * section. (see page_ext.h about this.) 1350 */ 1351 struct page_ext *page_ext; 1352 unsigned long pad; 1353#endif 1354 /* 1355 * WARNING: mem_section must be a power-of-2 in size for the 1356 * calculation and use of SECTION_ROOT_MASK to make sense. 
1357 */ 1358}; 1359 1360#ifdef CONFIG_SPARSEMEM_EXTREME 1361#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) 1362#else 1363#define SECTIONS_PER_ROOT 1 1364#endif 1365 1366#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) 1367#define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) 1368#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) 1369 1370#ifdef CONFIG_SPARSEMEM_EXTREME 1371extern struct mem_section **mem_section; 1372#else 1373extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; 1374#endif 1375 1376static inline unsigned long *section_to_usemap(struct mem_section *ms) 1377{ 1378 return ms->usage->pageblock_flags; 1379} 1380 1381static inline struct mem_section *__nr_to_section(unsigned long nr) 1382{ 1383#ifdef CONFIG_SPARSEMEM_EXTREME 1384 if (!mem_section) 1385 return NULL; 1386#endif 1387 if (!mem_section[SECTION_NR_TO_ROOT(nr)]) 1388 return NULL; 1389 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; 1390} 1391extern size_t mem_section_usage_size(void); 1392 1393/* 1394 * We use the lower bits of the mem_map pointer to store 1395 * a little bit of information. The pointer is calculated 1396 * as mem_map - section_nr_to_pfn(pnum). The result is 1397 * aligned to the minimum alignment of the two values: 1398 * 1. All mem_map arrays are page-aligned. 1399 * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT 1400 * lowest bits. PFN_SECTION_SHIFT is arch-specific 1401 * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the 1402 * worst combination is powerpc with 256k pages, 1403 * which results in PFN_SECTION_SHIFT equal 6. 1404 * To sum it up, at least 6 bits are available. 1405 */ 1406#define SECTION_MARKED_PRESENT (1UL<<0) 1407#define SECTION_HAS_MEM_MAP (1UL<<1) 1408#define SECTION_IS_ONLINE (1UL<<2) 1409#define SECTION_IS_EARLY (1UL<<3) 1410#define SECTION_TAINT_ZONE_DEVICE (1UL<<4) 1411#define SECTION_MAP_LAST_BIT (1UL<<5) 1412#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) 1413#define SECTION_NID_SHIFT 6 1414 1415static inline struct page *__section_mem_map_addr(struct mem_section *section) 1416{ 1417 unsigned long map = section->section_mem_map; 1418 map &= SECTION_MAP_MASK; 1419 return (struct page *)map; 1420} 1421 1422static inline int present_section(struct mem_section *section) 1423{ 1424 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); 1425} 1426 1427static inline int present_section_nr(unsigned long nr) 1428{ 1429 return present_section(__nr_to_section(nr)); 1430} 1431 1432static inline int valid_section(struct mem_section *section) 1433{ 1434 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); 1435} 1436 1437static inline int early_section(struct mem_section *section) 1438{ 1439 return (section && (section->section_mem_map & SECTION_IS_EARLY)); 1440} 1441 1442static inline int valid_section_nr(unsigned long nr) 1443{ 1444 return valid_section(__nr_to_section(nr)); 1445} 1446 1447static inline int online_section(struct mem_section *section) 1448{ 1449 return (section && (section->section_mem_map & SECTION_IS_ONLINE)); 1450} 1451 1452static inline int online_device_section(struct mem_section *section) 1453{ 1454 unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; 1455 1456 return section && ((section->section_mem_map & flags) == flags); 1457} 1458 1459static inline int online_section_nr(unsigned long nr) 1460{ 1461 return online_section(__nr_to_section(nr)); 1462} 1463 1464#ifdef CONFIG_MEMORY_HOTPLUG 1465void 
online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); 1466void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); 1467#endif 1468 1469static inline struct mem_section *__pfn_to_section(unsigned long pfn) 1470{ 1471 return __nr_to_section(pfn_to_section_nr(pfn)); 1472} 1473 1474extern unsigned long __highest_present_section_nr; 1475 1476static inline int subsection_map_index(unsigned long pfn) 1477{ 1478 return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; 1479} 1480 1481#ifdef CONFIG_SPARSEMEM_VMEMMAP 1482static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) 1483{ 1484 int idx = subsection_map_index(pfn); 1485 1486 return test_bit(idx, ms->usage->subsection_map); 1487} 1488#else 1489static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) 1490{ 1491 return 1; 1492} 1493#endif 1494 1495#ifndef CONFIG_HAVE_ARCH_PFN_VALID 1496/** 1497 * pfn_valid - check if there is a valid memory map entry for a PFN 1498 * @pfn: the page frame number to check 1499 * 1500 * Check if there is a valid memory map entry aka struct page for the @pfn. 1501 * Note, that availability of the memory map entry does not imply that 1502 * there is actual usable memory at that @pfn. The struct page may 1503 * represent a hole or an unusable page frame. 1504 * 1505 * Return: 1 for PFNs that have memory map entries and 0 otherwise 1506 */ 1507static inline int pfn_valid(unsigned long pfn) 1508{ 1509 struct mem_section *ms; 1510 1511 /* 1512 * Ensure the upper PAGE_SHIFT bits are clear in the 1513 * pfn. Else it might lead to false positives when 1514 * some of the upper bits are set, but the lower bits 1515 * match a valid pfn. 1516 */ 1517 if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) 1518 return 0; 1519 1520 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 1521 return 0; 1522 ms = __pfn_to_section(pfn); 1523 if (!valid_section(ms)) 1524 return 0; 1525 /* 1526 * Traditionally early sections always returned pfn_valid() for 1527 * the entire section-sized span. 1528 */ 1529 return early_section(ms) || pfn_section_valid(ms, pfn); 1530} 1531#endif 1532 1533static inline int pfn_in_present_section(unsigned long pfn) 1534{ 1535 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 1536 return 0; 1537 return present_section(__pfn_to_section(pfn)); 1538} 1539 1540static inline unsigned long next_present_section_nr(unsigned long section_nr) 1541{ 1542 while (++section_nr <= __highest_present_section_nr) { 1543 if (present_section_nr(section_nr)) 1544 return section_nr; 1545 } 1546 1547 return -1; 1548} 1549 1550/* 1551 * These are _only_ used during initialisation, therefore they 1552 * can use __initdata ... They could have names to indicate 1553 * this restriction. 1554 */ 1555#ifdef CONFIG_NUMA 1556#define pfn_to_nid(pfn) \ 1557({ \ 1558 unsigned long __pfn_to_nid_pfn = (pfn); \ 1559 page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ 1560}) 1561#else 1562#define pfn_to_nid(pfn) (0) 1563#endif 1564 1565void sparse_init(void); 1566#else 1567#define sparse_init() do {} while (0) 1568#define sparse_index_init(_sec, _nid) do {} while (0) 1569#define pfn_in_present_section pfn_valid 1570#define subsection_map_init(_pfn, _nr_pages) do {} while (0) 1571#endif /* CONFIG_SPARSEMEM */ 1572 1573#endif /* !__GENERATING_BOUNDS.H */ 1574#endif /* !__ASSEMBLY__ */ 1575#endif /* _LINUX_MMZONE_H */
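The zonelist machinery above (struct zoneref, first_zones_zonelist(), next_zones_zonelist() and the for_each_zone_zonelist*() macros) is easiest to read from a caller's perspective. Below is a minimal sketch, assuming ordinary kernel context; the function name sum_eligible_managed_pages() and the choice of zone_managed_pages() as the per-zone quantity are illustrative assumptions, not part of this header.

/* Illustrative only: walk every eligible zone in a node's fallback zonelist. */
static unsigned long sum_eligible_managed_pages(int nid,
						enum zone_type highest_zoneidx,
						nodemask_t *nodes)
{
	struct zonelist *zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
	struct zoneref *z;
	struct zone *zone;
	unsigned long pages = 0;

	/*
	 * The macro starts from first_zones_zonelist() and advances the
	 * zoneref cursor via next_zones_zonelist(); it visits only zones
	 * whose index is <= highest_zoneidx and whose node is in *nodes
	 * (a NULL nodemask means "any node").
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, highest_zoneidx, nodes)
		pages += zone_managed_pages(zone);

	return pages;
}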
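The comment above enum lru_list promises that each active list sits LRU_ACTIVE above its inactive counterpart and each *_FILE list sits LRU_FILE above the matching *_ANON list. A small sketch of the arithmetic this layout enables; compose_lru_index() is a made-up name for illustration (the real helpers that rely on this property live in <linux/mm_inline.h>, not here).

/* Illustrative only: build an LRU index from the "file" and "active" bits. */
static enum lru_list compose_lru_index(bool file, bool active)
{
	enum lru_list lru = LRU_BASE;

	if (file)
		lru += LRU_FILE;	/* selects the _FILE pair of lists */
	if (active)
		lru += LRU_ACTIVE;	/* selects the active list of the pair */

	return lru;
}

For example, compose_lru_index(true, true) evaluates to LRU_BASE + LRU_FILE + LRU_ACTIVE = 3, which is exactly LRU_ACTIVE_FILE in the enum.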
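zone->_watermark[] is meant to be read through the *_wmark_pages() accessors (which add watermark_boost) and consumed through zone_watermark_ok(), both provided above. A hedged sketch of typical usage; zone_allows_order0_alloc() is an illustrative name, and passing zone_idx(zone) as highest_zoneidx just means no lowmem reserve from a higher classzone is added to the mark.

/* Illustrative only: is this zone above its low watermark for an order-0 page? */
static bool zone_allows_order0_alloc(struct zone *zone)
{
	/* the accessor already folds in zone->watermark_boost */
	unsigned long mark = low_wmark_pages(zone);

	return zone_watermark_ok(zone, 0, mark, zone_idx(zone), 0);
}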