/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service. That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists */
	MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
	/*
	 * MIGRATE_CMA migration type is designed to mimic the way
	 * ZONE_MOVABLE works. Only movable pages can be allocated
	 * from MIGRATE_CMA pageblocks and the page allocator never
	 * implicitly changes the migration type of a MIGRATE_CMA pageblock.
	 *
	 * The way to use it is to change migratetype of a range of
	 * pageblocks to MIGRATE_CMA which can be done by
	 * __free_pageblock_cma() function. What is important though
	 * is that a range of pageblocks must be aligned to
	 * MAX_ORDER_NR_PAGES should the biggest page be bigger than
	 * a single pageblock.
	 */
	MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	MIGRATE_ISOLATE,	/* can't allocate from here */
#endif
	MIGRATE_TYPES
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

#ifdef CONFIG_CMA
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#else
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#endif

static inline bool is_migrate_movable(int mt)
{
	return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
}

#define for_each_migratetype_order(order, type) \
	for (order = 0; order < MAX_ORDER; order++) \
		for (type = 0; type < MIGRATE_TYPES; type++)

extern int page_group_by_mobility_disabled;

#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)

#define get_pageblock_migratetype(page) \
	get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)

struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;
};

static inline struct page *get_page_from_free_area(struct free_area *area,
					    int migratetype)
{
	return list_first_entry_or_null(&area->free_list[migratetype],
					struct page, lru);
}

static inline bool free_area_empty(struct free_area *area, int migratetype)
{
	return list_empty(&area->free_list[migratetype]);
}
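
/*
 * Example (editorial sketch, not part of the upstream header): walking the
 * buddy free lists exposed above. The function below assumes a populated
 * struct zone (defined later in this file) and must run with zone->lock
 * held; the name count_free_pages_of_type() is hypothetical and used only
 * for illustration.
 *
 *	static unsigned long count_free_pages_of_type(struct zone *zone, int mt)
 *	{
 *		unsigned long pages = 0;
 *		unsigned int order;
 *		struct page *page;
 *
 *		for (order = 0; order < MAX_ORDER; order++) {
 *			struct free_area *area = &zone->free_area[order];
 *
 *			if (free_area_empty(area, mt))
 *				continue;
 *			list_for_each_entry(page, &area->free_list[mt], lru)
 *				pages += 1UL << order;
 *		}
 *		return pages;
 *	}
 */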

struct pglist_data;

/*
 * Add a wild amount of padding here to ensure data fall into separate
 * cachelines. There are very few zone structures in the machine, so space
 * consumption is not a concern here.
 */
#if defined(CONFIG_SMP)
struct zone_padding {
	char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name)	struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif

#ifdef CONFIG_NUMA
enum numa_stat_item {
	NUMA_HIT,		/* allocated in intended node */
	NUMA_MISS,		/* allocated in non intended node */
	NUMA_FOREIGN,		/* was intended here, hit elsewhere */
	NUMA_INTERLEAVE_HIT,	/* interleaver preferred this zone */
	NUMA_LOCAL,		/* allocation from local node */
	NUMA_OTHER,		/* allocation from other node */
	NR_VM_NUMA_EVENT_ITEMS
};
#else
#define NR_VM_NUMA_EVENT_ITEMS 0
#endif

enum zone_stat_item {
	/* First 128 byte cacheline (assuming 64 bit words) */
	NR_FREE_PAGES,
	NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
	NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
	NR_ZONE_ACTIVE_ANON,
	NR_ZONE_INACTIVE_FILE,
	NR_ZONE_ACTIVE_FILE,
	NR_ZONE_UNEVICTABLE,
	NR_ZONE_WRITE_PENDING,	/* Count of dirty, writeback and unstable pages */
	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
	/* Second 128 byte cacheline */
	NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
	NR_ZSPAGES,		/* allocated in zsmalloc */
#endif
	NR_FREE_CMA_PAGES,
	NR_VM_ZONE_STAT_ITEMS };

enum node_stat_item {
	NR_LRU_BASE,
	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
	NR_SLAB_RECLAIMABLE_B,
	NR_SLAB_UNRECLAIMABLE_B,
	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
	WORKINGSET_NODES,
	WORKINGSET_REFAULT_BASE,
	WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
	WORKINGSET_REFAULT_FILE,
	WORKINGSET_ACTIVATE_BASE,
	WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
	WORKINGSET_ACTIVATE_FILE,
	WORKINGSET_RESTORE_BASE,
	WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
	WORKINGSET_RESTORE_FILE,
	WORKINGSET_NODERECLAIM,
	NR_ANON_MAPPED,	/* Mapped anonymous pages */
	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
			   only modified from process context */
	NR_FILE_PAGES,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
	NR_SHMEM,		/* shmem pages (includes tmpfs/GEM pages) */
	NR_SHMEM_THPS,
	NR_SHMEM_PMDMAPPED,
	NR_FILE_THPS,
	NR_FILE_PMDMAPPED,
	NR_ANON_THPS,
	NR_VMSCAN_WRITE,
	NR_VMSCAN_IMMEDIATE,	/* Prioritise for reclaim when writeback ends */
	NR_DIRTIED,		/* page dirtyings since bootup */
	NR_WRITTEN,		/* page writings since bootup */
	NR_THROTTLED_WRITTEN,	/* NR_WRITTEN while reclaim throttled */
	NR_KERNEL_MISC_RECLAIMABLE,	/* reclaimable non-slab kernel pages */
	NR_FOLL_PIN_ACQUIRED,	/* via: pin_user_page(), gup flag: FOLL_PIN */
	NR_FOLL_PIN_RELEASED,	/* pages returned via unpin_user_page() */
	NR_KERNEL_STACK_KB,	/* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
	NR_KERNEL_SCS_KB,	/* measured in KiB */
#endif
	NR_PAGETABLE,		/* used for pagetables */
#ifdef CONFIG_SWAP
	NR_SWAPCACHE,
#endif
	NR_VM_NODE_STAT_ITEMS
};

/*
 * Returns true if the item should be printed in THPs (/proc/vmstat
 * currently prints number of anon, file and shmem THPs. But the item
 * is charged in pages).
 */
static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
{
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return false;

	return item == NR_ANON_THPS ||
	       item == NR_FILE_THPS ||
	       item == NR_SHMEM_THPS ||
	       item == NR_SHMEM_PMDMAPPED ||
	       item == NR_FILE_PMDMAPPED;
}

/*
 * Returns true if the value is measured in bytes (most vmstat values are
 * measured in pages). This defines the API part, the internal representation
 * might be different.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
	/*
	 * Global and per-node slab counters track slab pages.
	 * It's expected that changes are multiples of PAGE_SIZE.
	 * Internally values are stored in pages.
	 *
	 * Per-memcg and per-lruvec counters track memory, consumed
	 * by individual slab objects. These counters are actually
	 * byte-precise.
	 */
	return (idx == NR_SLAB_RECLAIMABLE_B ||
		idx == NR_SLAB_UNRECLAIMABLE_B);
}

/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2

enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
	LRU_UNEVICTABLE,
	NR_LRU_LISTS
};

enum vmscan_throttle_state {
	VMSCAN_THROTTLE_WRITEBACK,
	VMSCAN_THROTTLE_ISOLATED,
	VMSCAN_THROTTLE_NOPROGRESS,
	VMSCAN_THROTTLE_CONGESTED,
	NR_VMSCAN_THROTTLE,
};

#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)

static inline bool is_file_lru(enum lru_list lru)
{
	return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
}

static inline bool is_active_lru(enum lru_list lru)
{
	return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
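
/*
 * Example (editorial sketch, not part of the upstream header): the LRU
 * arithmetic described in the comment above. Kernel code (e.g. in
 * <linux/mm_inline.h>) builds an lru_list index from the "file" and
 * "active" properties of a page roughly like this; lru_index() is a
 * hypothetical name used only for illustration.
 *
 *	static enum lru_list lru_index(bool file, bool active)
 *	{
 *		return LRU_BASE + (file ? LRU_FILE : 0) +
 *		       (active ? LRU_ACTIVE : 0);
 *	}
 *
 * For instance, LRU_ACTIVE_FILE == LRU_BASE + LRU_FILE + LRU_ACTIVE == 3,
 * one slot above LRU_INACTIVE_FILE, exactly as the comment requires.
 */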

#define ANON_AND_FILE 2

enum lruvec_flags {
	LRUVEC_CONGESTED,		/* lruvec has many dirty pages
					 * backed by a congested BDI
					 */
};

struct lruvec {
	struct list_head		lists[NR_LRU_LISTS];
	/* per lruvec lru_lock for memcg */
	spinlock_t			lru_lock;
	/*
	 * These track the cost of reclaiming one LRU - file or anon -
	 * over the other. As the observed cost of reclaiming one LRU
	 * increases, the reclaim scan balance tips toward the other.
	 */
	unsigned long			anon_cost;
	unsigned long			file_cost;
	/* Non-resident age, driven by LRU movement */
	atomic_long_t			nonresident_age;
	/* Refaults at the time of last reclaim cycle */
	unsigned long			refaults[ANON_AND_FILE];
	/* Various lruvec state flags (enum lruvec_flags) */
	unsigned long			flags;
#ifdef CONFIG_MEMCG
	struct pglist_data *pgdat;
#endif
};

/* Isolate unmapped pages */
#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)

/* LRU Isolation modes. */
typedef unsigned __bitwise isolate_mode_t;

enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
 * for pageblock size for THP if configured.
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 1
#else
#define NR_PCP_THP 0
#endif
#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))

/*
 * Shift to encode migratetype and order in the same integer, with order
 * in the least significant bits.
 */
#define NR_PCP_ORDER_WIDTH 8
#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1)

#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
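
/*
 * Example (editorial sketch, not part of the upstream header): what the
 * watermark accessors above expand to. A crude check of a zone against its
 * low watermark could read as below; the real allocator path uses
 * zone_watermark_ok(), which also honours lowmem_reserve[] and high-order
 * availability. zone_above_low_wmark() is a hypothetical name, and the
 * direct vm_stat read stands in for zone_page_state() from <linux/vmstat.h>.
 *
 *	static bool zone_above_low_wmark(struct zone *z)
 *	{
 *		unsigned long free = atomic_long_read(&z->vm_stat[NR_FREE_PAGES]);
 *
 *		return free > low_wmark_pages(z);
 *	}
 */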

/* Fields and list protected by pagesets local_lock in page_alloc.c */
struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */
	short free_factor;	/* batch scaling factor during free */
#ifdef CONFIG_NUMA
	short expire;		/* When 0, remote pagesets are drained */
#endif

	/* Lists of pages, one per migrate type stored on the pcp-lists */
	struct list_head lists[NR_PCP_LISTS];
};

struct per_cpu_zonestat {
#ifdef CONFIG_SMP
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
	s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
	/*
	 * Low priority inaccurate counters that are only folded
	 * on demand. Use a large type to avoid the overhead of
	 * folding during refresh_cpu_vm_stats.
	 */
	unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};

struct per_cpu_nodestat {
	s8 stat_threshold;
	s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};

#endif /* !__GENERATING_BOUNDS.H */

enum zone_type {
	/*
	 * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
	 * to DMA to all of the addressable memory (ZONE_NORMAL).
	 * On architectures where this area covers the whole 32 bit address
	 * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
	 * DMA addressing constraints. This distinction is important as a 32bit
	 * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
	 * platforms may need both zones as they support peripherals with
	 * different DMA addressing limitations.
	 */
#ifdef CONFIG_ZONE_DMA
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	ZONE_DMA32,
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,
#endif
	/*
	 * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
	 * movable pages with few exceptional cases described below. Main use
	 * cases for ZONE_MOVABLE are to make memory offlining/unplug more
	 * likely to succeed, and to locally limit unmovable allocations - e.g.,
	 * to increase the number of THP/huge pages. Notable special cases are:
	 *
	 * 1. Pinned pages: (long-term) pinning of movable pages might
	 *    essentially turn such pages unmovable. Therefore, we do not allow
	 *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
	 *    faulted, they come from the right zone right away. However, it is
	 *    still possible that address space already has pages in
	 *    ZONE_MOVABLE at the time when pages are pinned (i.e. user has
	 *    touched that memory before pinning). In such a case we migrate them
	 *    to a different zone. When migration fails - pinning fails.
	 * 2. memblock allocations: kernelcore/movablecore setups might create
	 *    situations where ZONE_MOVABLE contains unmovable allocations
	 *    after boot. Memory offlining and allocations fail early.
	 * 3. Memory holes: kernelcore/movablecore setups might create very rare
	 *    situations where ZONE_MOVABLE contains memory holes after boot,
	 *    for example, if we have sections that are only partially
	 *    populated. Memory offlining and allocations fail early.
	 * 4. PG_hwpoison pages: while poisoned pages can be skipped during
	 *    memory offlining, such pages cannot be allocated.
	 * 5. Unmovable PG_offline pages: in paravirtualized environments,
	 *    hotplugged memory blocks might only partially be managed by the
	 *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
	 *    parts not managed by the buddy are unmovable PG_offline pages. In
	 *    some cases (virtio-mem), such pages can be skipped during
	 *    memory offlining, however, cannot be moved/allocated. These
	 *    techniques might use alloc_contig_range() to hide previously
	 *    exposed pages from the buddy again (e.g., to implement some sort
	 *    of memory unplug in virtio-mem).
	 * 6. ZERO_PAGE(0): kernelcore/movablecore setups might create
	 *    situations where ZERO_PAGE(0), which is allocated differently
	 *    on different platforms, may end up in a movable zone. ZERO_PAGE(0)
	 *    cannot be migrated.
	 * 7. Memory-hotplug: when using memmap_on_memory and onlining the
	 *    memory to the MOVABLE zone, the vmemmap pages are also placed in
	 *    such zone. Such pages cannot be really moved around as they are
	 *    self-stored in the range, but they are treated as movable when
	 *    the range they describe is about to be offlined.
	 *
	 * In general, no unmovable allocations that degrade memory offlining
	 * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
	 * have to expect that migrating pages in ZONE_MOVABLE can fail (even
	 * if has_unmovable_pages() states that there are no unmovable pages,
	 * there can be false negatives).
	 */
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	__MAX_NR_ZONES

};

#ifndef __GENERATING_BOUNDS_H

#define ASYNC_AND_SYNC 2

struct zone {
	/* Read-mostly fields */

	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long _watermark[NR_WMARK];
	unsigned long watermark_boost;

	unsigned long nr_reserved_highatomic;

	/*
	 * We don't know if the memory that we're going to allocate will be
	 * freeable or/and it will be released eventually, so to avoid totally
	 * wasting several GB of ram we must reserve some of the lower zone
	 * memory (otherwise we risk running OOM on the lower zones despite
	 * there being tons of freeable ram on the higher zones). This array is
	 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
	 * changes.
	 */
	long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pages	__percpu *per_cpu_pageset;
	struct per_cpu_zonestat	__percpu *per_cpu_zonestats;
	/*
	 * the high and batch values are copied to individual pagesets for
	 * faster access
	 */
	int pageset_high;
	int pageset_batch;

#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;

	/*
	 * spanned_pages is the total pages spanned by the zone, including
	 * holes, which is calculated as:
	 *	spanned_pages = zone_end_pfn - zone_start_pfn;
	 *
	 * present_pages is physical pages existing within the zone, which
	 * is calculated as:
	 *	present_pages = spanned_pages - absent_pages(pages in holes);
	 *
	 * present_early_pages is present pages existing within the zone
	 * located on memory available since early boot, excluding hotplugged
	 * memory.
	 *
	 * managed_pages is present pages managed by the buddy system, which
	 * is calculated as (reserved_pages includes pages allocated by the
	 * bootmem allocator):
	 *	managed_pages = present_pages - reserved_pages;
	 *
	 * cma pages is present pages that are assigned for CMA use
	 * (MIGRATE_CMA).
	 *
	 * So present_pages may be used by memory hotplug or memory power
	 * management logic to figure out unmanaged pages by checking
	 * (present_pages - managed_pages). And managed_pages should be used
	 * by page allocator and vm scanner to calculate all kinds of watermarks
	 * and thresholds.
	 *
	 * Locking rules:
	 *
	 * zone_start_pfn and spanned_pages are protected by span_seqlock.
	 * It is a seqlock because it has to be read outside of zone->lock,
	 * and it is done in the main allocator path. But, it is written
	 * quite infrequently.
	 *
	 * The span_seq lock is declared along with zone->lock because it is
	 * frequently read in proximity to zone->lock. It's good to
	 * give them a chance of being in the same cacheline.
	 *
	 * Write access to present_pages at runtime should be protected by
	 * mem_hotplug_begin/end(). Any reader who can't tolerate drift of
	 * present_pages should use get_online_mems() to get a stable value.
	 */
	atomic_long_t		managed_pages;
	unsigned long		spanned_pages;
	unsigned long		present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
	unsigned long		present_early_pages;
#endif
#ifdef CONFIG_CMA
	unsigned long		cma_pages;
#endif

	const char		*name;

#ifdef CONFIG_MEMORY_ISOLATION
	/*
	 * Number of isolated pageblocks. It is used to solve incorrect
	 * freepage counting problem due to racy retrieving migratetype
	 * of pageblock. Protected by zone->lock.
	 */
	unsigned long		nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t		span_seqlock;
#endif

	int initialized;

	/* Write-intensive fields used from the page allocator */
	ZONE_PADDING(_pad1_)

	/* free areas of different sizes */
	struct free_area	free_area[MAX_ORDER];

	/* zone flags, see below */
	unsigned long		flags;

	/* Primarily protects free_area */
	spinlock_t		lock;

	/* Write-intensive fields used by compaction and vmstats. */
	ZONE_PADDING(_pad2_)

	/*
	 * When free pages are below this point, additional steps are taken
	 * when reading the number of free pages to avoid per-cpu counter
	 * drift allowing watermarks to be breached
	 */
	unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* pfn where compaction free scanner should start */
	unsigned long		compact_cached_free_pfn;
	/* pfn where compaction migration scanner should start */
	unsigned long		compact_cached_migrate_pfn[ASYNC_AND_SYNC];
	unsigned long		compact_init_migrate_pfn;
	unsigned long		compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
	/*
	 * On compaction failure, 1<<compact_defer_shift compactions
	 * are skipped before trying again. The number attempted since
	 * last failure is tracked with compact_considered.
	 * compact_order_failed is the minimum compaction failed order.
	 */
	unsigned int		compact_considered;
	unsigned int		compact_defer_shift;
	int			compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* Set to true when the PG_migrate_skip bits should be cleared */
	bool			compact_blockskip_flush;
#endif

	bool			contiguous;

	ZONE_PADDING(_pad3_)
	/* Zone statistics */
	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t		vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;

enum pgdat_flags {
	PGDAT_DIRTY,			/* reclaim scanning has recently found
					 * many dirty file pages at the tail
					 * of the LRU.
					 */
	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
					 * many pages under writeback
					 */
	PGDAT_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
};

enum zone_flags {
	ZONE_BOOSTED_WATERMARK,		/* zone recently boosted watermarks.
					 * Cleared when kswapd is woken.
					 */
	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */
};

static inline unsigned long zone_managed_pages(struct zone *zone)
{
	return (unsigned long)atomic_long_read(&zone->managed_pages);
}

static inline unsigned long zone_cma_pages(struct zone *zone)
{
#ifdef CONFIG_CMA
	return zone->cma_pages;
#else
	return 0;
#endif
}

static inline unsigned long zone_end_pfn(const struct zone *zone)
{
	return zone->zone_start_pfn + zone->spanned_pages;
}

static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
}

static inline bool zone_is_initialized(struct zone *zone)
{
	return zone->initialized;
}

static inline bool zone_is_empty(struct zone *zone)
{
	return zone->spanned_pages == 0;
}

/*
 * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
 * intersection with the given zone
 */
static inline bool zone_intersects(struct zone *zone,
		unsigned long start_pfn, unsigned long nr_pages)
{
	if (zone_is_empty(zone))
		return false;
	if (start_pfn >= zone_end_pfn(zone) ||
	    start_pfn + nr_pages <= zone->zone_start_pfn)
		return false;

	return true;
}
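
/*
 * Example (editorial sketch, not part of the upstream header): using the
 * span helpers above. Note that a zone's span may contain holes, so
 * zone_spans_pfn() only says the pfn lies inside
 * [zone_start_pfn, zone_end_pfn), not that a usable page exists there.
 * range_in_one_zone() is a hypothetical name.
 *
 *	static bool range_in_one_zone(struct zone *zone, unsigned long start_pfn,
 *				      unsigned long nr_pages)
 *	{
 *		return zone_spans_pfn(zone, start_pfn) &&
 *		       zone_spans_pfn(zone, start_pfn + nr_pages - 1);
 *	}
 */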

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

enum {
	ZONELIST_FALLBACK,	/* zonelist with fallback */
#ifdef CONFIG_NUMA
	/*
	 * The NUMA zonelists are doubled because we need zonelists that
	 * restrict the allocations to a single node for __GFP_THISNODE.
	 */
	ZONELIST_NOFALLBACK,	/* zonelist without fallback (__GFP_THISNODE) */
#endif
	MAX_ZONELISTS
};

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()	- Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()	- Return the index of the zone for an entry
 * zonelist_node_idx()	- Return the index of the node for an entry
 */
struct zonelist {
	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

/*
 * The array of struct pages for flatmem.
 * It must be declared for SPARSEMEM as well because there are configurations
 * that rely on that.
 */
extern struct page *mem_map;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split {
	spinlock_t split_queue_lock;
	struct list_head split_queue;
	unsigned long split_queue_len;
};
#endif

/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
	/*
	 * node_zones contains just the zones for THIS node. Not all of the
	 * zones may be populated, but it is the full list. It is referenced by
	 * this node's node_zonelists as well as other nodes' node_zonelists.
	 */
	struct zone node_zones[MAX_NR_ZONES];

	/*
	 * node_zonelists contains references to all zones in all nodes.
	 * Generally the first zones will be references to this node's
	 * node_zones.
	 */
	struct zonelist node_zonelists[MAX_ZONELISTS];

	int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM	/* means !SPARSEMEM */
	struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
	struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
	/*
	 * Must be held any time you expect node_start_pfn,
	 * node_present_pages, node_spanned_pages or nr_zones to stay constant.
	 * Also synchronizes pgdat->first_deferred_pfn during deferred page
	 * init.
	 *
	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
	 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
	 *
	 * Nests above zone->lock and zone->span_seqlock
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;
	wait_queue_head_t kswapd_wait;
	wait_queue_head_t pfmemalloc_wait;

	/* workqueues for throttling reclaim for different reasons. */
	wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];

	atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
	unsigned long nr_reclaim_start;	/* nr pages written while throttled
					 * when throttling started. */
	struct task_struct *kswapd;	/* Protected by
					   mem_hotplug_begin/end() */
	int kswapd_order;
	enum zone_type kswapd_highest_zoneidx;

	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
	int kcompactd_max_order;
	enum zone_type kcompactd_highest_zoneidx;
	wait_queue_head_t kcompactd_wait;
	struct task_struct *kcompactd;
	bool proactive_compact_trigger;
#endif
	/*
	 * This is a per-node reserve of pages that are not available
	 * to userspace allocations.
	 */
	unsigned long		totalreserve_pages;

#ifdef CONFIG_NUMA
	/*
	 * node reclaim becomes active if more unmapped pages exist.
	 */
	unsigned long		min_unmapped_pages;
	unsigned long		min_slab_pages;
#endif /* CONFIG_NUMA */

	/* Write-intensive fields used by page reclaim */
	ZONE_PADDING(_pad1_)

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	/*
	 * If memory initialisation on large machines is deferred then this
	 * is the first PFN that needs to be initialised.
	 */
	unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	struct deferred_split deferred_split_queue;
#endif

	/* Fields commonly accessed by the page reclaim scanner */

	/*
	 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
	 *
	 * Use mem_cgroup_lruvec() to look up lruvecs.
	 */
	struct lruvec		__lruvec;

	unsigned long		flags;

	ZONE_PADDING(_pad2_)

	/* Per-node vmstats */
	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;

#define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
#ifdef CONFIG_FLATMEM
#define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
#else
#define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
#endif
#define nid_page_nr(nid, pagenr)	pgdat_page_nr(NODE_DATA(nid),(pagenr))

#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}

static inline bool pgdat_is_empty(pg_data_t *pgdat)
{
	return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
}

#include <linux/memory_hotplug.h>

void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
		   enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
			 int highest_zoneidx, unsigned int alloc_flags,
			 long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
		unsigned long mark, int highest_zoneidx,
		unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
		unsigned long mark, int highest_zoneidx);
/*
 * Memory initialization context, used to differentiate memory added by
 * the platform statically or via the memory hotplug interface.
 */
enum meminit_context {
	MEMINIT_EARLY,
	MEMINIT_HOTPLUG,
};

extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
				     unsigned long size);

extern void lruvec_init(struct lruvec *lruvec);

static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
#ifdef CONFIG_MEMCG
	return lruvec->pgdat;
#else
	return container_of(lruvec, struct pglist_data, __lruvec);
#endif
}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
static inline int local_memory_node(int node_id) { return node_id; };
#endif

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 */
#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)

#ifdef CONFIG_ZONE_DEVICE
static inline bool zone_is_zone_device(struct zone *zone)
{
	return zone_idx(zone) == ZONE_DEVICE;
}
#else
static inline bool zone_is_zone_device(struct zone *zone)
{
	return false;
}
#endif

/*
 * Returns true if a zone has pages managed by the buddy allocator.
 * All the reclaim decisions have to use this function rather than
 * populated_zone(). If the whole zone is reserved then we can easily
 * end up with populated_zone() && !managed_zone().
 */
static inline bool managed_zone(struct zone *zone)
{
	return zone_managed_pages(zone);
}

/* Returns true if a zone has memory */
static inline bool populated_zone(struct zone *zone)
{
	return zone->present_pages;
}

#ifdef CONFIG_NUMA
static inline int zone_to_nid(struct zone *zone)
{
	return zone->node;
}

static inline void zone_set_nid(struct zone *zone, int nid)
{
	zone->node = nid;
}
#else
static inline int zone_to_nid(struct zone *zone)
{
	return 0;
}

static inline void zone_set_nid(struct zone *zone, int nid) {}
#endif

extern int movable_zone;

static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
	return (idx == ZONE_HIGHMEM ||
		(idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM));
#else
	return 0;
#endif
}

#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void);
#else
static inline bool has_managed_dma(void)
{
	return false;
}
#endif

/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not.  This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone: pointer to struct zone variable
 * Return: 1 for a highmem zone, 0 otherwise
 */
static inline int is_highmem(struct zone *zone)
{
#ifdef CONFIG_HIGHMEM
	return is_highmem_idx(zone_idx(zone));
#else
	return 0;
#endif
}

/* These two functions are used to setup the per zone pages min values */
struct ctl_table;

int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
		loff_t *);
int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
		size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
		size_t *, loff_t *);
int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
		void *, size_t *, loff_t *);
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
		void *, size_t *, loff_t *);
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
		void *, size_t *, loff_t *);
int numa_zonelist_order_handler(struct ctl_table *, int,
		void *, size_t *, loff_t *);
extern int percpu_pagelist_high_fraction;
extern char numa_zonelist_order[];
#define NUMA_ZONELIST_ORDER_LEN	16

#ifndef CONFIG_NUMA

extern struct pglist_data contig_page_data;
static inline struct pglist_data *NODE_DATA(int nid)
{
	return &contig_page_data;
}
#define NODE_MEM_MAP(nid)	mem_map

#else /* CONFIG_NUMA */

#include <asm/mmzone.h>

#endif /* !CONFIG_NUMA */

extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);

/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 */
#define for_each_online_pgdat(pgdat)			\
	for (pgdat = first_online_pgdat();		\
	     pgdat;					\
	     pgdat = next_online_pgdat(pgdat))
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in.
 */
#define for_each_zone(zone)				\
	for (zone = (first_online_pgdat())->node_zones; \
	     zone;					\
	     zone = next_zone(zone))

#define for_each_populated_zone(zone)			\
	for (zone = (first_online_pgdat())->node_zones; \
	     zone;					\
	     zone = next_zone(zone))			\
		if (!populated_zone(zone))		\
			; /* do nothing */		\
		else
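
/*
 * Example (editorial sketch, not part of the upstream header): iterating
 * zones with the macros above. Summing zone_managed_pages() rather than
 * present_pages reflects the managed_zone()/populated_zone() distinction
 * documented earlier; total_managed_pages() is a hypothetical name.
 *
 *	static unsigned long total_managed_pages(void)
 *	{
 *		struct zone *zone;
 *		unsigned long pages = 0;
 *
 *		for_each_populated_zone(zone)
 *			pages += zone_managed_pages(zone);
 *		return pages;
 *	}
 */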

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
	return zoneref->zone;
}

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
	return zoneref->zone_idx;
}

static inline int zonelist_node_idx(struct zoneref *zoneref)
{
	return zone_to_nid(zoneref->zone);
}

struct zoneref *__next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes);

/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the next zone at or below a given zone index that is
 * within the allowed nodemask using a cursor as the starting point for the
 * search. The zoneref returned is a cursor that represents the current zone
 * being examined. It should be advanced by one before calling
 * next_zones_zonelist again.
 *
 * Return: the next zone at or below highest_zoneidx within the allowed
 * nodemask using a cursor within a zonelist as a starting point
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
		return z;
	return __next_zones_zonelist(z, highest_zoneidx, nodes);
}

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the first zone at or below a given zone index that is
 * within the allowed nodemask. The zoneref returned is a cursor that can be
 * used to iterate the zonelist with next_zones_zonelist by advancing it by
 * one before calling.
 *
 * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
 * never NULL). This may happen either genuinely, or due to concurrent nodemask
 * update due to cpuset modification.
 *
 * Return: Zoneref pointer for the first suitable zone found
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	return next_zones_zonelist(zonelist->_zonerefs,
							highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
	for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask),	\
			zone = zonelist_zone(z))

#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
	for (zone = z->zone;	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask),	\
			zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->zones being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
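
/*
 * Example (editorial sketch, not part of the upstream header): how a
 * zonelist is typically walked. The zonelist would normally come from the
 * preferred node, e.g. NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
 * passing a NULL nodemask means "no node restriction".
 *
 *	struct zoneref *z;
 *	struct zone *zone;
 *
 *	for_each_zone_zonelist_nodemask(zone, z, zonelist, ZONE_NORMAL, NULL) {
 *		if (!managed_zone(zone))
 *			continue;
 *		(consider 'zone' for the allocation, highest priority first)
 *	}
 */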

/* Whether the 'nodes' are all movable nodes */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
	struct zonelist *zonelist;
	struct zoneref *z;
	int nid;

	if (nodes_empty(*nodes))
		return false;

	/*
	 * We can choose an arbitrary node from the nodemask to get a
	 * zonelist as they are interlinked. We just need to find
	 * at least one zone that can satisfy kernel allocations.
	 */
	nid = first_node(*nodes);
	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
	z = first_zones_zonelist(zonelist, ZONE_NORMAL,	nodes);
	return (!z->zone) ? true : false;
}


#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn)		(0)
#endif

#ifdef CONFIG_SPARSEMEM

/*
 * PA_SECTION_SHIFT		physical address to/from section number
 * PFN_SECTION_SHIFT		pfn to/from section number
 */
#define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

#define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)

#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))

#define SECTION_BLOCKFLAGS_BITS \
	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif

static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> PFN_SECTION_SHIFT;
}
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
	return sec << PFN_SECTION_SHIFT;
}

#define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)

#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)

#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))

#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif

#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
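
/*
 * Example (editorial sketch, not part of the upstream header): the
 * pfn <-> section arithmetic above. With SECTION_SIZE_BITS == 27 and 4 KiB
 * pages (e.g. x86-64), PFN_SECTION_SHIFT is 15, so each section covers
 * 32768 pages (128 MiB):
 *
 *	unsigned long nr    = pfn_to_section_nr(pfn);	(pfn >> 15)
 *	unsigned long first = section_nr_to_pfn(nr);	(first pfn of that section)
 *	unsigned long down  = SECTION_ALIGN_DOWN(pfn);	(same value as 'first')
 */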
struct mem_section_usage {
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
	/* See declaration of similar field in struct zone */
	unsigned long pageblock_flags[0];
};

void subsection_map_init(unsigned long pfn, unsigned long nr_pages);

struct page;
struct page_ext;
struct mem_section {
	/*
	 * This is, logically, a pointer to an array of struct
	 * pages.  However, it is stored with some other magic.
	 * (see sparse.c::sparse_init_one_section())
	 *
	 * Additionally during early boot we encode node id of
	 * the location of the section here to guide allocation.
	 * (see sparse.c::memory_present())
	 *
	 * Making it a UL at least makes someone do a cast
	 * before using it wrong.
	 */
	unsigned long section_mem_map;

	struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
	/*
	 * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
	 * section. (see page_ext.h about this.)
	 */
	struct page_ext *page_ext;
	unsigned long pad;
#endif
	/*
	 * WARNING: mem_section must be a power-of-2 in size for the
	 * calculation and use of SECTION_ROOT_MASK to make sense.
	 */
};

#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT	(PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT	1
#endif

#define SECTION_NR_TO_ROOT(sec)	((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS	DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK	(SECTIONS_PER_ROOT - 1)

#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
	return ms->usage->pageblock_flags;
}

static inline struct mem_section *__nr_to_section(unsigned long nr)
{
#ifdef CONFIG_SPARSEMEM_EXTREME
	if (!mem_section)
		return NULL;
#endif
	if (!mem_section[SECTION_NR_TO_ROOT(nr)])
		return NULL;
	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}
extern size_t mem_section_usage_size(void);

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum).  The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 *      To sum it up, at least 6 bits are available.
 */
#define SECTION_MARKED_PRESENT		(1UL<<0)
#define SECTION_HAS_MEM_MAP		(1UL<<1)
#define SECTION_IS_ONLINE		(1UL<<2)
#define SECTION_IS_EARLY		(1UL<<3)
#define SECTION_TAINT_ZONE_DEVICE	(1UL<<4)
#define SECTION_MAP_LAST_BIT		(1UL<<5)
#define SECTION_MAP_MASK		(~(SECTION_MAP_LAST_BIT-1))
#define SECTION_NID_SHIFT		6

static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
	unsigned long map = section->section_mem_map;
	map &= SECTION_MAP_MASK;
	return (struct page *)map;
}

static inline int present_section(struct mem_section *section)
{
	return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
}

static inline int present_section_nr(unsigned long nr)
{
	return present_section(__nr_to_section(nr));
}

static inline int valid_section(struct mem_section *section)
{
	return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
}

static inline int early_section(struct mem_section *section)
{
	return (section && (section->section_mem_map & SECTION_IS_EARLY));
}

static inline int valid_section_nr(unsigned long nr)
{
	return valid_section(__nr_to_section(nr));
}

static inline int online_section(struct mem_section *section)
{
	return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}

static inline int online_device_section(struct mem_section *section)
{
	unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;

	return section && ((section->section_mem_map & flags) == flags);
}

static inline int online_section_nr(unsigned long nr)
{
	return online_section(__nr_to_section(nr));
}

#ifdef CONFIG_MEMORY_HOTPLUG
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif

static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
	return __nr_to_section(pfn_to_section_nr(pfn));
}

extern unsigned long __highest_present_section_nr;

static inline int subsection_map_index(unsigned long pfn)
{
	return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
	int idx = subsection_map_index(pfn);

	return test_bit(idx, ms->usage->subsection_map);
}
#else
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
	return 1;
}
#endif

#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
 * pfn_valid - check if there is a valid memory map entry for a PFN
 * @pfn: the page frame number to check
 *
 * Check if there is a valid memory map entry aka struct page for the @pfn.
 * Note, that availability of the memory map entry does not imply that
 * there is actual usable memory at that @pfn. The struct page may
 * represent a hole or an unusable page frame.
 *
 * Return: 1 for PFNs that have memory map entries and 0 otherwise
 */
static inline int pfn_valid(unsigned long pfn)
{
	struct mem_section *ms;

	/*
	 * Ensure the upper PAGE_SHIFT bits are clear in the
	 * pfn. Else it might lead to false positives when
	 * some of the upper bits are set, but the lower bits
	 * match a valid pfn.
	 */
	if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
		return 0;

	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
		return 0;
	ms = __pfn_to_section(pfn);
	if (!valid_section(ms))
		return 0;
	/*
	 * Traditionally early sections always returned pfn_valid() for
	 * the entire section-sized span.
	 */
	return early_section(ms) || pfn_section_valid(ms, pfn);
}
#endif

static inline int pfn_in_present_section(unsigned long pfn)
{
	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
		return 0;
	return present_section(__pfn_to_section(pfn));
}

static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
	while (++section_nr <= __highest_present_section_nr) {
		if (present_section_nr(section_nr))
			return section_nr;
	}

	return -1;
}

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 */
#ifdef CONFIG_NUMA
#define pfn_to_nid(pfn)							\
({									\
	unsigned long __pfn_to_nid_pfn = (pfn);				\
	page_to_nid(pfn_to_page(__pfn_to_nid_pfn));			\
})
#else
#define pfn_to_nid(pfn)		(0)
#endif

void sparse_init(void);
#else
#define sparse_init()	do {} while (0)
#define sparse_index_init(_sec, _nid)	do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */

#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */