#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifdef __KERNEL__
#ifndef __ASSEMBLY__

#include <linux/config.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <asm/atomic.h>

/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif

struct free_area {
	struct list_head	free_list;
	unsigned long		nr_free;
};

struct pglist_data;

/*
 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
 * So add a wild amount of padding here to ensure that they fall into separate
 * cachelines.  There are very few zone structures in the machine, so space
 * consumption is not a concern here.
 */
#if defined(CONFIG_SMP)
struct zone_padding {
	char x[0];
} ____cacheline_maxaligned_in_smp;
#define ZONE_PADDING(name)	struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif

struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int low;		/* low watermark, refill needed */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */
	struct list_head list;	/* the list of pages */
};

struct per_cpu_pageset {
	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
#ifdef CONFIG_NUMA
	unsigned long numa_hit;		/* allocated in intended node */
	unsigned long numa_miss;	/* allocated in non intended node */
	unsigned long numa_foreign;	/* was intended here, hit elsewhere */
	unsigned long interleave_hit;	/* interleaver preferred this zone */
	unsigned long local_node;	/* allocation from local node */
	unsigned long other_node;	/* allocation from other node */
#endif
} ____cacheline_aligned_in_smp;

#ifdef CONFIG_NUMA
#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
#else
#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
#endif

#define ZONE_DMA		0
#define ZONE_NORMAL		1
#define ZONE_HIGHMEM		2

#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */


/*
 * When a memory allocation must conform to specific limitations (such
 * as being suitable for DMA) the caller will pass in hints to the
 * allocator in the gfp_mask, in the zone modifier bits.  These bits
 * are used to select a priority ordered list of memory zones which
 * match the requested limits.  GFP_ZONEMASK defines which bits within
 * the gfp_mask should be considered as zone modifiers.  Each valid
 * combination of the zone modifier bits has a corresponding list
 * of zones (in node_zonelists).  Thus for two zone modifiers there
 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
 * be 8 (2 ** 3) zonelists.  GFP_ZONETYPES defines the number of possible
 * combinations of zone modifiers in "zone modifier space".
 */
#define GFP_ZONEMASK	0x03
/*
 * As an optimisation any zone modifier bits which are only valid when
 * no other zone modifier bits are set (loners) should be placed in
 * the highest order bits of this field.  This allows us to reduce the
 * extent of the zonelists thus saving space.  For example in the case
 * of three zone modifier bits, we could require up to eight zonelists.
 * If the leftmost zone modifier is a "loner" then the highest valid
 * zonelist would be four, allowing us to allocate only five zonelists.
 * Use the first form when the leftmost bit is not a "loner", otherwise
 * use the second.
 */
/* #define GFP_ZONETYPES	(GFP_ZONEMASK + 1) */		/* Non-loner */
#define GFP_ZONETYPES	((GFP_ZONEMASK + 1) / 2 + 1)		/* Loner */
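
/*
 * Illustrative sketch, not part of the original header: working the loner
 * arithmetic through with the values above.  GFP_ZONEMASK == 0x03 gives two
 * zone-modifier bits, so the non-loner form would need 2^2 = 4 zonelists per
 * node, one per bit combination.  Because the topmost modifier is a "loner"
 * (only valid on its own, per the comment above), the combination 0x03 never
 * occurs and only indices 0..2 need zonelists: (0x03 + 1) / 2 + 1 == 3.
 * Likewise, MAX_ORDER == 11 means the buddy allocator manages blocks of up
 * to 2^(MAX_ORDER - 1) = 1024 pages, i.e. 4 MB with 4 KB pages.  The
 * preprocessor checks below merely restate those two numbers.
 */
#if GFP_ZONETYPES != 3
#error "worked example above is stale: GFP_ZONETYPES is no longer 3"
#endif
#if !defined(CONFIG_FORCE_MAX_ZONEORDER) && MAX_ORDER != 11
#error "worked example above is stale: the default MAX_ORDER is no longer 11"
#endif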

/*
 * On machines where it is needed (e.g. PCs) we divide physical memory
 * into multiple physical zones.  On a PC we have 3 zones:
 *
 * ZONE_DMA	  < 16 MB	ISA DMA capable memory
 * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
 * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
 */

struct zone {
	/* Fields commonly accessed by the page allocator */
	unsigned long		free_pages;
	unsigned long		pages_min, pages_low, pages_high;
	/*
	 * We don't know whether the memory we are going to allocate will be
	 * freeable or whether it will eventually be released, so to avoid
	 * totally wasting several GB of ram we must reserve some of the lower
	 * zone memory (otherwise we risk running OOM on the lower zones even
	 * though there is plenty of freeable ram on the higher zones).  This
	 * array is recalculated at runtime if the sysctl_lowmem_reserve_ratio
	 * sysctl changes.
	 */
	unsigned long		lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	struct per_cpu_pageset	*pageset[NR_CPUS];
#else
	struct per_cpu_pageset	pageset[NR_CPUS];
#endif
	/*
	 * free areas of different sizes
	 */
	spinlock_t		lock;
	struct free_area	free_area[MAX_ORDER];


	ZONE_PADDING(_pad1_)

	/* Fields commonly accessed by the page reclaim scanner */
	spinlock_t		lru_lock;
	struct list_head	active_list;
	struct list_head	inactive_list;
	unsigned long		nr_scan_active;
	unsigned long		nr_scan_inactive;
	unsigned long		nr_active;
	unsigned long		nr_inactive;
	unsigned long		pages_scanned;	   /* since last reclaim */
	int			all_unreclaimable; /* All pages pinned */

	/*
	 * Does the allocator try to reclaim pages from the zone as soon
	 * as it fails a watermark_ok() in __alloc_pages?
	 */
	int			reclaim_pages;
	/* A count of how many reclaimers are scanning this zone */
	atomic_t		reclaim_in_progress;

	/*
	 * prev_priority holds the scanning priority for this zone.  It is
	 * defined as the scanning priority at which we achieved our reclaim
	 * target at the previous try_to_free_pages() or balance_pgdat()
	 * invocation.
	 *
	 * We use prev_priority as a measure of how much stress page reclaim is
	 * under - it drives the swappiness decision: whether to unmap mapped
	 * pages.
	 *
	 * temp_priority is used to remember the scanning priority at which
	 * this zone was successfully refilled to free_pages == pages_high.
	 *
	 * Access to both these fields is quite racy even on uniprocessor.  But
	 * it is expected to average out OK.
	 */
	int temp_priority;
	int prev_priority;


	ZONE_PADDING(_pad2_)
	/* Rarely used or read-mostly fields */

	/*
	 * wait_table		-- the array holding the hash table
	 * wait_table_size	-- the size of the hash table array
	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
	 *
	 * The purpose of all these is to keep track of the people
	 * waiting for a page to become available and make them
	 * runnable again when possible.  The trouble is that this
	 * consumes a lot of space, especially when so few things
	 * wait on pages at a given time.  So instead of using
	 * per-page waitqueues, we use a waitqueue hash table.
	 *
	 * The bucket discipline is to sleep on the same queue when
	 * colliding and wake all in that wait queue when removing.
	 * When something wakes, it must check to be sure its page is
	 * truly available, a la thundering herd.  The cost of a
	 * collision is great, but given the expected load of the
	 * table, they should be so rare as to be outweighed by the
	 * benefits from the saved space.
	 *
	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c are the
	 * primary users of these fields, and in mm/page_alloc.c
	 * free_area_init_core() performs the initialization of them.
	 */
	wait_queue_head_t	*wait_table;
	unsigned long		wait_table_size;
	unsigned long		wait_table_bits;

	/*
	 * Discontig memory support fields.
	 */
	struct pglist_data	*zone_pgdat;
	struct page		*zone_mem_map;
	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;

	unsigned long		spanned_pages;	/* total size, including holes */
	unsigned long		present_pages;	/* amount of memory (excluding holes) */

	/*
	 * rarely used fields:
	 */
	char			*name;
} ____cacheline_maxaligned_in_smp;
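
/*
 * Illustrative sketch, not part of the original header: a much simplified
 * version of the test the allocator applies against the watermarks and
 * lowmem_reserve[] above.  "idx" is the index of the zone class the
 * allocation was originally aimed at (the alloc_type argument of
 * zone_watermark_ok(), declared further down); the reserve is what keeps
 * highmem-capable allocations from draining the lower zones.  The real
 * check in mm/page_alloc.c also walks free_area[] to verify that a block
 * of the requested order exists; the function name is made up.
 */
static inline int example_zone_roughly_ok(struct zone *z, unsigned long mark,
					  int idx)
{
	/* "ok" only if free pages exceed the watermark plus the reserve */
	return z->free_pages > mark + z->lowmem_reserve[idx];
}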

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go.  A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of
 * the queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/*
 * One allocation request operates on a zonelist.  A zonelist is a list of
 * zones; the first one is the 'goal' of the allocation, the other zones are
 * fallback zones, in decreasing priority.
 *
 * Right now a zonelist takes up less than a cacheline.  We never modify it
 * apart from boot-up, and only a few indices are used, so despite the
 * zonelist table being relatively big, the cache footprint of this construct
 * is very small.
 */
struct zonelist {
	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};


/*
 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
 * (mostly NUMA machines?) to denote a higher-level memory zone than the
 * zone denotes.
 *
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
struct bootmem_data;
typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];
	struct zonelist node_zonelists[GFP_ZONETYPES];
	int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	struct page *node_mem_map;
#endif
	struct bootmem_data *bdata;
	unsigned long node_start_pfn;
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;
	struct pglist_data *pgdat_next;
	wait_queue_head_t kswapd_wait;
	struct task_struct *kswapd;
	int kswapd_max_order;
} pg_data_t;
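
/*
 * Illustrative sketch, not part of the original header: a toy version of
 * the fallback walk the allocator performs over the NULL-delimited zones[]
 * array of a zonelist.  The real __alloc_pages() checks watermarks and
 * lowmem_reserve rather than a raw free-page count, and the helper name is
 * made up for the example.
 */
static inline struct zone *example_first_zone_with(struct zonelist *zl,
						   unsigned long min_free)
{
	int i;

	/* zones[] is tried in order: goal zone first, then the fallbacks */
	for (i = 0; zl->zones[i] != NULL; i++)
		if (zl->zones[i]->free_pages >= min_free)
			return zl->zones[i];
	return NULL;		/* every zone in the list is too low */
}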

#define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
#ifdef CONFIG_FLAT_NODE_MEM_MAP
#define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
#else
#define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
#endif
#define nid_page_nr(nid, pagenr)	pgdat_page_nr(NODE_DATA(nid), (pagenr))

extern struct pglist_data *pgdat_list;

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat);
void get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free);
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		int alloc_type, int can_try_harder, int gfp_high);

#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
#else
static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
#endif

#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
#endif

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 */
#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)

/**
 * for_each_pgdat - helper macro to iterate over all nodes
 * @pgdat - pointer to a pg_data_t variable
 *
 * Meant to help with common loops of the form
 * pgdat = pgdat_list;
 * while(pgdat) {
 *	...
 *	pgdat = pgdat->pgdat_next;
 * }
 */
#define for_each_pgdat(pgdat) \
	for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)

/*
 * next_zone - helper magic for for_each_zone()
 * Thanks to William Lee Irwin III for this piece of ingenuity.
 */
static inline struct zone *next_zone(struct zone *zone)
{
	pg_data_t *pgdat = zone->zone_pgdat;

	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
		zone++;
	else if (pgdat->pgdat_next) {
		pgdat = pgdat->pgdat_next;
		zone = pgdat->node_zones;
	} else
		zone = NULL;

	return zone;
}

/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone - pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in.  This basically means for_each_zone() is an
 * easier to read version of this piece of code:
 *
 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) {
 *	for (i = 0; i < MAX_NR_ZONES; ++i) {
 *		struct zone *z = pgdat->node_zones + i;
 *		...
 *	}
 * }
 */
#define for_each_zone(zone) \
	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
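
/*
 * Illustrative sketch, not part of the original header: the iterator above
 * in use.  Summing zone->free_pages over every zone of every node (without
 * any locking, so the result is only approximate) is roughly what the
 * global free-memory reports boil down to.  The function name is made up
 * for the example.
 */
static inline unsigned long example_total_free_pages(void)
{
	struct zone *zone;
	unsigned long sum = 0;

	for_each_zone(zone)
		sum += zone->free_pages;
	return sum;
}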

static inline int is_highmem_idx(int idx)
{
	return (idx == ZONE_HIGHMEM);
}

static inline int is_normal_idx(int idx)
{
	return (idx == ZONE_NORMAL);
}

/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not.  This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone - pointer to struct zone variable
 */
static inline int is_highmem(struct zone *zone)
{
	return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
}

static inline int is_normal(struct zone *zone)
{
	return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
}

/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
					void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
					void __user *, size_t *, loff_t *);

#include <linux/topology.h>
/* Returns the number of the current Node. */
#define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))

#ifndef CONFIG_NEED_MULTIPLE_NODES

extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)		(&contig_page_data)
#define NODE_MEM_MAP(nid)	mem_map
#define MAX_NODES_SHIFT		1
#define pfn_to_nid(pfn)		(0)

#else /* CONFIG_NEED_MULTIPLE_NODES */

#include <asm/mmzone.h>

#endif /* !CONFIG_NEED_MULTIPLE_NODES */

#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
/*
 * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
 */
#define FLAGS_RESERVED		8

#elif BITS_PER_LONG == 64
/*
 * with 64 bit flags field, there's plenty of room.
 */
#define FLAGS_RESERVED		32

#else

#error BITS_PER_LONG not defined

#endif

#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
#define early_pfn_to_nid(nid)	(0UL)
#endif

#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)

#ifdef CONFIG_SPARSEMEM

/*
 * SECTIONS_SHIFT		#bits space required to store a section #
 *
 * PA_SECTION_SHIFT		physical address to/from section number
 * PFN_SECTION_SHIFT		pfn to/from section number
 */
#define SECTIONS_SHIFT		(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)

#define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

#define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)

#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))

#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif

struct page;
struct mem_section {
	/*
	 * This is, logically, a pointer to an array of struct
	 * pages.  However, it is stored with some other magic.
	 * (see sparse.c::sparse_init_one_section())
	 *
	 * Making it a UL at least makes someone do a cast
	 * before using it wrong.
	 */
	unsigned long section_mem_map;
};

extern struct mem_section mem_section[NR_MEM_SECTIONS];

static inline struct mem_section *__nr_to_section(unsigned long nr)
{
	return &mem_section[nr];
}

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  There should be at least
 * 3 bits here due to 32-bit alignment.
 */
#define SECTION_MARKED_PRESENT	(1UL<<0)
#define SECTION_HAS_MEM_MAP	(1UL<<1)
#define SECTION_MAP_LAST_BIT	(1UL<<2)
#define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))

static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
	unsigned long map = section->section_mem_map;
	map &= SECTION_MAP_MASK;
	return (struct page *)map;
}
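
/*
 * Illustrative sketch, not part of the original header: the "other magic"
 * mentioned in struct mem_section.  For pfn_to_page() below to work by
 * simply adding the full pfn to the decoded pointer, the stored value must
 * be the section's mem_map minus the section's first pfn; the flag bits
 * live in the low bits and are masked off by __section_mem_map_addr().
 * The macro below mirrors that encoding (cf. sparse_init_one_section() in
 * mm/sparse.c, which also sets the flag bits); it is written as a macro,
 * like pfn_to_page(), because struct page is still incomplete here.  The
 * name is made up for the example.
 */
#define example_encode_mem_map(mem_map, section_nr) \
	((unsigned long)((mem_map) - section_nr_to_pfn(section_nr)))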

static inline int valid_section(struct mem_section *section)
{
	return (section->section_mem_map & SECTION_MARKED_PRESENT);
}

static inline int section_has_mem_map(struct mem_section *section)
{
	return (section->section_mem_map & SECTION_HAS_MEM_MAP);
}

static inline int valid_section_nr(unsigned long nr)
{
	return valid_section(__nr_to_section(nr));
}

/*
 * Given a kernel address, find the home node of the underlying memory.
 */
#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)

static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
	return __nr_to_section(pfn_to_section_nr(pfn));
}

#define pfn_to_page(pfn)						\
({									\
	unsigned long __pfn = (pfn);					\
	__section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn;	\
})
#define page_to_pfn(page)						\
({									\
	page - __section_mem_map_addr(__nr_to_section(			\
		page_to_section(page)));				\
})

static inline int pfn_valid(unsigned long pfn)
{
	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
		return 0;
	return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
}

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 */
#ifdef CONFIG_NUMA
#define pfn_to_nid		early_pfn_to_nid
#endif

#define pfn_to_pgdat(pfn)						\
({									\
	NODE_DATA(pfn_to_nid(pfn));					\
})

#define early_pfn_valid(pfn)	pfn_valid(pfn)
void sparse_init(void);
#else
#define sparse_init()	do {} while (0)
#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_NODES_SPAN_OTHER_NODES
#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
#else
#define early_pfn_in_nid(pfn, nid)	(1)
#endif

#ifndef early_pfn_valid
#define early_pfn_valid(pfn)	(1)
#endif

void memory_present(int nid, unsigned long start, unsigned long end);
unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);

#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* _LINUX_MMZONE_H */