/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
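
/*
 * Worked example (hypothetical numbers, for illustration only): for a zone
 * with a min watermark of 768 pages, a low watermark of 1024 pages and 8
 * online CPUs, watermark_distance is 256 and the pressure threshold comes
 * out as max(1, 256 / 8) = 32, comfortably below the 125 cap.
 */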

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->managed_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
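
/*
 * Worked example (hypothetical configuration): with 2 online CPUs and a
 * zone managing about 1.5 GB (mem = 12 in 128 MB units), the result is
 * 2 * fls(2) * (1 + fls(12)) = 2 * 2 * (1 + 4) = 20, which matches the
 * "2 processors, 1-2 GB" row of the table above.
 */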

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_populated_zone(zone) {
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_possible_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(x > t || x < -t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
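
/*
 * Usage sketch (illustrative, not taken from this file): a caller that
 * already runs with interrupts disabled, e.g. while removing a page from
 * the page cache, can adjust a counter directly:
 *
 *	__mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -1);
 *
 * The delta is accumulated in the per-cpu differential and only folded
 * into the zone and global counters once stat_threshold is exceeded.
 */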

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < -t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *       0       No overstepping
 *       1       Overstepping half of threshold
 *      -1       Overstepping minus half of threshold
 */
static inline void mod_state(struct zone *zone,
	enum zone_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyway
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (n > t || n < -t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}
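
/*
 * Worked example (hypothetical values): with stat_threshold t = 32, a
 * per-cpu differential of 30 and a delta of +5, n = 35 exceeds t. With
 * overstep_mode 1 the overstep is t >> 1 = 16, so z = 51 is folded into
 * the zone and global counters and the per-cpu differential is set to
 * -16, leaving headroom for further increments.
 */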

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	mod_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	mod_state(zone, item, 1, 1);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
#endif

/*
 * Update the zone counters for one cpu.
 *
 * The cpu specified must be either the current cpu or a processor that
 * is not online. If it is the current cpu then the execution thread must
 * be pinned to the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				unsigned long flags;
				int v;

				local_irq_save(flags);
				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				p->expire = 3;
#endif
			}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor
		 *
		 * Check if there are pages remaining in this pageset
		 * if not then there is nothing to expire.
		 */
		if (!p->expire || !p->pcp.count)
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			p->expire = 0;
			continue;
		}

		p->expire--;
		if (p->expire)
			continue;

		if (p->pcp.count)
			drain_zone_pages(zone, &p->pcp);
#endif
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (global_diff[i])
			atomic_long_add(global_diff[i], &vm_stat[i]);
}

/*
 * This is only called if !populated_zone(zone), which implies no other users of
 * pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (pset->vm_stat_diff[i]) {
			int v = pset->vm_stat_diff[i];
			pset->vm_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_stat[i]);
			atomic_long_add(v, &vm_stat[i]);
		}
}
#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z	    = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 *
 * When __GFP_OTHER_NODE is set assume the node of the preferred
 * zone is the local node. This is useful for daemons who allocate
 * memory on behalf of other processes.
 */
void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
{
	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
	}
	if (z->node == ((flags & __GFP_OTHER_NODE) ?
			preferred_zone->node : numa_node_id()))
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks;

		/* Count number of free blocks */
		blocks = zone->free_area[order].nr_free;
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64((1000 + (div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
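
/*
 * Worked example (hypothetical values): for an order-3 request (8 pages)
 * against a zone with free_pages = 1000, free_blocks_total = 250 and no
 * suitable blocks, the index is
 * 1000 - (1000 + 1000 * 1000 / 8) / 250 = 1000 - 504 = 496, i.e. 0.496,
 * suggesting lack of memory and fragmentation contribute about equally.
 */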

/* Same as __fragmentation_index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",

const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_mlock",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_kernel_stack",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_writeback_temp",
	"nr_isolated_anon",
	"nr_isolated_file",
	"nr_shmem",
	"nr_dirtied",
	"nr_written",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif
	"nr_anon_transparent_hugepages",
	"nr_free_cma",
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal_kswapd")
	TEXTS_FOR_ZONES("pgsteal_direct")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")
	"pgscan_direct_throttle",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",
	"allocstall",

	"pgrotated",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_split",
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
#endif

#endif /* CONFIG_VM_EVENT_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */


#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype])
				freecount++;
			seq_printf(m, "%6lu ", freecount);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);

		/* Watch for unexpected holes punched in the memmap */
		if (!memmap_valid_within(pfn, page, zone))
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

	return 0;
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}

static const struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &pagetypeinfo_op);
}

static const struct file_operations pagetypeinfo_file_ops = {
	.open		= pagetypeinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        scanned  %lu"
		   "\n        spanned  %lu"
		   "\n        present  %lu"
		   "\n        managed  %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->pages_scanned,
		   zone->spanned_pages,
		   zone->present_pages,
		   zone->managed_pages);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	seq_printf(m,
		   "\n        protection: (%lu",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n  pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  all_unreclaimable: %u"
		   "\n  start_pfn:         %lu"
		   "\n  inactive_ratio:    %u",
		   zone->all_unreclaimable,
		   zone->zone_start_pfn,
		   zone->inactive_ratio);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
	return 0;
}

static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static int zoneinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &zoneinfo_op);
}

static const struct file_operations proc_zoneinfo_file_operations = {
	.open		= zoneinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

enum writeback_stat_item {
	NR_DIRTY_THRESHOLD,
	NR_DIRTY_BG_THRESHOLD,
	NR_VM_WRITEBACK_STAT_ITEMS,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	int i, stat_items_size;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);

#ifdef CONFIG_VM_EVENT_COUNTERS
	stat_items_size += sizeof(struct vm_event_state);
#endif

	v = kmalloc(stat_items_size, GFP_KERNEL);
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
	v += NR_VM_ZONE_STAT_ITEMS;

	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
			    v + NR_DIRTY_THRESHOLD);
	v += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(v);
	v[PGPGIN] /= 2;		/* sectors -> kbytes */
	v[PGPGOUT] /= 2;
#endif
	return (unsigned long *)m->private + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

static int vmstat_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &vmstat_op);
}

static const struct file_operations proc_vmstat_file_operations = {
	.open		= vmstat_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

static void vmstat_update(struct work_struct *w)
{
	refresh_cpu_vm_stats(smp_processor_id());
	schedule_delayed_work(&__get_cpu_var(vmstat_work),
		round_jiffies_relative(sysctl_stat_interval));
}

static void start_cpu_timer(int cpu)
{
	struct delayed_work *work = &per_cpu(vmstat_work, cpu);

	INIT_DEFERRABLE_WORK(work, vmstat_update);
	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}

/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		refresh_zone_stat_thresholds();
		start_cpu_timer(cpu);
		node_set_state(cpu_to_node(cpu), N_CPU);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
		per_cpu(vmstat_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };
#endif

static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
	int cpu;

	register_cpu_notifier(&vmstat_notifier);

	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
	return 0;
}
module_init(setup_vmstat)

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
#include <linux/debugfs.h>


/*
 * Return an index indicating how much of the available free memory is
 * unusable for an allocation of the requested size.
 */
static int unusable_free_index(unsigned int order,
				struct contig_page_info *info)
{
	/* No free memory is interpreted as all free memory is unusable */
	if (info->free_pages == 0)
		return 1000;

	/*
	 * Index should be a value between 0 and 1. Return a value to 3
	 * decimal places.
	 *
	 * 0 => no fragmentation
	 * 1 => high fragmentation
	 */
	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);

}
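
/*
 * Worked example (hypothetical values): for an order-4 request (16 pages)
 * in a zone with free_pages = 1000 of which 20 free blocks are order-4 or
 * larger, the usable portion is 20 << 4 = 320 pages, so the index is
 * (1000 - 320) * 1000 / 1000 = 680, i.e. 0.680 of the free memory cannot
 * be used for an allocation of that size.
 */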

static void unusable_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = unusable_free_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display unusable free space index
 *
 * The unusable free space index measures how much of the available free
 * memory cannot be used to satisfy an allocation of a given size and is a
 * value between 0 and 1. The higher the value, the more of free memory is
 * unusable and by implication, the worse the external fragmentation is. This
 * can be expressed as a percentage by multiplying by 100.
 */
static int unusable_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	walk_zones_in_node(m, pgdat, unusable_show_print);

	return 0;
}

static const struct seq_operations unusable_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= unusable_show,
};

static int unusable_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &unusable_op);
}

static const struct file_operations unusable_file_ops = {
	.open		= unusable_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void extfrag_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;

	/* Alloc on stack as interrupts are disabled for zone walk */
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = __fragmentation_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display fragmentation index for orders that allocations would fail for
 */
static int extfrag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	walk_zones_in_node(m, pgdat, extfrag_show_print);

	return 0;
}

static const struct seq_operations extfrag_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= extfrag_show,
};

static int extfrag_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &extfrag_op);
}

static const struct file_operations extfrag_file_ops = {
	.open		= extfrag_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init extfrag_debug_init(void)
{
	struct dentry *extfrag_debug_root;

	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
	if (!extfrag_debug_root)
		return -ENOMEM;

	if (!debugfs_create_file("unusable_index", 0444,
			extfrag_debug_root, NULL, &unusable_file_ops))
		goto fail;

	if (!debugfs_create_file("extfrag_index", 0444,
			extfrag_debug_root, NULL, &extfrag_file_ops))
		goto fail;

	return 0;
fail:
	debugfs_remove_recursive(extfrag_debug_root);
	return -ENOMEM;
}

module_init(extfrag_debug_init);
#endif