mm/vmstat.c at v3.17-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / vmstat.c
at v3.17-rc2 1488 lines 36 kB view raw
   1/*
   2 *  linux/mm/vmstat.c
   3 *
   4 *  Manages VM statistics
   5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6 *
   7 *  zoned VM statistics
   8 *  Copyright (C) 2006 Silicon Graphics, Inc.,
   9 *		Christoph Lameter <christoph@lameter.com>
  10 */
  11#include <linux/fs.h>
  12#include <linux/mm.h>
  13#include <linux/err.h>
  14#include <linux/module.h>
  15#include <linux/slab.h>
  16#include <linux/cpu.h>
  17#include <linux/vmstat.h>
  18#include <linux/sched.h>
  19#include <linux/math64.h>
  20#include <linux/writeback.h>
  21#include <linux/compaction.h>
  22#include <linux/mm_inline.h>
  23
  24#include "internal.h"
  25
  26#ifdef CONFIG_VM_EVENT_COUNTERS
  27DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
  28EXPORT_PER_CPU_SYMBOL(vm_event_states);
  29
  30static void sum_vm_events(unsigned long *ret)
  31{
  32	int cpu;
  33	int i;
  34
  35	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
  36
  37	for_each_online_cpu(cpu) {
  38		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
  39
  40		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
  41			ret[i] += this->event[i];
  42	}
  43}
  44
/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 *
 * @ret: output array handed to sum_vm_events(), which fills it with one
 *       total per vm event item (NR_VM_EVENT_ITEMS entries).
 *
 * The summation is bracketed by get_online_cpus()/put_online_cpus().
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
  57
  58/*
  59 * Fold the foreign cpu events into our own.
  60 *
  61 * This is adding to the events on one processor
  62 * but keeps the global counts constant.
  63 */
  64void vm_events_fold_cpu(int cpu)
  65{
  66	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
  67	int i;
  68
  69	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
  70		count_vm_events(i, fold_state->event[i]);
  71		fold_state->event[i] = 0;
  72	}
  73}
  74
  75#endif /* CONFIG_VM_EVENT_COUNTERS */
  76
/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters: one atomic_long_t per zone stat
 * item (NR_VM_ZONE_STAT_ITEMS entries). The array is exported to modules.
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat);
  84
  85#ifdef CONFIG_SMP
  86
  87int calculate_pressure_threshold(struct zone *zone)
  88{
  89	int threshold;
  90	int watermark_distance;
  91
  92	/*
  93	 * As vmstats are not up to date, there is drift between the estimated
  94	 * and real values. For high thresholds and a high number of CPUs, it
  95	 * is possible for the min watermark to be breached while the estimated
  96	 * value looks fine. The pressure threshold is a reduced value such
  97	 * that even the maximum amount of drift will not accidentally breach
  98	 * the min watermark
  99	 */
 100	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
 101	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
 102
 103	/*
 104	 * Maximum threshold is 125
 105	 */
 106	threshold = min(125, threshold);
 107
 108	return threshold;
 109}
 110
 111int calculate_normal_threshold(struct zone *zone)
 112{
 113	int threshold;
 114	int mem;	/* memory in 128 MB units */
 115
 116	/*
 117	 * The threshold scales with the number of processors and the amount
 118	 * of memory per zone. More memory means that we can defer updates for
 119	 * longer, more processors could lead to more contention.
 120 	 * fls() is used to have a cheap way of logarithmic scaling.
 121	 *
 122	 * Some sample thresholds:
 123	 *
 124	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
 125	 * ------------------------------------------------------------------
 126	 * 8		1		1	0.9-1 GB	4
 127	 * 16		2		2	0.9-1 GB	4
 128	 * 20 		2		2	1-2 GB		5
 129	 * 24		2		2	2-4 GB		6
 130	 * 28		2		2	4-8 GB		7
 131	 * 32		2		2	8-16 GB		8
 132	 * 4		2		2	<128M		1
 133	 * 30		4		3	2-4 GB		5
 134	 * 48		4		3	8-16 GB		8
 135	 * 32		8		4	1-2 GB		4
 136	 * 32		8		4	0.9-1GB		4
 137	 * 10		16		5	<128M		1
 138	 * 40		16		5	900M		4
 139	 * 70		64		7	2-4 GB		5
 140	 * 84		64		7	4-8 GB		6
 141	 * 108		512		9	4-8 GB		6
 142	 * 125		1024		10	8-16 GB		8
 143	 * 125		1024		10	16-32 GB	9
 144	 */
 145
 146	mem = zone->managed_pages >> (27 - PAGE_SHIFT);
 147
 148	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
 149
 150	/*
 151	 * Maximum threshold is 125
 152	 */
 153	threshold = min(125, threshold);
 154
 155	return threshold;
 156}
 157
 158/*
 159 * Refresh the thresholds for each zone.
 160 */
 161void refresh_zone_stat_thresholds(void)
 162{
 163	struct zone *zone;
 164	int cpu;
 165	int threshold;
 166
 167	for_each_populated_zone(zone) {
 168		unsigned long max_drift, tolerate_drift;
 169
 170		threshold = calculate_normal_threshold(zone);
 171
 172		for_each_online_cpu(cpu)
 173			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 174							= threshold;
 175
 176		/*
 177		 * Only set percpu_drift_mark if there is a danger that
 178		 * NR_FREE_PAGES reports the low watermark is ok when in fact
 179		 * the min watermark could be breached by an allocation
 180		 */
 181		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
 182		max_drift = num_online_cpus() * threshold;
 183		if (max_drift > tolerate_drift)
 184			zone->percpu_drift_mark = high_wmark_pages(zone) +
 185					max_drift;
 186	}
 187}
 188
 189void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 190				int (*calculate_pressure)(struct zone *))
 191{
 192	struct zone *zone;
 193	int cpu;
 194	int threshold;
 195	int i;
 196
 197	for (i = 0; i < pgdat->nr_zones; i++) {
 198		zone = &pgdat->node_zones[i];
 199		if (!zone->percpu_drift_mark)
 200			continue;
 201
 202		threshold = (*calculate_pressure)(zone);
 203		for_each_online_cpu(cpu)
 204			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 205							= threshold;
 206	}
 207}
 208
/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 *
 * Adds @delta to this cpu's differential for @item of @zone. Once the
 * accumulated value leaves the range [-stat_threshold, stat_threshold]
 * it is handed to zone_page_state_add() and the differential restarts
 * at zero.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	/* candidate new value of the per-cpu differential */
	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(x > t || x < -t)) {
		/* too large to keep locally: push all of it out */
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
 233
/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */

/*
 * Increment this cpu's differential for @item of @zone by one.
 *
 * When the differential rises above stat_threshold, the differential plus
 * half the threshold is added to the zone counter and the differential is
 * set to minus half the threshold.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		/* half a threshold of extra headroom for the next updates */
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}
}
 272
/*
 * Page-based form of __inc_zone_state(): increments @item for the zone
 * that page_zone() reports for @page.
 */
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);
 278
/*
 * Decrement this cpu's differential for @item of @zone by one.
 *
 * When the differential drops below minus stat_threshold, the
 * differential minus half the threshold is added to the zone counter and
 * the differential is set to plus half the threshold.
 */
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		/* half a threshold of extra headroom for the next updates */
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}
}
 294
/*
 * Page-based form of __dec_zone_state(): decrements @item for the zone
 * that page_zone() reports for @page.
 */
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
 300
 301#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *     0       No overstepping
 *     1       Overstepping half of threshold
 *     -1      Overstepping minus half of threshold
 *
 * @zone:          zone whose counter is modified
 * @item:          which counter
 * @delta:         amount to add to the per-cpu differential
 * @overstep_mode: 0, 1 or -1 as described above
 */
static inline void mod_state(struct zone *zone,
       enum zone_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	/* o: old differential, n: new differential, t: threshold */
	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (n > t || n < -t) {
			int os = overstep_mode * (t >> 1) ;

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
		/* retry if the differential changed since it was read */
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}
 351
/*
 * Add @delta to counter @item of @zone via mod_state(), without
 * overstepping (overstep mode 0).
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	mod_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);
 358
/*
 * Increment counter @item of @zone by one via mod_state(), overstepping
 * by half the threshold (overstep mode 1).
 */
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	mod_state(zone, item, 1, 1);
}
 363
/*
 * Increment counter @item by one for the zone that page_zone() reports
 * for @page (mod_state() with overstep mode 1).
 */
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);
 369
/*
 * Decrement counter @item by one for the zone that page_zone() reports
 * for @page (mod_state() with overstep mode -1).
 */
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);
 375#else
/*
 * Use interrupt disable to serialize counter updates
 *
 * Variant used when CONFIG_HAVE_CMPXCHG_LOCAL is not set: runs
 * __mod_zone_page_state() with local interrupts saved and disabled.
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
 389
/*
 * Increment counter @item of @zone by running __inc_zone_state() with
 * local interrupts saved and disabled.
 */
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
 398
/*
 * Increment counter @item for the zone of @page. The zone is looked up
 * before interrupts are disabled; only __inc_zone_state() runs inside
 * the irq-off section.
 */
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);
 410
/*
 * Decrement counter @item for the zone of @page by running
 * __dec_zone_page_state() with local interrupts saved and disabled.
 */
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
 420#endif
 421
 422static inline void fold_diff(int *diff)
 423{
 424	int i;
 425
 426	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 427		if (diff[i])
 428			atomic_long_add(diff[i], &vm_stat[i]);
 429}
 430
/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * Per-zone differentials are moved into zone->vm_stat right away; the
 * matching global deltas are collected in a local array and applied once
 * at the end through fold_diff().
 */
static void refresh_cpu_vm_stats(void)
{
	struct zone *zone;
	int i;
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset __percpu *p = zone->pageset;

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			int v;

			/* fetch the differential and reset it in one step */
			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				__this_cpu_write(p->expire, 3);
#endif
			}
		}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor
		 *
		 * Check if there are pages remaining in this pageset
		 * if not then there is nothing to expire.
		 */
		if (!__this_cpu_read(p->expire) ||
			       !__this_cpu_read(p->pcp.count))
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			__this_cpu_write(p->expire, 0);
			continue;
		}


		/* count down; only drain once the counter reaches zero */
		if (__this_cpu_dec_return(p->expire))
			continue;

		if (__this_cpu_read(p->pcp.count))
			drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
#endif
	}
	fold_diff(global_diff);
}
 499
 500/*
 501 * Fold the data for an offline cpu into the global array.
 502 * There cannot be any access by the offline cpu and therefore
 503 * synchronization is simplified.
 504 */
 505void cpu_vm_stats_fold(int cpu)
 506{
 507	struct zone *zone;
 508	int i;
 509	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 510
 511	for_each_populated_zone(zone) {
 512		struct per_cpu_pageset *p;
 513
 514		p = per_cpu_ptr(zone->pageset, cpu);
 515
 516		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 517			if (p->vm_stat_diff[i]) {
 518				int v;
 519
 520				v = p->vm_stat_diff[i];
 521				p->vm_stat_diff[i] = 0;
 522				atomic_long_add(v, &zone->vm_stat[i]);
 523				global_diff[i] += v;
 524			}
 525	}
 526
 527	fold_diff(global_diff);
 528}
 529
 530/*
 531 * this is only called if !populated_zone(zone), which implies no other users of
 532 * pset->vm_stat_diff[] exsist.
 533 */
 534void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 535{
 536	int i;
 537
 538	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 539		if (pset->vm_stat_diff[i]) {
 540			int v = pset->vm_stat_diff[i];
 541			pset->vm_stat_diff[i] = 0;
 542			atomic_long_add(v, &zone->vm_stat[i]);
 543			atomic_long_add(v, &vm_stat[i]);
 544		}
 545}
 546#endif
 547
 548#ifdef CONFIG_NUMA
 549/*
 550 * zonelist = the list of zones passed to the allocator
 551 * z 	    = the zone from which the allocation occurred.
 552 *
 553 * Must be called with interrupts disabled.
 554 *
 555 * When __GFP_OTHER_NODE is set assume the node of the preferred
 556 * zone is the local node. This is useful for daemons who allocate
 557 * memory on behalf of other processes.
 558 */
 559void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
 560{
 561	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
 562		__inc_zone_state(z, NUMA_HIT);
 563	} else {
 564		__inc_zone_state(z, NUMA_MISS);
 565		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
 566	}
 567	if (z->node == ((flags & __GFP_OTHER_NODE) ?
 568			preferred_zone->node : numa_node_id()))
 569		__inc_zone_state(z, NUMA_LOCAL);
 570	else
 571		__inc_zone_state(z, NUMA_OTHER);
 572}
 573#endif
 574
 575#ifdef CONFIG_COMPACTION
 576
/* Free-memory summary of one zone, filled in by fill_contig_page_info(). */
struct contig_page_info {
	/* free base pages, summed over all orders */
	unsigned long free_pages;
	/* free blocks of any order */
	unsigned long free_blocks_total;
	/* free blocks of the target order, counting splits of larger ones */
	unsigned long free_blocks_suitable;
};
 582
 583/*
 584 * Calculate the number of free pages in a zone, how many contiguous
 585 * pages are free and how many are large enough to satisfy an allocation of
 586 * the target size. Note that this function makes no attempt to estimate
 587 * how many suitable free blocks there *might* be if MOVABLE pages were
 588 * migrated. Calculating that is possible, but expensive and can be
 589 * figured out from userspace
 590 */
 591static void fill_contig_page_info(struct zone *zone,
 592				unsigned int suitable_order,
 593				struct contig_page_info *info)
 594{
 595	unsigned int order;
 596
 597	info->free_pages = 0;
 598	info->free_blocks_total = 0;
 599	info->free_blocks_suitable = 0;
 600
 601	for (order = 0; order < MAX_ORDER; order++) {
 602		unsigned long blocks;
 603
 604		/* Count number of free blocks */
 605		blocks = zone->free_area[order].nr_free;
 606		info->free_blocks_total += blocks;
 607
 608		/* Count free base pages */
 609		info->free_pages += blocks << order;
 610
 611		/* Count the suitable free blocks */
 612		if (order >= suitable_order)
 613			info->free_blocks_suitable += blocks <<
 614						(order - suitable_order);
 615	}
 616}
 617
 618/*
 619 * A fragmentation index only makes sense if an allocation of a requested
 620 * size would fail. If that is true, the fragmentation index indicates
 621 * whether external fragmentation or a lack of memory was the problem.
 622 * The value can be used to determine if page reclaim or compaction
 623 * should be used
 624 */
 625static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
 626{
 627	unsigned long requested = 1UL << order;
 628
 629	if (!info->free_blocks_total)
 630		return 0;
 631
 632	/* Fragmentation index only makes sense when a request would fail */
 633	if (info->free_blocks_suitable)
 634		return -1000;
 635
 636	/*
 637	 * Index is between 0 and 1 so return within 3 decimal places
 638	 *
 639	 * 0 => allocation would fail due to lack of memory
 640	 * 1 => allocation would fail due to fragmentation
 641	 */
 642	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
 643}
 644
/*
 * Same as __fragmentation_index() but allocs contig_page_info on stack:
 * the summary for @zone and @order is gathered with
 * fill_contig_page_info() and the resulting index is returned.
 */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
 653#endif
 654
 655#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
 656#include <linux/proc_fs.h>
 657#include <linux/seq_file.h>
 658
/*
 * Printable names of the migrate types, indexed by migrate type.
 * The CMA and Isolate entries exist only when the matching config
 * option is enabled.
 */
static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};
 671
 672static void *frag_start(struct seq_file *m, loff_t *pos)
 673{
 674	pg_data_t *pgdat;
 675	loff_t node = *pos;
 676	for (pgdat = first_online_pgdat();
 677	     pgdat && node;
 678	     pgdat = next_online_pgdat(pgdat))
 679		--node;
 680
 681	return pgdat;
 682}
 683
 684static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
 685{
 686	pg_data_t *pgdat = (pg_data_t *)arg;
 687
 688	(*pos)++;
 689	return next_online_pgdat(pgdat);
 690}
 691
/* seq_file ->stop: nothing to release, so this is intentionally empty. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
 695
 696/* Walk all the zones in a node and print using a callback */
 697static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
 698		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
 699{
 700	struct zone *zone;
 701	struct zone *node_zones = pgdat->node_zones;
 702	unsigned long flags;
 703
 704	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
 705		if (!populated_zone(zone))
 706			continue;
 707
 708		spin_lock_irqsave(&zone->lock, flags);
 709		print(m, pgdat, zone);
 710		spin_unlock_irqrestore(&zone->lock, flags);
 711	}
 712}
 713#endif
 714
 715#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
/*
 * Helpers that expand a counter name prefix into one string literal per
 * configured zone type. Each TEXT_FOR_* macro yields `prefix "_zone",`
 * (including the trailing comma) when its zone type is configured and
 * nothing otherwise.
 */
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

/*
 * Full list for one prefix: optional dma/dma32, then normal, optional
 * high, then movable. The expansion ends with a comma so it can be
 * dropped straight into an array initializer.
 */
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",
 736
/*
 * Names of the statistics, in three consecutive groups: the zoned VM
 * counters, two dirty-threshold entries, and (with
 * CONFIG_VM_EVENT_COUNTERS) the vm event counters. zoneinfo_show_print()
 * indexes the first NR_VM_ZONE_STAT_ITEMS entries by zone stat item, so
 * the order here has to follow the corresponding enums.
 */
const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_alloc_batch",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_mlock",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_kernel_stack",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_writeback_temp",
	"nr_isolated_anon",
	"nr_isolated_file",
	"nr_shmem",
	"nr_dirtied",
	"nr_written",
	"nr_pages_scanned",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif
	"workingset_refault",
	"workingset_activate",
	"workingset_nodereclaim",
	"nr_anon_transparent_hugepages",
	"nr_free_cma",
	/* Writeback thresholds */
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#ifdef CONFIG_VM_EVENT_COUNTERS
	/* VM event counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal_kswapd")
	TEXTS_FOR_ZONES("pgsteal_direct")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")
	"pgscan_direct_throttle",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",
	"allocstall",

	"pgrotated",

	"drop_pagecache",
	"drop_slab",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_huge_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_split",
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
#endif
#ifdef CONFIG_DEBUG_TLBFLUSH
#ifdef CONFIG_SMP
	"nr_tlb_remote_flush",
	"nr_tlb_remote_flush_received",
#endif /* CONFIG_SMP */
	"nr_tlb_local_flush_all",
	"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */

#ifdef CONFIG_DEBUG_VM_VMACACHE
	"vmacache_find_calls",
	"vmacache_find_hits",
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS */
};
 878#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
 879
 880
 881#ifdef CONFIG_PROC_FS
 882static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 883						struct zone *zone)
 884{
 885	int order;
 886
 887	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
 888	for (order = 0; order < MAX_ORDER; ++order)
 889		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
 890	seq_putc(m, '\n');
 891}
 892
 893/*
 894 * This walks the free areas for each zone.
 895 */
 896static int frag_show(struct seq_file *m, void *arg)
 897{
 898	pg_data_t *pgdat = (pg_data_t *)arg;
 899	walk_zones_in_node(m, pgdat, frag_show_print);
 900	return 0;
 901}
 902
 903static void pagetypeinfo_showfree_print(struct seq_file *m,
 904					pg_data_t *pgdat, struct zone *zone)
 905{
 906	int order, mtype;
 907
 908	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
 909		seq_printf(m, "Node %4d, zone %8s, type %12s ",
 910					pgdat->node_id,
 911					zone->name,
 912					migratetype_names[mtype]);
 913		for (order = 0; order < MAX_ORDER; ++order) {
 914			unsigned long freecount = 0;
 915			struct free_area *area;
 916			struct list_head *curr;
 917
 918			area = &(zone->free_area[order]);
 919
 920			list_for_each(curr, &area->free_list[mtype])
 921				freecount++;
 922			seq_printf(m, "%6lu ", freecount);
 923		}
 924		seq_putc(m, '\n');
 925	}
 926}
 927
 928/* Print out the free pages at each order for each migatetype */
 929static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
 930{
 931	int order;
 932	pg_data_t *pgdat = (pg_data_t *)arg;
 933
 934	/* Print header */
 935	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
 936	for (order = 0; order < MAX_ORDER; ++order)
 937		seq_printf(m, "%6d ", order);
 938	seq_putc(m, '\n');
 939
 940	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
 941
 942	return 0;
 943}
 944
 945static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 946					pg_data_t *pgdat, struct zone *zone)
 947{
 948	int mtype;
 949	unsigned long pfn;
 950	unsigned long start_pfn = zone->zone_start_pfn;
 951	unsigned long end_pfn = zone_end_pfn(zone);
 952	unsigned long count[MIGRATE_TYPES] = { 0, };
 953
 954	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
 955		struct page *page;
 956
 957		if (!pfn_valid(pfn))
 958			continue;
 959
 960		page = pfn_to_page(pfn);
 961
 962		/* Watch for unexpected holes punched in the memmap */
 963		if (!memmap_valid_within(pfn, page, zone))
 964			continue;
 965
 966		mtype = get_pageblock_migratetype(page);
 967
 968		if (mtype < MIGRATE_TYPES)
 969			count[mtype]++;
 970	}
 971
 972	/* Print counts */
 973	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
 974	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
 975		seq_printf(m, "%12lu ", count[mtype]);
 976	seq_putc(m, '\n');
 977}
 978
 979/* Print out the free pages at each order for each migratetype */
 980static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
 981{
 982	int mtype;
 983	pg_data_t *pgdat = (pg_data_t *)arg;
 984
 985	seq_printf(m, "\n%-23s", "Number of blocks type ");
 986	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
 987		seq_printf(m, "%12s ", migratetype_names[mtype]);
 988	seq_putc(m, '\n');
 989	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
 990
 991	return 0;
 992}
 993
 994/*
 995 * This prints out statistics in relation to grouping pages by mobility.
 996 * It is expensive to collect so do not constantly read the file.
 997 */
 998static int pagetypeinfo_show(struct seq_file *m, void *arg)
 999{
1000	pg_data_t *pgdat = (pg_data_t *)arg;
1001
1002	/* check memoryless node */
1003	if (!node_state(pgdat->node_id, N_MEMORY))
1004		return 0;
1005
1006	seq_printf(m, "Page block order: %d\n", pageblock_order);
1007	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1008	seq_putc(m, '\n');
1009	pagetypeinfo_showfree(m, pgdat);
1010	pagetypeinfo_showblockcount(m, pgdat);
1011
1012	return 0;
1013}
1014
/* seq_file iterator: one record per online node, shown by frag_show(). */
static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
1021
/* ->open: attach the fragmentation_op iterator to @file via seq_open(). */
static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}
1026
/* File operations backed by the generic seq_file read/seek/release. */
static const struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1033
/*
 * seq_file iterator: reuses the per-node frag_* walkers and shows each
 * node with pagetypeinfo_show().
 */
static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};
1040
/* ->open: attach the pagetypeinfo_op iterator to @file via seq_open(). */
static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &pagetypeinfo_op);
}
1045
/* File operations backed by the generic seq_file read/seek/release. */
static const struct file_operations pagetypeinfo_file_ops = {
	.open		= pagetypeinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1052
/*
 * Print the detailed report for one zone: watermarks and size fields,
 * every zone stat counter, the lowmem_reserve ("protection") array, the
 * per-cpu pageset settings of each online cpu, and a few trailing
 * zone attributes.
 */
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        scanned  %lu"
		   "\n        spanned  %lu"
		   "\n        present  %lu"
		   "\n        managed  %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone_page_state(zone, NR_PAGES_SCANNED),
		   zone->spanned_pages,
		   zone->present_pages,
		   zone->managed_pages);

	/* one "name value" line per zone stat item, named via vmstat_text[] */
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	/* lowmem_reserve[] as a comma separated, parenthesized list */
	seq_printf(m,
		   "\n        protection: (%ld",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n  pagesets");
	/* i is reused as the cpu number from here on */
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  all_unreclaimable: %u"
		   "\n  start_pfn:         %lu"
		   "\n  inactive_ratio:    %u",
		   !zone_reclaimable(zone),
		   zone->zone_start_pfn,
		   zone->inactive_ratio);
	seq_putc(m, '\n');
}
1115
1116/*
1117 * Output information about zones in @pgdat.
1118 */
1119static int zoneinfo_show(struct seq_file *m, void *arg)
1120{
1121	pg_data_t *pgdat = (pg_data_t *)arg;
1122	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
1123	return 0;
1124}
1125
1126static const struct seq_operations zoneinfo_op = {
1127	.start	= frag_start, /* iterate over all zones. The same as in
1128			       * fragmentation. */
1129	.next	= frag_next,
1130	.stop	= frag_stop,
1131	.show	= zoneinfo_show,
1132};
1133
1134static int zoneinfo_open(struct inode *inode, struct file *file)
1135{
1136	return seq_open(file, &zoneinfo_op);
1137}
1138
1139static const struct file_operations proc_zoneinfo_file_operations = {
1140	.open		= zoneinfo_open,
1141	.read		= seq_read,
1142	.llseek		= seq_lseek,
1143	.release	= seq_release,
1144};
1145
/*
 * Slots of the writeback section that vmstat_start() places directly after
 * the zone counters in its snapshot buffer.
 */
enum writeback_stat_item {
	NR_DIRTY_THRESHOLD = 0,		/* second value from global_dirty_limits() */
	NR_DIRTY_BG_THRESHOLD,		/* first value from global_dirty_limits() */
	NR_VM_WRITEBACK_STAT_ITEMS,	/* number of slots, not a slot itself */
};
1151
1152static void *vmstat_start(struct seq_file *m, loff_t *pos)
1153{
1154	unsigned long *v;
1155	int i, stat_items_size;
1156
1157	if (*pos >= ARRAY_SIZE(vmstat_text))
1158		return NULL;
1159	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
1160			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
1161
1162#ifdef CONFIG_VM_EVENT_COUNTERS
1163	stat_items_size += sizeof(struct vm_event_state);
1164#endif
1165
1166	v = kmalloc(stat_items_size, GFP_KERNEL);
1167	m->private = v;
1168	if (!v)
1169		return ERR_PTR(-ENOMEM);
1170	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1171		v[i] = global_page_state(i);
1172	v += NR_VM_ZONE_STAT_ITEMS;
1173
1174	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1175			    v + NR_DIRTY_THRESHOLD);
1176	v += NR_VM_WRITEBACK_STAT_ITEMS;
1177
1178#ifdef CONFIG_VM_EVENT_COUNTERS
1179	all_vm_events(v);
1180	v[PGPGIN] /= 2;		/* sectors -> kbytes */
1181	v[PGPGOUT] /= 2;
1182#endif
1183	return (unsigned long *)m->private + *pos;
1184}
1185
1186static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1187{
1188	(*pos)++;
1189	if (*pos >= ARRAY_SIZE(vmstat_text))
1190		return NULL;
1191	return (unsigned long *)m->private + *pos;
1192}
1193
1194static int vmstat_show(struct seq_file *m, void *arg)
1195{
1196	unsigned long *l = arg;
1197	unsigned long off = l - (unsigned long *)m->private;
1198
1199	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
1200	return 0;
1201}
1202
1203static void vmstat_stop(struct seq_file *m, void *arg)
1204{
1205	kfree(m->private);
1206	m->private = NULL;
1207}
1208
1209static const struct seq_operations vmstat_op = {
1210	.start	= vmstat_start,
1211	.next	= vmstat_next,
1212	.stop	= vmstat_stop,
1213	.show	= vmstat_show,
1214};
1215
1216static int vmstat_open(struct inode *inode, struct file *file)
1217{
1218	return seq_open(file, &vmstat_op);
1219}
1220
1221static const struct file_operations proc_vmstat_file_operations = {
1222	.open		= vmstat_open,
1223	.read		= seq_read,
1224	.llseek		= seq_lseek,
1225	.release	= seq_release,
1226};
1227#endif /* CONFIG_PROC_FS */
1228
1229#ifdef CONFIG_SMP
1230static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1231int sysctl_stat_interval __read_mostly = HZ;
1232
1233static void vmstat_update(struct work_struct *w)
1234{
1235	refresh_cpu_vm_stats();
1236	schedule_delayed_work(this_cpu_ptr(&vmstat_work),
1237		round_jiffies_relative(sysctl_stat_interval));
1238}
1239
1240static void start_cpu_timer(int cpu)
1241{
1242	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
1243
1244	INIT_DEFERRABLE_WORK(work, vmstat_update);
1245	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1246}
1247
1248static void vmstat_cpu_dead(int node)
1249{
1250	int cpu;
1251
1252	get_online_cpus();
1253	for_each_online_cpu(cpu)
1254		if (cpu_to_node(cpu) == node)
1255			goto end;
1256
1257	node_clear_state(node, N_CPU);
1258end:
1259	put_online_cpus();
1260}
1261
1262/*
1263 * Use the cpu notifier to insure that the thresholds are recalculated
1264 * when necessary.
1265 */
1266static int vmstat_cpuup_callback(struct notifier_block *nfb,
1267		unsigned long action,
1268		void *hcpu)
1269{
1270	long cpu = (long)hcpu;
1271
1272	switch (action) {
1273	case CPU_ONLINE:
1274	case CPU_ONLINE_FROZEN:
1275		refresh_zone_stat_thresholds();
1276		start_cpu_timer(cpu);
1277		node_set_state(cpu_to_node(cpu), N_CPU);
1278		break;
1279	case CPU_DOWN_PREPARE:
1280	case CPU_DOWN_PREPARE_FROZEN:
1281		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1282		per_cpu(vmstat_work, cpu).work.func = NULL;
1283		break;
1284	case CPU_DOWN_FAILED:
1285	case CPU_DOWN_FAILED_FROZEN:
1286		start_cpu_timer(cpu);
1287		break;
1288	case CPU_DEAD:
1289	case CPU_DEAD_FROZEN:
1290		refresh_zone_stat_thresholds();
1291		vmstat_cpu_dead(cpu_to_node(cpu));
1292		break;
1293	default:
1294		break;
1295	}
1296	return NOTIFY_OK;
1297}
1298
1299static struct notifier_block vmstat_notifier =
1300	{ &vmstat_cpuup_callback, NULL, 0 };
1301#endif
1302
1303static int __init setup_vmstat(void)
1304{
1305#ifdef CONFIG_SMP
1306	int cpu;
1307
1308	cpu_notifier_register_begin();
1309	__register_cpu_notifier(&vmstat_notifier);
1310
1311	for_each_online_cpu(cpu) {
1312		start_cpu_timer(cpu);
1313		node_set_state(cpu_to_node(cpu), N_CPU);
1314	}
1315	cpu_notifier_register_done();
1316#endif
1317#ifdef CONFIG_PROC_FS
1318	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
1319	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
1320	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
1321	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
1322#endif
1323	return 0;
1324}
1325module_init(setup_vmstat)
1326
1327#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1328#include <linux/debugfs.h>
1329
1330
1331/*
1332 * Return an index indicating how much of the available free memory is
1333 * unusable for an allocation of the requested size.
1334 */
1335static int unusable_free_index(unsigned int order,
1336				struct contig_page_info *info)
1337{
1338	/* No free memory is interpreted as all free memory is unusable */
1339	if (info->free_pages == 0)
1340		return 1000;
1341
1342	/*
1343	 * Index should be a value between 0 and 1. Return a value to 3
1344	 * decimal places.
1345	 *
1346	 * 0 => no fragmentation
1347	 * 1 => high fragmentation
1348	 */
1349	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1350
1351}
1352
1353static void unusable_show_print(struct seq_file *m,
1354					pg_data_t *pgdat, struct zone *zone)
1355{
1356	unsigned int order;
1357	int index;
1358	struct contig_page_info info;
1359
1360	seq_printf(m, "Node %d, zone %8s ",
1361				pgdat->node_id,
1362				zone->name);
1363	for (order = 0; order < MAX_ORDER; ++order) {
1364		fill_contig_page_info(zone, order, &info);
1365		index = unusable_free_index(order, &info);
1366		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1367	}
1368
1369	seq_putc(m, '\n');
1370}
1371
1372/*
1373 * Display unusable free space index
1374 *
1375 * The unusable free space index measures how much of the available free
1376 * memory cannot be used to satisfy an allocation of a given size and is a
1377 * value between 0 and 1. The higher the value, the more of free memory is
1378 * unusable and by implication, the worse the external fragmentation is. This
1379 * can be expressed as a percentage by multiplying by 100.
1380 */
1381static int unusable_show(struct seq_file *m, void *arg)
1382{
1383	pg_data_t *pgdat = (pg_data_t *)arg;
1384
1385	/* check memoryless node */
1386	if (!node_state(pgdat->node_id, N_MEMORY))
1387		return 0;
1388
1389	walk_zones_in_node(m, pgdat, unusable_show_print);
1390
1391	return 0;
1392}
1393
1394static const struct seq_operations unusable_op = {
1395	.start	= frag_start,
1396	.next	= frag_next,
1397	.stop	= frag_stop,
1398	.show	= unusable_show,
1399};
1400
1401static int unusable_open(struct inode *inode, struct file *file)
1402{
1403	return seq_open(file, &unusable_op);
1404}
1405
1406static const struct file_operations unusable_file_ops = {
1407	.open		= unusable_open,
1408	.read		= seq_read,
1409	.llseek		= seq_lseek,
1410	.release	= seq_release,
1411};
1412
1413static void extfrag_show_print(struct seq_file *m,
1414					pg_data_t *pgdat, struct zone *zone)
1415{
1416	unsigned int order;
1417	int index;
1418
1419	/* Alloc on stack as interrupts are disabled for zone walk */
1420	struct contig_page_info info;
1421
1422	seq_printf(m, "Node %d, zone %8s ",
1423				pgdat->node_id,
1424				zone->name);
1425	for (order = 0; order < MAX_ORDER; ++order) {
1426		fill_contig_page_info(zone, order, &info);
1427		index = __fragmentation_index(order, &info);
1428		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1429	}
1430
1431	seq_putc(m, '\n');
1432}
1433
1434/*
1435 * Display fragmentation index for orders that allocations would fail for
1436 */
1437static int extfrag_show(struct seq_file *m, void *arg)
1438{
1439	pg_data_t *pgdat = (pg_data_t *)arg;
1440
1441	walk_zones_in_node(m, pgdat, extfrag_show_print);
1442
1443	return 0;
1444}
1445
1446static const struct seq_operations extfrag_op = {
1447	.start	= frag_start,
1448	.next	= frag_next,
1449	.stop	= frag_stop,
1450	.show	= extfrag_show,
1451};
1452
1453static int extfrag_open(struct inode *inode, struct file *file)
1454{
1455	return seq_open(file, &extfrag_op);
1456}
1457
1458static const struct file_operations extfrag_file_ops = {
1459	.open		= extfrag_open,
1460	.read		= seq_read,
1461	.llseek		= seq_lseek,
1462	.release	= seq_release,
1463};
1464
1465static int __init extfrag_debug_init(void)
1466{
1467	struct dentry *extfrag_debug_root;
1468
1469	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1470	if (!extfrag_debug_root)
1471		return -ENOMEM;
1472
1473	if (!debugfs_create_file("unusable_index", 0444,
1474			extfrag_debug_root, NULL, &unusable_file_ops))
1475		goto fail;
1476
1477	if (!debugfs_create_file("extfrag_index", 0444,
1478			extfrag_debug_root, NULL, &extfrag_file_ops))
1479		goto fail;
1480
1481	return 0;
1482fail:
1483	debugfs_remove_recursive(extfrag_debug_root);
1484	return -ENOMEM;
1485}
1486
1487module_init(extfrag_debug_init);
1488#endif