Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: fix Committed_AS underflow on large NR_CPUS environment

The Committed_AS field can underflow in certain situations:

> # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c
> 1 Committed_AS: 18446744073709323392 kB
> 11 Committed_AS: 18446744073709455488 kB
> 6 Committed_AS: 35136 kB
> 5 Committed_AS: 18446744073709454400 kB
> 7 Committed_AS: 35904 kB
> 3 Committed_AS: 18446744073709453248 kB
> 2 Committed_AS: 34752 kB
> 9 Committed_AS: 18446744073709453248 kB
> 8 Committed_AS: 34752 kB
> 3 Committed_AS: 18446744073709320960 kB
> 7 Committed_AS: 18446744073709454080 kB
> 3 Committed_AS: 18446744073709320960 kB
> 5 Committed_AS: 18446744073709454080 kB
> 6 Committed_AS: 18446744073709320960 kB

Because NR_CPUS can be greater than 1000 and meminfo_proc_show() does
not check for underflow.

But a calculation proportional to NR_CPUS isn't a good one. In general,
the possibility of lock contention is proportional to the number of online
CPUs, not the theoretical maximum number of CPUs (NR_CPUS).

The current kernel has generic percpu-counter infrastructure; using it is
the right way. It simplifies the code, and percpu_counter_read_positive()
doesn't suffer from the underflow issue.

Reported-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Eric B Munson <ebmunson@us.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: <stable@kernel.org> [All kernel versions]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

KOSAKI Motohiro and committed by
Linus Torvalds
00a62ce9 0763ed23

+17 -65
+1 -1
fs/proc/meminfo.c
··· 35 35 #define K(x) ((x) << (PAGE_SHIFT - 10)) 36 36 si_meminfo(&i); 37 37 si_swapinfo(&i); 38 - committed = atomic_long_read(&vm_committed_space); 38 + committed = percpu_counter_read_positive(&vm_committed_as); 39 39 allowed = ((totalram_pages - hugetlb_total_pages()) 40 40 * sysctl_overcommit_ratio / 100) + total_swap_pages; 41 41
+3 -6
include/linux/mman.h
··· 12 12 13 13 #ifdef __KERNEL__ 14 14 #include <linux/mm.h> 15 + #include <linux/percpu_counter.h> 15 16 16 17 #include <asm/atomic.h> 17 18 18 19 extern int sysctl_overcommit_memory; 19 20 extern int sysctl_overcommit_ratio; 20 - extern atomic_long_t vm_committed_space; 21 + extern struct percpu_counter vm_committed_as; 21 22 22 - #ifdef CONFIG_SMP 23 - extern void vm_acct_memory(long pages); 24 - #else 25 23 static inline void vm_acct_memory(long pages) 26 24 { 27 - atomic_long_add(pages, &vm_committed_space); 25 + percpu_counter_add(&vm_committed_as, pages); 28 26 } 29 - #endif 30 27 31 28 static inline void vm_unacct_memory(long pages) 32 29 {
+6 -6
mm/mmap.c
··· 85 85 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 86 86 int sysctl_overcommit_ratio = 50; /* default is 50% */ 87 87 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 88 - atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 88 + struct percpu_counter vm_committed_as; 89 89 90 90 /* 91 91 * Check that a process has enough memory to allocate a new virtual ··· 179 179 if (mm) 180 180 allowed -= mm->total_vm / 32; 181 181 182 - /* 183 - * cast `allowed' as a signed long because vm_committed_space 184 - * sometimes has a negative value 185 - */ 186 - if (atomic_long_read(&vm_committed_space) < (long)allowed) 182 + if (percpu_counter_read_positive(&vm_committed_as) < allowed) 187 183 return 0; 188 184 error: 189 185 vm_unacct_memory(pages); ··· 2477 2481 */ 2478 2482 void __init mmap_init(void) 2479 2483 { 2484 + int ret; 2485 + 2486 + ret = percpu_counter_init(&vm_committed_as, 0); 2487 + VM_BUG_ON(ret); 2480 2488 }
+7 -6
mm/nommu.c
··· 62 62 struct page *mem_map; 63 63 unsigned long max_mapnr; 64 64 unsigned long num_physpages; 65 - atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 65 + struct percpu_counter vm_committed_as; 66 66 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 67 67 int sysctl_overcommit_ratio = 50; /* default is 50% */ 68 68 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; ··· 463 463 */ 464 464 void __init mmap_init(void) 465 465 { 466 + int ret; 467 + 468 + ret = percpu_counter_init(&vm_committed_as, 0); 469 + VM_BUG_ON(ret); 466 470 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 467 471 } 468 472 ··· 1851 1847 if (mm) 1852 1848 allowed -= mm->total_vm / 32; 1853 1849 1854 - /* 1855 - * cast `allowed' as a signed long because vm_committed_space 1856 - * sometimes has a negative value 1857 - */ 1858 - if (atomic_long_read(&vm_committed_space) < (long)allowed) 1850 + if (percpu_counter_read_positive(&vm_committed_as) < allowed) 1859 1851 return 0; 1852 + 1860 1853 error: 1861 1854 vm_unacct_memory(pages); 1862 1855
-46
mm/swap.c
··· 491 491 492 492 EXPORT_SYMBOL(pagevec_lookup_tag); 493 493 494 - #ifdef CONFIG_SMP 495 - /* 496 - * We tolerate a little inaccuracy to avoid ping-ponging the counter between 497 - * CPUs 498 - */ 499 - #define ACCT_THRESHOLD max(16, NR_CPUS * 2) 500 - 501 - static DEFINE_PER_CPU(long, committed_space); 502 - 503 - void vm_acct_memory(long pages) 504 - { 505 - long *local; 506 - 507 - preempt_disable(); 508 - local = &__get_cpu_var(committed_space); 509 - *local += pages; 510 - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { 511 - atomic_long_add(*local, &vm_committed_space); 512 - *local = 0; 513 - } 514 - preempt_enable(); 515 - } 516 - 517 - #ifdef CONFIG_HOTPLUG_CPU 518 - 519 - /* Drop the CPU's cached committed space back into the central pool. */ 520 - static int cpu_swap_callback(struct notifier_block *nfb, 521 - unsigned long action, 522 - void *hcpu) 523 - { 524 - long *committed; 525 - 526 - committed = &per_cpu(committed_space, (long)hcpu); 527 - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 528 - atomic_long_add(*committed, &vm_committed_space); 529 - *committed = 0; 530 - drain_cpu_pagevecs((long)hcpu); 531 - } 532 - return NOTIFY_OK; 533 - } 534 - #endif /* CONFIG_HOTPLUG_CPU */ 535 - #endif /* CONFIG_SMP */ 536 - 537 494 /* 538 495 * Perform any setup for the swap system 539 496 */ ··· 511 554 * Right now other parts of the system means that we 512 555 * _really_ don't want to cluster much more 513 556 */ 514 - #ifdef CONFIG_HOTPLUG_CPU 515 - hotcpu_notifier(cpu_swap_callback, 0); 516 - #endif 517 557 }