Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] Reduce size of huge boot per_cpu_pageset

Reduce size of the huge per_cpu_pageset structure in __initdata introduced
into mm1 with the pageset localization patchset. Use one specially
configured pageset per cpu for all zones and nodes during bootup.

- Avoid duplication of pageset initialization code.
- Do the add to the pageset list before a potential free_pages_bulk
  call in free_hot_cold_page (otherwise we would have to hold a page
  in a pageset during the period that the boot pagesets are in use).
- remove mistaken __cpuinitdata attribute and revert back to __initdata
for the boot pageset. A boot pageset is not necessary for cpu hotplug.

Tested on x86_64 (2.6.12-rc6-mm1): UP, SMP, NUMA. Tested on
IA64 (2.6.12-rc5-mm2): NUMA (2.6.12-rc6-mm1 is broken for IA64 because of the
sparsemem patches).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Christoph Lameter; committed by Linus Torvalds.
2caaad41 4ae7c039

+42 -66
+42 -66
mm/page_alloc.c
··· 71 71 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; 72 72 EXPORT_SYMBOL(zone_table); 73 73 74 - #ifdef CONFIG_NUMA 75 - static struct per_cpu_pageset 76 - pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata; 77 - #endif 78 - 79 74 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 80 75 int min_free_kbytes = 1024; 81 76 ··· 647 652 free_pages_check(__FUNCTION__, page); 648 653 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 649 654 local_irq_save(flags); 650 - if (pcp->count >= pcp->high) 651 - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 652 655 list_add(&page->lru, &pcp->list); 653 656 pcp->count++; 657 + if (pcp->count >= pcp->high) 658 + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 654 659 local_irq_restore(flags); 655 660 put_cpu(); 656 661 } ··· 1709 1714 return batch; 1710 1715 } 1711 1716 1717 + inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 1718 + { 1719 + struct per_cpu_pages *pcp; 1720 + 1721 + pcp = &p->pcp[0]; /* hot */ 1722 + pcp->count = 0; 1723 + pcp->low = 2 * batch; 1724 + pcp->high = 6 * batch; 1725 + pcp->batch = max(1UL, 1 * batch); 1726 + INIT_LIST_HEAD(&pcp->list); 1727 + 1728 + pcp = &p->pcp[1]; /* cold*/ 1729 + pcp->count = 0; 1730 + pcp->low = 0; 1731 + pcp->high = 2 * batch; 1732 + pcp->batch = max(1UL, 1 * batch); 1733 + INIT_LIST_HEAD(&pcp->list); 1734 + } 1735 + 1712 1736 #ifdef CONFIG_NUMA 1713 1737 /* 1714 - * Dynamicaly allocate memory for the 1738 + * Boot pageset table. One per cpu which is going to be used for all 1739 + * zones and all nodes. The parameters will be set in such a way 1740 + * that an item put on a list will immediately be handed over to 1741 + * the buddy list. This is safe since pageset manipulation is done 1742 + * with interrupts disabled. 1743 + * 1744 + * Some NUMA counter updates may also be caught by the boot pagesets. 1745 + * These will be discarded when bootup is complete. 
1746 + */ 1747 + static struct per_cpu_pageset 1748 + boot_pageset[NR_CPUS] __initdata; 1749 + 1750 + /* 1751 + * Dynamically allocate memory for the 1715 1752 * per cpu pageset array in struct zone. 1716 1753 */ 1717 1754 static int __devinit process_zones(int cpu) 1718 1755 { 1719 1756 struct zone *zone, *dzone; 1720 - int i; 1721 1757 1722 1758 for_each_zone(zone) { 1723 - struct per_cpu_pageset *npageset = NULL; 1724 1759 1725 - npageset = kmalloc_node(sizeof(struct per_cpu_pageset), 1760 + zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1726 1761 GFP_KERNEL, cpu_to_node(cpu)); 1727 - if (!npageset) { 1728 - zone->pageset[cpu] = NULL; 1762 + if (!zone->pageset[cpu]) 1729 1763 goto bad; 1730 - } 1731 1764 1732 - if (zone->pageset[cpu]) { 1733 - memcpy(npageset, zone->pageset[cpu], 1734 - sizeof(struct per_cpu_pageset)); 1735 - 1736 - /* Relocate lists */ 1737 - for (i = 0; i < 2; i++) { 1738 - INIT_LIST_HEAD(&npageset->pcp[i].list); 1739 - list_splice(&zone->pageset[cpu]->pcp[i].list, 1740 - &npageset->pcp[i].list); 1741 - } 1742 - } else { 1743 - struct per_cpu_pages *pcp; 1744 - unsigned long batch; 1745 - 1746 - batch = zone_batchsize(zone); 1747 - 1748 - pcp = &npageset->pcp[0]; /* hot */ 1749 - pcp->count = 0; 1750 - pcp->low = 2 * batch; 1751 - pcp->high = 6 * batch; 1752 - pcp->batch = 1 * batch; 1753 - INIT_LIST_HEAD(&pcp->list); 1754 - 1755 - pcp = &npageset->pcp[1]; /* cold*/ 1756 - pcp->count = 0; 1757 - pcp->low = 0; 1758 - pcp->high = 2 * batch; 1759 - pcp->batch = 1 * batch; 1760 - INIT_LIST_HEAD(&pcp->list); 1761 - } 1762 - zone->pageset[cpu] = npageset; 1765 + setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1763 1766 } 1764 1767 1765 1768 return 0; ··· 1871 1878 batch = zone_batchsize(zone); 1872 1879 1873 1880 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1874 - struct per_cpu_pages *pcp; 1875 1881 #ifdef CONFIG_NUMA 1876 - struct per_cpu_pageset *pgset; 1877 - pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS + 1878 - (j * 
NR_CPUS) + cpu]; 1879 - 1880 - zone->pageset[cpu] = pgset; 1882 + /* Early boot. Slab allocator not functional yet */ 1883 + zone->pageset[cpu] = &boot_pageset[cpu]; 1884 + setup_pageset(&boot_pageset[cpu],0); 1881 1885 #else 1882 - struct per_cpu_pageset *pgset = zone_pcp(zone, cpu); 1886 + setup_pageset(zone_pcp(zone,cpu), batch); 1883 1887 #endif 1884 - 1885 - pcp = &pgset->pcp[0]; /* hot */ 1886 - pcp->count = 0; 1887 - pcp->low = 2 * batch; 1888 - pcp->high = 6 * batch; 1889 - pcp->batch = 1 * batch; 1890 - INIT_LIST_HEAD(&pcp->list); 1891 - 1892 - pcp = &pgset->pcp[1]; /* cold */ 1893 - pcp->count = 0; 1894 - pcp->low = 0; 1895 - pcp->high = 2 * batch; 1896 - pcp->batch = 1 * batch; 1897 - INIT_LIST_HEAD(&pcp->list); 1898 1888 } 1899 1889 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1900 1890 zone_names[j], realsize, batch);