/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

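/*
 * Physical-address-to-node hash map: memnodemap[physaddr >> memnode_shift]
 * holds the node id owning that chunk of memory (see phys_to_nid() and
 * populate_memnodemap() below).
 */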
struct memnode memnode;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;
unsigned long __initdata nodemap_addr;
unsigned long __initdata nodemap_size;


/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
        int i;
        int res = -1;
        unsigned long addr, end;

        memset(memnodemap, 0xff, memnodemapsize);
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != 0xff)
                                return -1;
                        memnodemap[addr >> shift] = i;
                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}

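/*
 * Place memnodemap[]: small maps fit into the embedded array inside
 * struct memnode; larger ones get an L1-cache-aligned area reserved
 * from the e820 map and mapped via phys_to_virt().
 */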
static int __init allocate_cachealigned_memnodemap(void)
{
        unsigned long pad, pad_addr;

        memnodemap = memnode.embedded_map;
        if (memnodemapsize <= 48)
                return 0;

        pad = L1_CACHE_BYTES - 1;
        pad_addr = 0x8000;
        nodemap_size = pad + memnodemapsize;
        nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
                                      nodemap_size);
        if (nodemap_addr == -1UL) {
                printk(KERN_ERR
                       "NUMA: Unable to allocate Memory to Node hash map\n");
                nodemap_addr = nodemap_size = 0;
                return -1;
        }
        pad_addr = (nodemap_addr + pad) & ~pad;
        memnodemap = phys_to_virt(pad_addr);

        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);
        return 0;
}

/*
 * The lowest bit set in any node start address gives the maximum
 * possible shift for the memnodemap[] hash.
 */
static int __init
extract_lsb_from_nodes(const struct bootnode *nodes, int numnodes)
{
        int i, nodes_used = 0;
        unsigned long start, end;
        unsigned long bitfield = 0, memtop = 0;

        for (i = 0; i < numnodes; i++) {
                start = nodes[i].start;
                end = nodes[i].end;
                if (start >= end)
                        continue;
                bitfield |= start;
                nodes_used++;
                if (end > memtop)
                        memtop = end;
        }
        if (nodes_used <= 1)
                i = 63;
        else
                i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
        memnodemapsize = (memtop >> i)+1;
        return i;
}

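/*
 * Compute the shift for the memory-to-node hash map from the node
 * boundaries, allocate memnodemap[] and fill it in. Returns the shift,
 * or -1 if no suitable hash function could be set up.
 */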
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
        int shift;

        shift = extract_lsb_from_nodes(nodes, numnodes);
        if (allocate_cachealigned_memnodemap())
                return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
               shift);

        if (populate_memnodemap(nodes, numnodes, shift) != 1) {
                printk(KERN_INFO
                       "Your memory is not aligned; you need to rebuild your kernel "
                       "with a bigger NODEMAPSIZE, shift=%d\n",
                       shift);
                return -1;
        }
        return shift;
}

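/*
 * Early pfn-to-node lookup for CONFIG_SPARSEMEM, answered from the
 * memory-to-node hash map.
 */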
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

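/*
 * Allocate "size" bytes of early per-node memory: prefer a free e820 range
 * inside [start, end); fall back to the generic bootmem allocator above
 * MAX_DMA_ADDRESS if that fails.
 */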
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
               unsigned long size)
{
        unsigned long mem = find_e820_area(start, end, size);
        void *ptr;
        if (mem != -1L)
                return __va(mem);
        ptr = __alloc_bootmem_nopanic(size,
                                SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
        if (ptr == NULL) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
                       size, nodeid);
                return NULL;
        }
        return ptr;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
        unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
        unsigned long nodedata_phys;
        void *bootmap;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

        start = round_up(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

        node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);

        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

        /* Find a place for the bootmem map */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                 bootmap_pages<<PAGE_SHIFT);
        if (bootmap == NULL) {
                if (nodedata_phys < start || nodedata_phys >= end)
                        free_bootmem((unsigned long)node_data[nodeid], pgdat_size);
                node_data[nodeid] = NULL;
                return;
        }
        bootmap_start = __pa(bootmap);
        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);

        free_bootmem_with_active_regions(nodeid, end);

        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
#endif
        node_set_online(nodeid);
}

/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
        unsigned long start_pfn, end_pfn, memmapsize, limit;

        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);

        Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
                nodeid, start_pfn, end_pfn);

        /* Try to allocate the mem_map at the end of the node so as not to
           fill up precious <4GB memory. */
        memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
        limit = end_pfn << PAGE_SHIFT;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
        NODE_DATA(nodeid)->node_mem_map =
                __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
                                     memmapsize, SMP_CACHE_BYTES,
                                     round_down(limit - memmapsize, PAGE_SIZE),
                                     limit);
#endif
}

void __init numa_init_array(void)
{
        int rr, i;
        /*
         * There are unfortunately some poorly designed mainboards around
         * that only connect memory to a single CPU. This breaks the 1:1
         * cpu->node mapping. To avoid this, fill in the mapping for all
         * possible CPUs, as the number of CPUs is not known yet. We round
         * robin the existing nodes.
         */
        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}

#ifdef CONFIG_NUMA_EMU
/* NUMA emulation */
int numa_fake __initdata = 0;

/*
 * Check whether the range [start, end) crosses the DMA32 zone boundary.
 */
static int __init zone_cross_over(unsigned long start, unsigned long end)
{
        if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
            (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
                return 1;
        return 0;
}

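/*
 * numa=fake=N: carve the [start_pfn, end_pfn) range into numa_fake nodes of
 * roughly equal size (multiples of FAKE_NODE_MIN_SIZE) and register them as
 * if they were real NUMA nodes.
 */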
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
        int i, big;
        struct bootnode nodes[MAX_NUMNODES];
        unsigned long sz, old_sz;
        unsigned long hole_size;
        unsigned long start, end;
        unsigned long max_addr = (end_pfn << PAGE_SHIFT);

        start = (start_pfn << PAGE_SHIFT);
        hole_size = e820_hole_size(start, max_addr);
        sz = (max_addr - start - hole_size) / numa_fake;

        /* Kludge needed for the hash function */

        old_sz = sz;
        /*
         * Round down to the nearest FAKE_NODE_MIN_SIZE.
         */
        sz &= FAKE_NODE_MIN_HASH_MASK;

        /*
         * We ensure that each node is at least 64MB big. Smaller than this
         * size can cause VM hiccups.
         */
        if (sz == 0) {
                printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
                       "the number of nodes\n", numa_fake);
                numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
                printk(KERN_INFO "Number of fake nodes will be = %d\n",
                       numa_fake);
                sz = FAKE_NODE_MIN_SIZE;
        }
        /*
         * Find out how many nodes can get an extra FAKE_NODE_MIN_SIZE granule.
         * This logic ensures the extra memory gets distributed among as many
         * nodes as possible (as compared to one single node getting all that
         * extra memory).
         */
        big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
        printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
               "%d\n",
               (sz >> 20), (hole_size >> 20), big);
        memset(&nodes, 0, sizeof(nodes));
        end = start;
        for (i = 0; i < numa_fake; i++) {
                /*
                 * In case we are not able to allocate enough memory for all
                 * the nodes, we reduce the number of fake nodes.
                 */
                if (end >= max_addr) {
                        numa_fake = i - 1;
                        break;
                }
                start = nodes[i].start = end;
                /*
                 * Final node can have all the remaining memory.
                 */
                if (i == numa_fake-1)
                        sz = max_addr - start;
                end = nodes[i].start + sz;
                /*
                 * The first "big" nodes get an extra granule.
                 */
                if (i < big)
                        end += FAKE_NODE_MIN_SIZE;
                /*
                 * Iterate over the range to ensure that this node gets at
                 * least sz amount of RAM (excluding holes).
                 */
                while ((end - start - e820_hole_size(start, end)) < sz) {
                        end += FAKE_NODE_MIN_SIZE;
                        if (end >= max_addr)
                                break;
                }
                /*
                 * Look at the next node to make sure there is some real memory
                 * to map. Bad things happen when the only memory present
                 * in a zone on a fake node is an IO hole.
                 */
                while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
                        if (zone_cross_over(start, end + sz)) {
                                end = (MAX_DMA32_PFN << PAGE_SHIFT);
                                break;
                        }
                        if (end >= max_addr)
                                break;
                        end += FAKE_NODE_MIN_SIZE;
                }
                if (end > max_addr)
                        end = max_addr;
                nodes[i].end = end;
                printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
                       i,
                       nodes[i].start, nodes[i].end,
                       (nodes[i].end - nodes[i].start) >> 20);
                node_set_online(i);
        }
        memnode_shift = compute_hash_shift(nodes, numa_fake);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
                return -1;
        }
        for_each_online_node(i) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                             nodes[i].end >> PAGE_SHIFT);
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        numa_init_array();
        return 0;
}
#endif

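/*
 * Discover the NUMA topology: try emulation (if numa=fake= was given), then
 * the ACPI SRAT, then the AMD K8 northbridge registers. If all of these fail
 * or NUMA is off, fall back to a single node 0 spanning all memory.
 */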
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

#ifdef CONFIG_NUMA_EMU
        if (numa_fake && !numa_emulation(start_pfn, end_pfn))
                return;
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
                return;
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
                return;
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
        memnode_shift = 63;
        memnodemap = memnode.embedded_map;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
        node_to_cpumask[0] = cpumask_of_cpu(0);
        e820_register_active_regions(0, start_pfn, end_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

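/* Add a CPU to the cpumask of its node. */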
__cpuinit void numa_add_cpu(int cpu)
{
        cpu_set(cpu, node_to_cpumask[cpu_to_node(cpu)]);
}

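/* Record a CPU's node in its PDA (for fast access) and in cpu_to_node[]. */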
void __cpuinit numa_set_node(int cpu, int node)
{
        cpu_pda(cpu)->nodenumber = node;
        cpu_to_node[cpu] = node;
}

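/*
 * Release the bootmem pages of every online node to the page allocator.
 * Returns the total number of pages freed.
 */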
unsigned long __init numa_free_all_bootmem(void)
{
        int i;
        unsigned long pages = 0;
        for_each_online_node(i) {
                pages += free_all_bootmem_node(NODE_DATA(i));
        }
        return pages;
}

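/*
 * Set the per-zone PFN limits, register the sparsemem sections, allocate the
 * per-node mem_map and initialize the zone data for all online nodes.
 */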
void __init paging_init(void)
{
        int i;
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();

        for_each_online_node(i) {
                setup_node_zones(i);
        }

        free_area_init_nodes(max_zone_pfns);
}

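/*
 * Parse the early "numa=" command line option:
 *   off          - disable NUMA
 *   fake=<N>     - emulate N NUMA nodes (CONFIG_NUMA_EMU)
 *   noacpi       - ignore the ACPI SRAT (CONFIG_ACPI_NUMA)
 *   hotadd=<pct> - set hotadd_percent for the ACPI SRAT code
 */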
static __init int numa_setup(char *opt)
{
        if (!opt)
                return -EINVAL;
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5)) {
                numa_fake = simple_strtoul(opt+5, NULL, 0);
                if (numa_fake >= MAX_NUMNODES)
                        numa_fake = MAX_NUMNODES;
        }
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
        if (!strncmp(opt, "hotadd=", 7))
                hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
        return 0;
}

early_param("numa", numa_setup);

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and apicid_to_node[]
 * have valid entries for a CPU. This means we skip cpu_to_node[]
 * initialisation for NUMA emulation and the fake node case (when running
 * a kernel compiled for NUMA on a non-NUMA box), which is OK because
 * cpu_to_node[] has already been initialised in a round-robin manner in
 * numa_init_array() prior to this call, and that initialisation is good
 * enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
        int i;
        for (i = 0; i < NR_CPUS; i++) {
                u8 apicid = x86_cpu_to_apicid[i];
                if (apicid == BAD_APICID)
                        continue;
                if (apicid_to_node[apicid] == NUMA_NO_NODE)
                        continue;
                numa_set_node(i, apicid_to_node[apicid]);
        }
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);

#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per-node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state. Should do that.
 */

int pfn_valid(unsigned long pfn)
{
        unsigned nid;
        if (pfn >= num_physpages)
                return 0;
        nid = pfn_to_nid(pfn);
        if (nid == 0xff)
                return 0;
        return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif