Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.21 555 lines 15 kB view raw
1/* 2 * Generic VM initialization for x86-64 NUMA setups. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 4 */ 5#include <linux/kernel.h> 6#include <linux/mm.h> 7#include <linux/string.h> 8#include <linux/init.h> 9#include <linux/bootmem.h> 10#include <linux/mmzone.h> 11#include <linux/ctype.h> 12#include <linux/module.h> 13#include <linux/nodemask.h> 14 15#include <asm/e820.h> 16#include <asm/proto.h> 17#include <asm/dma.h> 18#include <asm/numa.h> 19#include <asm/acpi.h> 20 21#ifndef Dprintk 22#define Dprintk(x...) 23#endif 24 25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 27 28struct memnode memnode; 29 30unsigned char cpu_to_node[NR_CPUS] __read_mostly = { 31 [0 ... NR_CPUS-1] = NUMA_NO_NODE 32}; 33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 35}; 36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 37 38int numa_off __initdata; 39unsigned long __initdata nodemap_addr; 40unsigned long __initdata nodemap_size; 41 42 43/* 44 * Given a shift value, try to populate memnodemap[] 45 * Returns : 46 * 1 if OK 47 * 0 if memnodmap[] too small (of shift too small) 48 * -1 if node overlap or lost ram (shift too big) 49 */ 50static int __init 51populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) 52{ 53 int i; 54 int res = -1; 55 unsigned long addr, end; 56 57 memset(memnodemap, 0xff, memnodemapsize); 58 for (i = 0; i < numnodes; i++) { 59 addr = nodes[i].start; 60 end = nodes[i].end; 61 if (addr >= end) 62 continue; 63 if ((end >> shift) >= memnodemapsize) 64 return 0; 65 do { 66 if (memnodemap[addr >> shift] != 0xff) 67 return -1; 68 memnodemap[addr >> shift] = i; 69 addr += (1UL << shift); 70 } while (addr < end); 71 res = 1; 72 } 73 return res; 74} 75 76static int __init allocate_cachealigned_memnodemap(void) 77{ 78 unsigned long pad, pad_addr; 79 80 memnodemap = memnode.embedded_map; 81 if (memnodemapsize <= 48) 82 return 0; 83 84 pad = L1_CACHE_BYTES - 1; 85 pad_addr = 0x8000; 86 nodemap_size = pad + memnodemapsize; 87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, 88 nodemap_size); 89 if (nodemap_addr == -1UL) { 90 printk(KERN_ERR 91 "NUMA: Unable to allocate Memory to Node hash map\n"); 92 nodemap_addr = nodemap_size = 0; 93 return -1; 94 } 95 pad_addr = (nodemap_addr + pad) & ~pad; 96 memnodemap = phys_to_virt(pad_addr); 97 98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", 99 nodemap_addr, nodemap_addr + nodemap_size); 100 return 0; 101} 102 103/* 104 * The LSB of all start and end addresses in the node map is the value of the 105 * maximum possible shift. 106 */ 107static int __init 108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) 109{ 110 int i, nodes_used = 0; 111 unsigned long start, end; 112 unsigned long bitfield = 0, memtop = 0; 113 114 for (i = 0; i < numnodes; i++) { 115 start = nodes[i].start; 116 end = nodes[i].end; 117 if (start >= end) 118 continue; 119 bitfield |= start; 120 nodes_used++; 121 if (end > memtop) 122 memtop = end; 123 } 124 if (nodes_used <= 1) 125 i = 63; 126 else 127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8); 128 memnodemapsize = (memtop >> i)+1; 129 return i; 130} 131 132int __init compute_hash_shift(struct bootnode *nodes, int numnodes) 133{ 134 int shift; 135 136 shift = extract_lsb_from_nodes(nodes, numnodes); 137 if (allocate_cachealigned_memnodemap()) 138 return -1; 139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 140 shift); 141 142 if (populate_memnodemap(nodes, numnodes, shift) != 1) { 143 printk(KERN_INFO 144 "Your memory is not aligned you need to rebuild your kernel " 145 "with a bigger NODEMAPSIZE shift=%d\n", 146 shift); 147 return -1; 148 } 149 return shift; 150} 151 152#ifdef CONFIG_SPARSEMEM 153int early_pfn_to_nid(unsigned long pfn) 154{ 155 return phys_to_nid(pfn << PAGE_SHIFT); 156} 157#endif 158 159static void * __init 160early_node_mem(int nodeid, unsigned long start, unsigned long end, 161 unsigned long size) 162{ 163 unsigned long mem = find_e820_area(start, end, size); 164 void *ptr; 165 if (mem != -1L) 166 return __va(mem); 167 ptr = __alloc_bootmem_nopanic(size, 168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); 169 if (ptr == 0) { 170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n", 171 size, nodeid); 172 return NULL; 173 } 174 return ptr; 175} 176 177/* Initialize bootmem allocator for a node */ 178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 179{ 180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 181 unsigned long nodedata_phys; 182 void *bootmap; 183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 184 185 start = round_up(start, ZONE_ALIGN); 186 187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); 188 189 start_pfn = start >> PAGE_SHIFT; 190 end_pfn = end >> PAGE_SHIFT; 191 192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); 193 if (node_data[nodeid] == NULL) 194 return; 195 nodedata_phys = __pa(node_data[nodeid]); 196 197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 201 202 /* Find a place for the bootmem map */ 203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 205 bootmap = early_node_mem(nodeid, bootmap_start, end, 206 bootmap_pages<<PAGE_SHIFT); 207 if (bootmap == NULL) { 208 if (nodedata_phys < start || nodedata_phys >= end) 209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size); 210 node_data[nodeid] = NULL; 211 return; 212 } 213 bootmap_start = __pa(bootmap); 214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 215 216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 217 bootmap_start >> PAGE_SHIFT, 218 start_pfn, end_pfn); 219 220 free_bootmem_with_active_regions(nodeid, end); 221 222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 224#ifdef CONFIG_ACPI_NUMA 225 srat_reserve_add_area(nodeid); 226#endif 227 node_set_online(nodeid); 228} 229 230/* Initialize final allocator for a zone */ 231void __init setup_node_zones(int nodeid) 232{ 233 unsigned long start_pfn, end_pfn, memmapsize, limit; 234 235 start_pfn = node_start_pfn(nodeid); 236 end_pfn = node_end_pfn(nodeid); 237 238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", 239 nodeid, start_pfn, end_pfn); 240 241 /* Try to allocate mem_map at end to not fill up precious <4GB 242 memory. */ 243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn); 244 limit = end_pfn << PAGE_SHIFT; 245#ifdef CONFIG_FLAT_NODE_MEM_MAP 246 NODE_DATA(nodeid)->node_mem_map = 247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 248 memmapsize, SMP_CACHE_BYTES, 249 round_down(limit - memmapsize, PAGE_SIZE), 250 limit); 251#endif 252} 253 254void __init numa_init_array(void) 255{ 256 int rr, i; 257 /* There are unfortunately some poorly designed mainboards around 258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node 259 mapping. To avoid this fill in the mapping for all possible 260 CPUs, as the number of CPUs is not known yet. 261 We round robin the existing nodes. */ 262 rr = first_node(node_online_map); 263 for (i = 0; i < NR_CPUS; i++) { 264 if (cpu_to_node[i] != NUMA_NO_NODE) 265 continue; 266 numa_set_node(i, rr); 267 rr = next_node(rr, node_online_map); 268 if (rr == MAX_NUMNODES) 269 rr = first_node(node_online_map); 270 } 271 272} 273 274#ifdef CONFIG_NUMA_EMU 275/* Numa emulation */ 276int numa_fake __initdata = 0; 277 278/* 279 * This function is used to find out if the start and end correspond to 280 * different zones. 281 */ 282int zone_cross_over(unsigned long start, unsigned long end) 283{ 284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && 285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) 286 return 1; 287 return 0; 288} 289 290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 291{ 292 int i, big; 293 struct bootnode nodes[MAX_NUMNODES]; 294 unsigned long sz, old_sz; 295 unsigned long hole_size; 296 unsigned long start, end; 297 unsigned long max_addr = (end_pfn << PAGE_SHIFT); 298 299 start = (start_pfn << PAGE_SHIFT); 300 hole_size = e820_hole_size(start, max_addr); 301 sz = (max_addr - start - hole_size) / numa_fake; 302 303 /* Kludge needed for the hash function */ 304 305 old_sz = sz; 306 /* 307 * Round down to the nearest FAKE_NODE_MIN_SIZE. 308 */ 309 sz &= FAKE_NODE_MIN_HASH_MASK; 310 311 /* 312 * We ensure that each node is at least 64MB big. Smaller than this 313 * size can cause VM hiccups. 314 */ 315 if (sz == 0) { 316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing " 317 "the number of nodes\n", numa_fake); 318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; 319 printk(KERN_INFO "Number of fake nodes will be = %d\n", 320 numa_fake); 321 sz = FAKE_NODE_MIN_SIZE; 322 } 323 /* 324 * Find out how many nodes can get an extra NODE_MIN_SIZE granule. 325 * This logic ensures the extra memory gets distributed among as many 326 * nodes as possible (as compared to one single node getting all that 327 * extra memory. 328 */ 329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; 330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " 331 "%d\n", 332 (sz >> 20), (hole_size >> 20), big); 333 memset(&nodes,0,sizeof(nodes)); 334 end = start; 335 for (i = 0; i < numa_fake; i++) { 336 /* 337 * In case we are not able to allocate enough memory for all 338 * the nodes, we reduce the number of fake nodes. 339 */ 340 if (end >= max_addr) { 341 numa_fake = i - 1; 342 break; 343 } 344 start = nodes[i].start = end; 345 /* 346 * Final node can have all the remaining memory. 347 */ 348 if (i == numa_fake-1) 349 sz = max_addr - start; 350 end = nodes[i].start + sz; 351 /* 352 * Fir "big" number of nodes get extra granule. 353 */ 354 if (i < big) 355 end += FAKE_NODE_MIN_SIZE; 356 /* 357 * Iterate over the range to ensure that this node gets at 358 * least sz amount of RAM (excluding holes) 359 */ 360 while ((end - start - e820_hole_size(start, end)) < sz) { 361 end += FAKE_NODE_MIN_SIZE; 362 if (end >= max_addr) 363 break; 364 } 365 /* 366 * Look at the next node to make sure there is some real memory 367 * to map. Bad things happen when the only memory present 368 * in a zone on a fake node is IO hole. 369 */ 370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { 371 if (zone_cross_over(start, end + sz)) { 372 end = (MAX_DMA32_PFN << PAGE_SHIFT); 373 break; 374 } 375 if (end >= max_addr) 376 break; 377 end += FAKE_NODE_MIN_SIZE; 378 } 379 if (end > max_addr) 380 end = max_addr; 381 nodes[i].end = end; 382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 383 i, 384 nodes[i].start, nodes[i].end, 385 (nodes[i].end - nodes[i].start) >> 20); 386 node_set_online(i); 387 } 388 memnode_shift = compute_hash_shift(nodes, numa_fake); 389 if (memnode_shift < 0) { 390 memnode_shift = 0; 391 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 392 return -1; 393 } 394 for_each_online_node(i) { 395 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 396 nodes[i].end >> PAGE_SHIFT); 397 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 398 } 399 numa_init_array(); 400 return 0; 401} 402#endif 403 404void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 405{ 406 int i; 407 408#ifdef CONFIG_NUMA_EMU 409 if (numa_fake && !numa_emulation(start_pfn, end_pfn)) 410 return; 411#endif 412 413#ifdef CONFIG_ACPI_NUMA 414 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 415 end_pfn << PAGE_SHIFT)) 416 return; 417#endif 418 419#ifdef CONFIG_K8_NUMA 420 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) 421 return; 422#endif 423 printk(KERN_INFO "%s\n", 424 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 425 426 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 427 start_pfn << PAGE_SHIFT, 428 end_pfn << PAGE_SHIFT); 429 /* setup dummy node covering all memory */ 430 memnode_shift = 63; 431 memnodemap = memnode.embedded_map; 432 memnodemap[0] = 0; 433 nodes_clear(node_online_map); 434 node_set_online(0); 435 for (i = 0; i < NR_CPUS; i++) 436 numa_set_node(i, 0); 437 node_to_cpumask[0] = cpumask_of_cpu(0); 438 e820_register_active_regions(0, start_pfn, end_pfn); 439 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 440} 441 442__cpuinit void numa_add_cpu(int cpu) 443{ 444 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); 445} 446 447void __cpuinit numa_set_node(int cpu, int node) 448{ 449 cpu_pda(cpu)->nodenumber = node; 450 cpu_to_node[cpu] = node; 451} 452 453unsigned long __init numa_free_all_bootmem(void) 454{ 455 int i; 456 unsigned long pages = 0; 457 for_each_online_node(i) { 458 pages += free_all_bootmem_node(NODE_DATA(i)); 459 } 460 return pages; 461} 462 463void __init paging_init(void) 464{ 465 int i; 466 unsigned long max_zone_pfns[MAX_NR_ZONES]; 467 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 468 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 469 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 470 max_zone_pfns[ZONE_NORMAL] = end_pfn; 471 472 sparse_memory_present_with_active_regions(MAX_NUMNODES); 473 sparse_init(); 474 475 for_each_online_node(i) { 476 setup_node_zones(i); 477 } 478 479 free_area_init_nodes(max_zone_pfns); 480} 481 482static __init int numa_setup(char *opt) 483{ 484 if (!opt) 485 return -EINVAL; 486 if (!strncmp(opt,"off",3)) 487 numa_off = 1; 488#ifdef CONFIG_NUMA_EMU 489 if(!strncmp(opt, "fake=", 5)) { 490 numa_fake = simple_strtoul(opt+5,NULL,0); ; 491 if (numa_fake >= MAX_NUMNODES) 492 numa_fake = MAX_NUMNODES; 493 } 494#endif 495#ifdef CONFIG_ACPI_NUMA 496 if (!strncmp(opt,"noacpi",6)) 497 acpi_numa = -1; 498 if (!strncmp(opt,"hotadd=", 7)) 499 hotadd_percent = simple_strtoul(opt+7, NULL, 10); 500#endif 501 return 0; 502} 503 504early_param("numa", numa_setup); 505 506/* 507 * Setup early cpu_to_node. 508 * 509 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 510 * and apicid_to_node[] tables have valid entries for a CPU. 511 * This means we skip cpu_to_node[] initialisation for NUMA 512 * emulation and faking node case (when running a kernel compiled 513 * for NUMA on a non NUMA box), which is OK as cpu_to_node[] 514 * is already initialized in a round robin manner at numa_init_array, 515 * prior to this call, and this initialization is good enough 516 * for the fake NUMA cases. 517 */ 518void __init init_cpu_to_node(void) 519{ 520 int i; 521 for (i = 0; i < NR_CPUS; i++) { 522 u8 apicid = x86_cpu_to_apicid[i]; 523 if (apicid == BAD_APICID) 524 continue; 525 if (apicid_to_node[apicid] == NUMA_NO_NODE) 526 continue; 527 numa_set_node(i,apicid_to_node[apicid]); 528 } 529} 530 531EXPORT_SYMBOL(cpu_to_node); 532EXPORT_SYMBOL(node_to_cpumask); 533EXPORT_SYMBOL(memnode); 534EXPORT_SYMBOL(node_data); 535 536#ifdef CONFIG_DISCONTIGMEM 537/* 538 * Functions to convert PFNs from/to per node page addresses. 539 * These are out of line because they are quite big. 540 * They could be all tuned by pre caching more state. 541 * Should do that. 542 */ 543 544int pfn_valid(unsigned long pfn) 545{ 546 unsigned nid; 547 if (pfn >= num_physpages) 548 return 0; 549 nid = pfn_to_nid(pfn); 550 if (nid == 0xff) 551 return 0; 552 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); 553} 554EXPORT_SYMBOL(pfn_valid); 555#endif