Merge branch 'x86/numa' into x86/urgent

Merge reason: Topic is ready for upstream.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

+227 -65
+5 -6
arch/x86/include/asm/acpi.h
···
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
-extern int acpi_get_nodes(struct bootnode *physnodes);
+extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+			   unsigned long end);
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+#ifdef CONFIG_NUMA_EMU
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
 			    int num_nodes);
-#else
-static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
-				   int num_nodes)
-{
-}
 #endif
+#endif /* CONFIG_ACPI_NUMA */
 
 #define acpi_unlazy_tlb(x)	leave_mm(x)
+5 -1
arch/x86/include/asm/amd_nb.h
···
 extern int early_is_amd_nb(u32 value);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
-extern int amd_get_nodes(struct bootnode *nodes);
 extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
 extern int amd_scan_nodes(void);
+
+#ifdef CONFIG_NUMA_EMU
+extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
+extern void amd_get_nodes(struct bootnode *nodes);
+#endif
 
 struct amd_northbridge {
 	struct pci_dev *misc;
+1 -1
arch/x86/include/asm/numa_64.h
···
 extern void __cpuinit numa_remove_cpu(int cpu);
 
 #ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE	((u64)64 << 20)
+#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
 #endif /* CONFIG_NUMA_EMU */
 #else
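For reference: FAKE_NODE_MIN_HASH_MASK is the usual power-of-two align-down mask, so halving FAKE_NODE_MIN_SIZE to 32 MB halves the granularity that emulated node boundaries are rounded to. A minimal userspace sketch of the rounding, using a hypothetical boundary address (not from the patch):

#include <stdio.h>
#include <stdint.h>

#define FAKE_NODE_MIN_SIZE	((uint64_t)32 << 20)	/* 32 MB, as in the hunk above */
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

int main(void)
{
	uint64_t end = (uint64_t)100 << 20;	/* hypothetical node boundary: 100 MB */

	/* rounds down to the minimum emulated-node granularity: 96 MB */
	printf("%#llx -> %#llx\n", (unsigned long long)end,
	       (unsigned long long)(end & FAKE_NODE_MIN_HASH_MASK));
	return 0;
}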
+72 -14
arch/x86/mm/amdtopology_64.c
···
 #include <asm/amd_nb.h>
 
 static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
 static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
 
 static __init int find_northbridge(void)
···
 #endif
 }
 
-int __init amd_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-	int ret = 0;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
-	}
-	return ret;
-}
-
 int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long start = PFN_PHYS(start_pfn);
···
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-		nodeid = limit & 7;
+		nodeids[i] = nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
 				pr_info("Skipping disabled node %d\n", i);
···
 		return -1;
 	return 0;
 }
+
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+void __init amd_get_nodes(struct bootnode *physnodes)
+{
+	int i;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
+	}
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		if (addr >= nodes[i].start && addr < nodes[i].end) {
+			ret = i;
+			break;
+		}
+	return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment.  For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality.  node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+	unsigned int bits;
+	unsigned int cores;
+	unsigned int apicid_base = 0;
+	int i;
+
+	bits = boot_cpu_data.x86_coreid_bits;
+	cores = 1 << bits;
+	early_get_boot_cpu_id();
+	if (boot_cpu_physical_apicid > 0)
+		apicid_base = boot_cpu_physical_apicid;
+
+	for (i = 0; i < nr_nodes; i++) {
+		int index;
+		int nid;
+		int j;
+
+		nid = find_node_by_addr(nodes[i].start);
+		if (nid == NUMA_NO_NODE)
+			continue;
+
+		index = nodeids[nid] << bits;
+		if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+			for (j = apicid_base; j < cores + apicid_base; j++)
+				fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+		__acpi_map_pxm_to_node(nid, i);
+#endif
+	}
+	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */
 
 int __init amd_scan_nodes(void)
 {
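The comment block above carries the core idea: each emulated node is attributed, via find_node_by_addr(), to the physical node whose range contains its start address, and a fake PXM-to-nid mapping is recorded so node_distance() stays meaningful. A minimal userspace sketch of that attribution rule, with a hypothetical two-node physical layout (array contents are illustrative, not from the patch):

#include <stdio.h>

struct bootnode { unsigned long start, end; };

/* hypothetical physical topology: two 512 MB nodes */
static const struct bootnode phys[2] = {
	{ 0x00000000UL, 0x20000000UL },
	{ 0x20000000UL, 0x40000000UL },
};

/* same lookup rule as find_node_by_addr() in the hunk above */
static int find_node_by_addr(unsigned long addr)
{
	int i;

	for (i = 0; i < 2; i++)
		if (addr >= phys[i].start && addr < phys[i].end)
			return i;
	return -1;	/* stands in for NUMA_NO_NODE */
}

int main(void)
{
	/* hypothetical start addresses of four emulated nodes */
	const unsigned long emu_start[4] = {
		0x00000000UL, 0x10000000UL, 0x20000000UL, 0x30000000UL,
	};
	int i;

	/* emulated nodes 0,1 land on physical node 0; 2,3 on node 1 */
	for (i = 0; i < 4; i++)
		printf("emulated node %d -> physical node %d\n",
		       i, find_node_by_addr(emu_start[i]));
	return 0;
}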
+126 -35
arch/x86/mm/numa_64.c
···
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
 static char *cmdline __initdata;
 
 static int __init setup_physnodes(unsigned long start, unsigned long end,
 				  int acpi, int amd)
 {
-	int nr_nodes = 0;
 	int ret = 0;
 	int i;
 
+	memset(physnodes, 0, sizeof(physnodes));
 #ifdef CONFIG_ACPI_NUMA
 	if (acpi)
-		nr_nodes = acpi_get_nodes(physnodes);
+		acpi_get_nodes(physnodes, start, end);
 #endif
 #ifdef CONFIG_AMD_NUMA
 	if (amd)
-		nr_nodes = amd_get_nodes(physnodes);
+		amd_get_nodes(physnodes);
 #endif
 	/*
 	 * Basic sanity checking on the physical node map: there may be errors
 	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
 	 * kernel parameter is used.
 	 */
-	for (i = 0; i < nr_nodes; i++) {
+	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (physnodes[i].start == physnodes[i].end)
 			continue;
 		if (physnodes[i].start > end) {
···
 			physnodes[i].start = start;
 		if (physnodes[i].end > end)
 			physnodes[i].end = end;
-	}
-
-	/*
-	 * Remove all nodes that have no memory or were truncated because of the
-	 * limited address range.
-	 */
-	for (i = 0; i < nr_nodes; i++) {
-		if (physnodes[i].start == physnodes[i].end)
-			continue;
-		physnodes[ret].start = physnodes[i].start;
-		physnodes[ret].end = physnodes[i].end;
 		ret++;
 	}
···
 		ret = 1;
 	}
 	return ret;
+}
+
+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+	int i;
+
+	BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+	if (amd)
+		amd_fake_nodes(nodes, nr_nodes);
+#endif
+	if (!acpi && !amd)
+		for (i = 0; i < nr_cpu_ids; i++)
+			numa_set_node(i, 0);
 }
···
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  * to max_addr.  The return value is the number of nodes allocated.
  */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
-					 int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 size;
···
 		return -1;
 	}
 
-	for (i = 0; i < nr_phys_nodes; i++)
+	for (i = 0; i < MAX_NUMNODES; i++)
 		if (physnodes[i].start != physnodes[i].end)
 			node_set(i, physnode_mask);
···
 {
 	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_phys_nodes;
 	int num_nodes;
 	int i;
 
-	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
 	/*
 	 * If the numa=fake command-line contains a 'M' or 'G', it represents
 	 * the fixed node size.  Otherwise, if it is just a single number N,
···
 		unsigned long n;
 
 		n = simple_strtoul(cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+		num_nodes = split_nodes_interleave(addr, max_addr, n);
 	}
 
 	if (num_nodes < 0)
···
 			nodes[i].end >> PAGE_SHIFT);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
-	acpi_fake_nodes(nodes, num_nodes);
+	setup_physnodes(addr, max_addr, acpi, amd);
+	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	return 0;
 }
···
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
 		return;
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
···
 
 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 
+#ifndef CONFIG_NUMA_EMU
 void __cpuinit numa_add_cpu(int cpu)
 {
 	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
···
 {
 	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 }
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+	unsigned long addr;
+	u16 apicid;
+	int physnid;
+	int nid = NUMA_NO_NODE;
+
+	apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	if (apicid != BAD_APICID)
+		nid = apicid_to_node[apicid];
+	if (nid == NUMA_NO_NODE)
+		nid = early_cpu_to_node(cpu);
+	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+	/*
+	 * Use the starting address of the emulated node to find which physical
+	 * node it is allocated on.
+	 */
+	addr = node_start_pfn(nid) << PAGE_SHIFT;
+	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			break;
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(nid) {
+		addr = node_start_pfn(nid) << PAGE_SHIFT;
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+	}
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	int i;
+
+	for_each_online_node(i)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
 
 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
 {
 	int node = early_cpu_to_node(cpu);
 	struct cpumask *mask;
 	char buf[64];
 
 	mask = node_to_cpumask_map[node];
-	if (mask == NULL) {
-		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
+	if (!mask) {
+		pr_err("node_to_cpumask_map[%i] NULL\n", node);
 		dump_stack();
-		return;
+		return NULL;
 	}
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu",
+		cpu, node, buf);
+	return mask;
+}
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+#ifndef CONFIG_NUMA_EMU
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	struct cpumask *mask;
+
+	mask = debug_cpumask_set_cpu(cpu, enable);
+	if (!mask)
+		return;
 
 	if (enable)
 		cpumask_set_cpu(cpu, mask);
 	else
 		cpumask_clear_cpu(cpu, mask);
-
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
 }
+#else
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	struct cpumask *mask;
+	int i;
+
+	for_each_online_node(i) {
+		unsigned long addr;
+
+		addr = node_start_pfn(i) << PAGE_SHIFT;
+		if (addr < physnodes[node].start ||
+		    addr >= physnodes[node].end)
+			continue;
+		mask = debug_cpumask_set_cpu(cpu, enable);
+		if (!mask)
+			return;
+
+		if (enable)
+			cpumask_set_cpu(cpu, mask);
+		else
+			cpumask_clear_cpu(cpu, mask);
+	}
+}
+#endif /* CONFIG_NUMA_EMU */
 
 void __cpuinit numa_add_cpu(int cpu)
 {
+18 -8
arch/x86/mm/srat_64.c
···
 
 void __init acpi_numa_arch_fixup(void) {}
 
-int __init acpi_get_nodes(struct bootnode *physnodes)
+#ifdef CONFIG_NUMA_EMU
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+			   unsigned long end)
 {
 	int i;
-	int ret = 0;
 
 	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
+		cutoff_node(i, start, end);
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
 	}
-	return ret;
 }
+#endif /* CONFIG_NUMA_EMU */
 
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
···
 {
 	int i, j;
 
-	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
-			 "topology.\n");
 	for (i = 0; i < num_nodes; i++) {
 		int nid, pxm;
···
 			    fake_apicid_to_node[j] == NUMA_NO_NODE)
 				fake_apicid_to_node[j] = i;
 	}
+
+	/*
+	 * If there are apicid-to-node mappings for physical nodes that do not
+	 * have a corresponding emulated node, it should default to a guaranteed
+	 * value.
+	 */
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		if (apicid_to_node[i] != NUMA_NO_NODE &&
+		    fake_apicid_to_node[i] == NUMA_NO_NODE)
+			fake_apicid_to_node[i] = 0;
+
 	for (i = 0; i < num_nodes; i++)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
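The new fallback loop in acpi_fake_nodes() guarantees that any apicid with a physical mapping but no emulated one still resolves to a valid node. A minimal userspace sketch with hypothetical arrays (eight apicids, the last four untouched by the emulated map):

#include <stdio.h>

#define MAX_LOCAL_APIC	8	/* shrunk for the example */
#define NUMA_NO_NODE	(-1)

int main(void)
{
	/* hypothetical mappings: every apicid has a physical node, but
	 * apicids 4..7 were never assigned an emulated node */
	int apicid_to_node[MAX_LOCAL_APIC]	= { 0, 0, 1, 1, 2, 2, 3, 3 };
	int fake_apicid_to_node[MAX_LOCAL_APIC]	= { 0, 1, 2, 3,
						    NUMA_NO_NODE, NUMA_NO_NODE,
						    NUMA_NO_NODE, NUMA_NO_NODE };
	int i;

	/* same fallback as the hunk above: default to node 0 */
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		if (apicid_to_node[i] != NUMA_NO_NODE &&
		    fake_apicid_to_node[i] == NUMA_NO_NODE)
			fake_apicid_to_node[i] = 0;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		printf("apicid %d -> emulated node %d\n",
		       i, fake_apicid_to_node[i]);
	return 0;
}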