x86, numa: Fake node-to-cpumask for NUMA emulation

It's necessary to fake the node-to-cpumask mapping so that an emulated
node ID returns a cpumask that includes all cpus that have affinity to
the memory it represents.

This is a little intrusive because it requires knowledge of the physical
topology of the system. setup_physnodes() gives us that information, but
since NUMA emulation ends up altering the physnodes array, it's necessary
to reset it before cpus are brought online.

Accordingly, the physnodes array is moved out of init.data and into
cpuinit.data since it will be needed on cpuup callbacks.

This works regardless of whether numa=fake is used on the command line,
or whether the setup of the fake nodes succeeds or fails. The physnodes array
always contains the physical topology of the machine if CONFIG_NUMA_EMU
is enabled and can be used to setup the correct node-to-cpumask mappings
in all cases since setup_physnodes() is called whenever the array needs
to be repopulated with the correct data.

To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are
rewritten for CONFIG_NUMA_EMU so that we first find the physical node to
which each cpu has local affinity, then iterate through all online nodes
to find the emulated nodes that have local affinity to that physical
node, and then finally map the cpu to each of those emulated nodes.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

authored by David Rientjes and committed by H. Peter Anvin c1c3443c f51bf307

+80 -21
arch/x86/mm/numa_64.c
··· 260 #ifdef CONFIG_NUMA_EMU 261 /* Numa emulation */ 262 static struct bootnode nodes[MAX_NUMNODES] __initdata; 263 - static struct bootnode physnodes[MAX_NUMNODES] __initdata; 264 static char *cmdline __initdata; 265 266 static int __init setup_physnodes(unsigned long start, unsigned long end, ··· 270 int ret = 0; 271 int i; 272 273 #ifdef CONFIG_ACPI_NUMA 274 if (acpi) 275 nr_nodes = acpi_get_nodes(physnodes); ··· 371 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 372 * to max_addr. The return value is the number of nodes allocated. 373 */ 374 - static int __init split_nodes_interleave(u64 addr, u64 max_addr, 375 - int nr_phys_nodes, int nr_nodes) 376 { 377 nodemask_t physnode_mask = NODE_MASK_NONE; 378 u64 size; ··· 402 return -1; 403 } 404 405 - for (i = 0; i < nr_phys_nodes; i++) 406 if (physnodes[i].start != physnodes[i].end) 407 node_set(i, physnode_mask); 408 ··· 571 { 572 u64 addr = start_pfn << PAGE_SHIFT; 573 u64 max_addr = last_pfn << PAGE_SHIFT; 574 - int num_phys_nodes; 575 int num_nodes; 576 int i; 577 578 - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); 579 /* 580 * If the numa=fake command-line contains a 'M' or 'G', it represents 581 * the fixed node size. 
Otherwise, if it is just a single number N, ··· 588 unsigned long n; 589 590 n = simple_strtoul(cmdline, NULL, 0); 591 - num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); 592 } 593 594 if (num_nodes < 0) ··· 611 nodes[i].end >> PAGE_SHIFT); 612 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 613 } 614 fake_physnodes(acpi, amd, num_nodes); 615 numa_init_array(); 616 return 0; ··· 627 nodes_clear(node_online_map); 628 629 #ifdef CONFIG_NUMA_EMU 630 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) 631 return; 632 nodes_clear(node_possible_map); 633 nodes_clear(node_online_map); 634 #endif ··· 788 789 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 790 791 void __cpuinit numa_add_cpu(int cpu) 792 { 793 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); ··· 798 { 799 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 800 } 801 802 #else /* CONFIG_DEBUG_PER_CPU_MAPS */ 803 ··· 854 int node = early_cpu_to_node(cpu); 855 struct cpumask *mask; 856 char buf[64]; 857 858 - mask = node_to_cpumask_map[node]; 859 - if (mask == NULL) { 860 - printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); 861 - dump_stack(); 862 - return; 863 } 864 - 865 - if (enable) 866 - cpumask_set_cpu(cpu, mask); 867 - else 868 - cpumask_clear_cpu(cpu, mask); 869 - 870 - cpulist_scnprintf(buf, sizeof(buf), mask); 871 - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 872 - enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); 873 } 874 875 void __cpuinit numa_add_cpu(int cpu)
··· 260 #ifdef CONFIG_NUMA_EMU 261 /* Numa emulation */ 262 static struct bootnode nodes[MAX_NUMNODES] __initdata; 263 + static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; 264 static char *cmdline __initdata; 265 266 static int __init setup_physnodes(unsigned long start, unsigned long end, ··· 270 int ret = 0; 271 int i; 272 273 + memset(physnodes, 0, sizeof(physnodes)); 274 #ifdef CONFIG_ACPI_NUMA 275 if (acpi) 276 nr_nodes = acpi_get_nodes(physnodes); ··· 370 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 371 * to max_addr. The return value is the number of nodes allocated. 372 */ 373 + static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) 374 { 375 nodemask_t physnode_mask = NODE_MASK_NONE; 376 u64 size; ··· 402 return -1; 403 } 404 405 + for (i = 0; i < MAX_NUMNODES; i++) 406 if (physnodes[i].start != physnodes[i].end) 407 node_set(i, physnode_mask); 408 ··· 571 { 572 u64 addr = start_pfn << PAGE_SHIFT; 573 u64 max_addr = last_pfn << PAGE_SHIFT; 574 int num_nodes; 575 int i; 576 577 /* 578 * If the numa=fake command-line contains a 'M' or 'G', it represents 579 * the fixed node size. 
Otherwise, if it is just a single number N, ··· 590 unsigned long n; 591 592 n = simple_strtoul(cmdline, NULL, 0); 593 + num_nodes = split_nodes_interleave(addr, max_addr, n); 594 } 595 596 if (num_nodes < 0) ··· 613 nodes[i].end >> PAGE_SHIFT); 614 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 615 } 616 + setup_physnodes(addr, max_addr, acpi, amd); 617 fake_physnodes(acpi, amd, num_nodes); 618 numa_init_array(); 619 return 0; ··· 628 nodes_clear(node_online_map); 629 630 #ifdef CONFIG_NUMA_EMU 631 + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, 632 + acpi, amd); 633 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) 634 return; 635 + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, 636 + acpi, amd); 637 nodes_clear(node_possible_map); 638 nodes_clear(node_online_map); 639 #endif ··· 785 786 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 787 788 + #ifndef CONFIG_NUMA_EMU 789 void __cpuinit numa_add_cpu(int cpu) 790 { 791 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); ··· 794 { 795 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 796 } 797 + #else 798 + void __cpuinit numa_add_cpu(int cpu) 799 + { 800 + unsigned long addr; 801 + u16 apicid; 802 + int physnid; 803 + int nid = NUMA_NO_NODE; 804 + 805 + apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 806 + if (apicid != BAD_APICID) 807 + nid = apicid_to_node[apicid]; 808 + if (nid == NUMA_NO_NODE) 809 + nid = early_cpu_to_node(cpu); 810 + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 811 + 812 + /* 813 + * Use the starting address of the emulated node to find which physical 814 + * node it is allocated on. 
815 + */ 816 + addr = node_start_pfn(nid) << PAGE_SHIFT; 817 + for (physnid = 0; physnid < MAX_NUMNODES; physnid++) 818 + if (addr >= physnodes[physnid].start && 819 + addr < physnodes[physnid].end) 820 + break; 821 + 822 + /* 823 + * Map the cpu to each emulated node that is allocated on the physical 824 + * node of the cpu's apic id. 825 + */ 826 + for_each_online_node(nid) { 827 + addr = node_start_pfn(nid) << PAGE_SHIFT; 828 + if (addr >= physnodes[physnid].start && 829 + addr < physnodes[physnid].end) 830 + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); 831 + } 832 + } 833 + 834 + void __cpuinit numa_remove_cpu(int cpu) 835 + { 836 + int i; 837 + 838 + for_each_online_node(i) 839 + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 840 + } 841 + #endif /* !CONFIG_NUMA_EMU */ 842 843 #else /* CONFIG_DEBUG_PER_CPU_MAPS */ 844 ··· 805 int node = early_cpu_to_node(cpu); 806 struct cpumask *mask; 807 char buf[64]; 808 + int i; 809 810 + for_each_online_node(i) { 811 + unsigned long addr; 812 + 813 + addr = node_start_pfn(i) << PAGE_SHIFT; 814 + if (addr < physnodes[node].start || 815 + addr >= physnodes[node].end) 816 + continue; 817 + mask = node_to_cpumask_map[node]; 818 + if (mask == NULL) { 819 + pr_err("node_to_cpumask_map[%i] NULL\n", i); 820 + dump_stack(); 821 + return; 822 + } 823 + 824 + if (enable) 825 + cpumask_set_cpu(cpu, mask); 826 + else 827 + cpumask_clear_cpu(cpu, mask); 828 + 829 + cpulist_scnprintf(buf, sizeof(buf), mask); 830 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 831 + enable ? "numa_add_cpu" : "numa_remove_cpu", 832 + cpu, node, buf); 833 } 834 } 835 836 void __cpuinit numa_add_cpu(int cpu)