x86, numa: Fake node-to-cpumask for NUMA emulation

It's necessary to fake the node-to-cpumask mapping so that an emulated
node ID returns a cpumask that includes all cpus that have affinity to
the memory it represents.

This is a little intrusive because it requires knowledge of the physical
topology of the system. setup_physnodes() gives us that information, but
since NUMA emulation ends up altering the physnodes array, it's necessary
to reset it before cpus are brought online.

Accordingly, the physnodes array is moved out of init.data and into
cpuinit.data since it will be needed on cpuup callbacks.

This works regardless of whether numa=fake is used on the command line,
or the setup of the fake node succeeds or fails. The physnodes array
always contains the physical topology of the machine if CONFIG_NUMA_EMU
is enabled and can be used to setup the correct node-to-cpumask mappings
in all cases since setup_physnodes() is called whenever the array needs
to be repopulated with the correct data.

To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are
rewritten for CONFIG_NUMA_EMU so that we first find the physical node to
which each cpu has local affinity, then iterate through all online nodes
to find the emulated nodes that have local affinity to that physical
node, and then finally map the cpu to each of those emulated nodes.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

authored by David Rientjes and committed by H. Peter Anvin c1c3443c f51bf307

+80 -21
arch/x86/mm/numa_64.c
··· 260 260 #ifdef CONFIG_NUMA_EMU 261 261 /* Numa emulation */ 262 262 static struct bootnode nodes[MAX_NUMNODES] __initdata; 263 - static struct bootnode physnodes[MAX_NUMNODES] __initdata; 263 + static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; 264 264 static char *cmdline __initdata; 265 265 266 266 static int __init setup_physnodes(unsigned long start, unsigned long end, ··· 270 270 int ret = 0; 271 271 int i; 272 272 273 + memset(physnodes, 0, sizeof(physnodes)); 273 274 #ifdef CONFIG_ACPI_NUMA 274 275 if (acpi) 275 276 nr_nodes = acpi_get_nodes(physnodes); ··· 371 370 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 372 371 * to max_addr. The return value is the number of nodes allocated. 373 372 */ 374 - static int __init split_nodes_interleave(u64 addr, u64 max_addr, 375 - int nr_phys_nodes, int nr_nodes) 373 + static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) 376 374 { 377 375 nodemask_t physnode_mask = NODE_MASK_NONE; 378 376 u64 size; ··· 402 402 return -1; 403 403 } 404 404 405 - for (i = 0; i < nr_phys_nodes; i++) 405 + for (i = 0; i < MAX_NUMNODES; i++) 406 406 if (physnodes[i].start != physnodes[i].end) 407 407 node_set(i, physnode_mask); 408 408 ··· 571 571 { 572 572 u64 addr = start_pfn << PAGE_SHIFT; 573 573 u64 max_addr = last_pfn << PAGE_SHIFT; 574 - int num_phys_nodes; 575 574 int num_nodes; 576 575 int i; 577 576 578 - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); 579 577 /* 580 578 * If the numa=fake command-line contains a 'M' or 'G', it represents 581 579 * the fixed node size. 
Otherwise, if it is just a single number N, ··· 588 590 unsigned long n; 589 591 590 592 n = simple_strtoul(cmdline, NULL, 0); 591 - num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); 593 + num_nodes = split_nodes_interleave(addr, max_addr, n); 592 594 } 593 595 594 596 if (num_nodes < 0) ··· 611 613 nodes[i].end >> PAGE_SHIFT); 612 614 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 613 615 } 616 + setup_physnodes(addr, max_addr, acpi, amd); 614 617 fake_physnodes(acpi, amd, num_nodes); 615 618 numa_init_array(); 616 619 return 0; ··· 627 628 nodes_clear(node_online_map); 628 629 629 630 #ifdef CONFIG_NUMA_EMU 631 + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, 632 + acpi, amd); 630 633 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) 631 634 return; 635 + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, 636 + acpi, amd); 632 637 nodes_clear(node_possible_map); 633 638 nodes_clear(node_online_map); 634 639 #endif ··· 788 785 789 786 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 790 787 788 + #ifndef CONFIG_NUMA_EMU 791 789 void __cpuinit numa_add_cpu(int cpu) 792 790 { 793 791 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); ··· 798 794 { 799 795 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 800 796 } 797 + #else 798 + void __cpuinit numa_add_cpu(int cpu) 799 + { 800 + unsigned long addr; 801 + u16 apicid; 802 + int physnid; 803 + int nid = NUMA_NO_NODE; 804 + 805 + apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 806 + if (apicid != BAD_APICID) 807 + nid = apicid_to_node[apicid]; 808 + if (nid == NUMA_NO_NODE) 809 + nid = early_cpu_to_node(cpu); 810 + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 811 + 812 + /* 813 + * Use the starting address of the emulated node to find which physical 814 + * node it is allocated on. 
815 + */ 816 + addr = node_start_pfn(nid) << PAGE_SHIFT; 817 + for (physnid = 0; physnid < MAX_NUMNODES; physnid++) 818 + if (addr >= physnodes[physnid].start && 819 + addr < physnodes[physnid].end) 820 + break; 821 + 822 + /* 823 + * Map the cpu to each emulated node that is allocated on the physical 824 + * node of the cpu's apic id. 825 + */ 826 + for_each_online_node(nid) { 827 + addr = node_start_pfn(nid) << PAGE_SHIFT; 828 + if (addr >= physnodes[physnid].start && 829 + addr < physnodes[physnid].end) 830 + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); 831 + } 832 + } 833 + 834 + void __cpuinit numa_remove_cpu(int cpu) 835 + { 836 + int i; 837 + 838 + for_each_online_node(i) 839 + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 840 + } 841 + #endif /* !CONFIG_NUMA_EMU */ 801 842 802 843 #else /* CONFIG_DEBUG_PER_CPU_MAPS */ 803 844 ··· 854 805 int node = early_cpu_to_node(cpu); 855 806 struct cpumask *mask; 856 807 char buf[64]; 808 + int i; 857 809 858 - mask = node_to_cpumask_map[node]; 859 - if (mask == NULL) { 860 - printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); 861 - dump_stack(); 862 - return; 810 + for_each_online_node(i) { 811 + unsigned long addr; 812 + 813 + addr = node_start_pfn(i) << PAGE_SHIFT; 814 + if (addr < physnodes[node].start || 815 + addr >= physnodes[node].end) 816 + continue; 817 + mask = node_to_cpumask_map[node]; 818 + if (mask == NULL) { 819 + pr_err("node_to_cpumask_map[%i] NULL\n", i); 820 + dump_stack(); 821 + return; 822 + } 823 + 824 + if (enable) 825 + cpumask_set_cpu(cpu, mask); 826 + else 827 + cpumask_clear_cpu(cpu, mask); 828 + 829 + cpulist_scnprintf(buf, sizeof(buf), mask); 830 + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 831 + enable ? 
"numa_add_cpu" : "numa_remove_cpu", 832 + cpu, node, buf); 863 833 } 864 - 865 - if (enable) 866 - cpumask_set_cpu(cpu, mask); 867 - else 868 - cpumask_clear_cpu(cpu, mask); 869 - 870 - cpulist_scnprintf(buf, sizeof(buf), mask); 871 - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 872 - enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); 873 834 } 874 835 875 836 void __cpuinit numa_add_cpu(int cpu)