x86, numa: Fake apicid and pxm mappings for NUMA emulation

This patch adds the equivalent of acpi_fake_nodes() for AMD Northbridge
platforms. The goal is to fake the apicid-to-node mappings for NUMA
emulation so the physical topology of the machine is correctly maintained
within the kernel.
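
As a rough illustration, here is a userspace toy model of that mapping
(not kernel code; MAX_APICID, CORES_PER_NODE and the phys[] layout are
invented for the example). Each emulated node inherits the apicids of
the physical node it was carved from, and only the first emulated node
on a given physical node claims them; the in-kernel version is
amd_fake_nodes() in the amdtopology_64.c hunk below.

#include <stdio.h>

#define MAX_APICID	16
#define CORES_PER_NODE	4	/* stand-in for 1 << x86_coreid_bits */

static int fake_apicid_to_node[MAX_APICID];

/* phys[i] is the physical node that emulated node i was carved from. */
static void fake_mapping(const int *phys, int nr_emulated)
{
	int i, j;

	for (i = 0; i < nr_emulated; i++) {
		int base = phys[i] * CORES_PER_NODE;

		/* Only the first emulated node on a physical node wins. */
		if (fake_apicid_to_node[base] != -1)
			continue;
		for (j = 0; j < CORES_PER_NODE; j++)
			fake_apicid_to_node[base + j] = i;
	}
}

int main(void)
{
	int phys[4] = { 0, 0, 1, 1 };	/* 4 emulated nodes on 2 physical */
	int i;

	for (i = 0; i < MAX_APICID; i++)
		fake_apicid_to_node[i] = -1;
	fake_mapping(phys, 4);
	for (i = 0; i < CORES_PER_NODE * 2; i++)
		printf("apicid %2d -> emulated node %d\n",
		       i, fake_apicid_to_node[i]);
	return 0;
}

Compiled with any C compiler, it prints apicids 0-3 mapping to emulated
node 0 and apicids 4-7 to emulated node 2, mirroring the first-wins
behaviour of the faked table.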

This change also fakes proximity domains for both ACPI and k8 code so the
physical distance between emulated nodes is maintained via
node_distance(). This exports the correct distances via
/sys/devices/system/node/.../distance based on the underlying topology.
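
Those distances can be read back from userspace to verify the emulated
topology; a minimal sketch, assuming nodes are numbered contiguously
from zero (the file path comes from the sysfs layout above, everything
else is illustrative):

#include <stdio.h>

int main(void)
{
	char path[64], buf[256];
	int nid;

	/* Walk the per-node distance files exported under sysfs. */
	for (nid = 0; nid < 64; nid++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node%d/distance", nid);
		f = fopen(path, "r");
		if (!f)
			break;	/* assume nodes are numbered contiguously */
		if (fgets(buf, sizeof(buf), f))
			printf("node%d: %s", nid, buf);
		fclose(f);
	}
	return 0;
}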

A new helper function, fake_physnodes(), is introduced to invoke the
correct NUMA code to fake these two mappings based on the system type.
If there is no underlying NUMA configuration, all cpus are mapped to
node 0 for local distance.

Since acpi_fake_nodes() is no longer called without CONFIG_ACPI_NUMA, its
dummy definition can be removed from the header file for such a
configuration.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701360.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

5 files changed, 95 insertions(+), 24 deletions(-)

arch/x86/include/asm/acpi.h (-5)
···
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
				   int num_nodes);
 #endif
-#else
-static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
-				   int num_nodes)
-{
-}
 #endif /* CONFIG_ACPI_NUMA */

 #define acpi_unlazy_tlb(x)	leave_mm(x)

arch/x86/include/asm/amd_nb.h (+1)
···
 extern int amd_scan_nodes(void);

 #ifdef CONFIG_NUMA_EMU
+extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
 extern int amd_get_nodes(struct bootnode *nodes);
 #endif


arch/x86/mm/amdtopology_64.c (+75 -16)
···
 #include <asm/amd_nb.h>

 static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
 static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;

 static __init int find_northbridge(void)
···
 	early_init_lapic_mapping();
 }

-#ifdef CONFIG_NUMA_EMU
-int __init amd_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-	int ret = 0;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
-	}
-	return ret;
-}
-#endif /* CONFIG_NUMA_EMU */
-
 int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long start = PFN_PHYS(start_pfn);
···
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);

-		nodeid = limit & 7;
+		nodeids[i] = nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
 				pr_info("Skipping disabled node %d\n", i);
···
 		return -1;
 	return 0;
 }
+
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+int __init amd_get_nodes(struct bootnode *physnodes)
+{
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		if (addr >= nodes[i].start && addr < nodes[i].end) {
+			ret = i;
+			break;
+		}
+	return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment. For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality. node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+	unsigned int bits;
+	unsigned int cores;
+	unsigned int apicid_base = 0;
+	int i;
+
+	bits = boot_cpu_data.x86_coreid_bits;
+	cores = 1 << bits;
+	early_get_boot_cpu_id();
+	if (boot_cpu_physical_apicid > 0)
+		apicid_base = boot_cpu_physical_apicid;
+
+	for (i = 0; i < nr_nodes; i++) {
+		int index;
+		int nid;
+		int j;
+
+		nid = find_node_by_addr(nodes[i].start);
+		if (nid == NUMA_NO_NODE)
+			continue;
+
+		index = nodeids[nid] << bits;
+		if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+			for (j = apicid_base; j < cores + apicid_base; j++)
+				fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+		__acpi_map_pxm_to_node(nid, i);
+#endif
+	}
+	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */

 int __init amd_scan_nodes(void)
 {

arch/x86/mm/numa_64.c (+19 -1)
···
 	return ret;
 }

+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+	int i;
+
+	BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+	if (amd)
+		amd_fake_nodes(nodes, nr_nodes);
+#endif
+	if (!acpi && !amd)
+		for (i = 0; i < nr_cpu_ids; i++)
+			numa_set_node(i, 0);
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
···
 					nodes[i].end >> PAGE_SHIFT);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
-	acpi_fake_nodes(nodes, num_nodes);
+	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	return 0;
 }

arch/x86/mm/srat_64.c (-2)
···
 {
 	int i, j;

-	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
-			 "topology.\n");
 	for (i = 0; i < num_nodes; i++) {
 		int nid, pxm;
