x86, numa: Fix cpu to node mapping for sparse node ids

NUMA boot code assumes that physical node ids start at 0, but the DIMMs
that an apic id represents may not be reachable. When that is the case,
node 0 never comes online, and cpus never end up appropriately assigned
to a node. The cpumask of all online nodes is then empty, and machines
crash in kernel code that assumes online nodes have valid cpus.

The fix is to map the address ranges of all physical nodes appropriately
and to ensure that the cpu-to-node mapping function checks every possible
node (up to MAX_NUMNODES) for a valid address range, instead of only
checking nodes 0-N, where N is the number of physical nodes.
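
As a rough standalone illustration (not the kernel code itself; the node
layout, sizes, and MAX_NUMNODES value here are invented for the example),
the difference between the two scans:

    #include <stdio.h>

    #define MAX_NUMNODES 8

    struct bootnode { unsigned long start, end; };

    int main(void)
    {
            /* only physical node 2 has memory; node ids are sparse */
            struct bootnode physnodes[MAX_NUMNODES] = {
                    [2] = { 0x0, 0x40000000 },
            };
            int nr_nodes = 1;       /* the "number of physical nodes" */
            int i;

            /* buggy: a dense 0..N scan never reaches node 2 */
            for (i = 0; i < nr_nodes; i++)
                    if (physnodes[i].start != physnodes[i].end)
                            printf("dense scan: node %d\n", i);

            /* fixed: scan every possible id and skip empty slots */
            for (i = 0; i < MAX_NUMNODES; i++)
                    if (physnodes[i].start != physnodes[i].end)
                            printf("sparse scan: node %d\n", i);
            return 0;
    }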

This requires no longer "compressing" the address ranges of nodes in the
physical node map to indices 0-N, but rather leaving each index in
physnodes[] equal to the actual node id of the physical node.
Accordingly, amd_get_nodes() and acpi_get_nodes(), which export that
topology, no longer need to return the number of nodes for callers to
iterate through; all such iterations now run to MAX_NUMNODES.
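
Condensed from the amd_get_nodes() and acpi_get_nodes() hunks below (kernel
context such as nodes_parsed and for_each_node_mask() is assumed, so this is
an excerpt rather than standalone code), the indexing change is:

    /* before: pack entries at indices 0..N-1, losing the physical node id */
    for_each_node_mask(i, nodes_parsed) {
            physnodes[ret].start = nodes[i].start;
            physnodes[ret].end = nodes[i].end;
            ret++;
    }

    /* after: slot index == physical node id; callers skip the slots left
     * empty (start == end) by the earlier memset() of physnodes[] */
    for_each_node_mask(i, nodes_parsed) {
            physnodes[i].start = nodes[i].start;
            physnodes[i].end = nodes[i].end;
    }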

This change also passes the start and end addresses of system RAM (the
end may differ from normal operation when mem= is specified on the
command line) to acpi_get_nodes() before the physnodes[] array is
populated. ACPI-parsed nodes are truncated to fit within the address
range that respects the mem= boundaries, and some physical nodes may even
become unreachable in such cases.
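
The truncation itself is plain interval clamping; a minimal sketch of the
idea (clamp_node() is a hypothetical stand-in for the role cutoff_node()
plays in the patch below, whose body is not part of this diff):

    struct bootnode { unsigned long start, end; };

    /* clamp a node's [start, end) range into the allowed address window;
     * a node squeezed down to start == end is treated as unreachable */
    static void clamp_node(struct bootnode *nd, unsigned long start,
                           unsigned long end)
    {
            if (nd->start < start)
                    nd->start = start;
            if (nd->end > end)
                    nd->end = end;
            if (nd->start > nd->end)        /* wholly outside the window */
                    nd->start = nd->end;
    }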

When NUMA emulation does succeed, any apicid-to-node mappings that exist
for unreachable nodes are given default values so that proximity domains
can still be assigned. This is important for node_distance() to
function as desired.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221702090.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

---
 arch/x86/include/asm/acpi.h   |  3 ++-
 arch/x86/include/asm/amd_nb.h |  2 +-
 arch/x86/mm/amdtopology_64.c  |  9 +++------
 arch/x86/mm/numa_64.c         | 18 +++---------------
 arch/x86/mm/srat_64.c         | 22 ++++++++++++++++------
 5 files changed, 25 insertions(+), 29 deletions(-)

--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,7 +185,8 @@
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
-extern int acpi_get_nodes(struct bootnode *physnodes);
+extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+                           unsigned long end);
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 

--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -14,7 +14,7 @@
 
 #ifdef CONFIG_NUMA_EMU
 extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
-extern int amd_get_nodes(struct bootnode *nodes);
+extern void amd_get_nodes(struct bootnode *nodes);
 #endif
 
 struct amd_northbridge {

--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -187,17 +187,14 @@
         [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-int __init amd_get_nodes(struct bootnode *physnodes)
+void __init amd_get_nodes(struct bootnode *physnodes)
 {
         int i;
-        int ret = 0;
 
         for_each_node_mask(i, nodes_parsed) {
-                physnodes[ret].start = nodes[i].start;
-                physnodes[ret].end = nodes[i].end;
-                ret++;
+                physnodes[i].start = nodes[i].start;
+                physnodes[i].end = nodes[i].end;
         }
-        return ret;
 }
 
 static int __init find_node_by_addr(unsigned long addr)

--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -266,25 +266,24 @@
 static int __init setup_physnodes(unsigned long start, unsigned long end,
                                   int acpi, int amd)
 {
-        int nr_nodes = 0;
         int ret = 0;
         int i;
 
         memset(physnodes, 0, sizeof(physnodes));
 #ifdef CONFIG_ACPI_NUMA
         if (acpi)
-                nr_nodes = acpi_get_nodes(physnodes);
+                acpi_get_nodes(physnodes, start, end);
 #endif
 #ifdef CONFIG_AMD_NUMA
         if (amd)
-                nr_nodes = amd_get_nodes(physnodes);
+                amd_get_nodes(physnodes);
 #endif
         /*
          * Basic sanity checking on the physical node map: there may be errors
          * if the SRAT or AMD code incorrectly reported the topology or the mem=
          * kernel parameter is used.
          */
-        for (i = 0; i < nr_nodes; i++) {
+        for (i = 0; i < MAX_NUMNODES; i++) {
                 if (physnodes[i].start == physnodes[i].end)
                         continue;
                 if (physnodes[i].start > end) {
@@ -298,17 +297,6 @@
                         physnodes[i].start = start;
                 if (physnodes[i].end > end)
                         physnodes[i].end = end;
-        }
-
-        /*
-         * Remove all nodes that have no memory or were truncated because of the
-         * limited address range.
-         */
-        for (i = 0; i < nr_nodes; i++) {
-                if (physnodes[i].start == physnodes[i].end)
-                        continue;
-                physnodes[ret].start = physnodes[i].start;
-                physnodes[ret].end = physnodes[i].end;
                 ret++;
         }
 

--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -340,17 +340,16 @@
 void __init acpi_numa_arch_fixup(void) {}
 
 #ifdef CONFIG_NUMA_EMU
-int __init acpi_get_nodes(struct bootnode *physnodes)
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+                           unsigned long end)
 {
         int i;
-        int ret = 0;
 
         for_each_node_mask(i, nodes_parsed) {
-                physnodes[ret].start = nodes[i].start;
-                physnodes[ret].end = nodes[i].end;
-                ret++;
+                cutoff_node(i, start, end);
+                physnodes[i].start = nodes[i].start;
+                physnodes[i].end = nodes[i].end;
         }
-        return ret;
 }
 #endif /* CONFIG_NUMA_EMU */
 
@@ -515,6 +514,17 @@
                     fake_apicid_to_node[j] == NUMA_NO_NODE)
                         fake_apicid_to_node[j] = i;
         }
+
+        /*
+         * If there are apicid-to-node mappings for physical nodes that do not
+         * have a corresponding emulated node, it should default to a guaranteed
+         * value.
+         */
+        for (i = 0; i < MAX_LOCAL_APIC; i++)
+                if (apicid_to_node[i] != NUMA_NO_NODE &&
+                    fake_apicid_to_node[i] == NUMA_NO_NODE)
+                        fake_apicid_to_node[i] = 0;
+
         for (i = 0; i < num_nodes; i++)
                 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
         memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));