Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86_64: fake pxm-to-node mapping for fake numa

For NUMA emulation, our SLIT should represent the true NUMA topology of the
system but our proximity domain to node ID mapping needs to reflect the
emulated state.

When NUMA emulation has successfully set up fake nodes on the system, a new
function, acpi_fake_nodes(), is called. This function determines the proximity
domain (_PXM) for each true node found on the system. It then finds which
emulated nodes have been allocated on this true node as determined by its
starting address. The node ID to PXM mapping is changed so that each fake
node ID points to the PXM of the true node that it is located on.

If the machine failed to register a SLIT, then we assume there is no special
requirement for emulated node affinity so we use the default LOCAL_DISTANCE,
which is newly exported to this code, as our measurement if the emulated nodes
appear in the same PXM. Otherwise, we use REMOTE_DISTANCE.

PXM_INVAL and NID_INVAL are also exported to the ACPI header file so that we
can compare node_to_pxm() results in generic code (in this case, the SRAT
code).

Cc: Len Brown <lenb@kernel.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

David Rientjes and committed by
Linus Torvalds
3484d798 3af044e0

+96 -7
+1
arch/x86_64/mm/numa.c
··· 484 484 nodes[i].end >> PAGE_SHIFT); 485 485 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 486 486 } 487 + acpi_fake_nodes(nodes, num_nodes); 487 488 numa_init_array(); 488 489 return 0; 489 490 }
+73 -3
arch/x86_64/mm/srat.c
/*
 * Other changes to arch/x86_64/mm/srat.c shown in this diff.  Those hunks are
 * only partially visible, so the changed lines are summarized here:
 *
 *   - "static int nodes_cover_memory(void)" becomes
 *     "static int __init nodes_cover_memory(const struct bootnode *nodes)",
 *     and its existing caller changes from "nodes_cover_memory()" to
 *     "nodes_cover_memory(nodes)" (still calling bad_srat() and returning -1
 *     on failure).
 *   - In the distance lookup, the no-SLIT path (!acpi_slit) changes from
 *     "return a == b ? LOCAL_DISTANCE : REMOTE_DISTANCE;" to
 *     "return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
 *     REMOTE_DISTANCE;".
 *
 * The block below is inserted immediately before srat_reserve_add_area().
 */

#ifdef CONFIG_NUMA_EMU
/*
 * Find the real node whose [start, end) range contains @addr.
 *
 * Returns the node ID on a match, or NUMA_NO_NODE if no node in
 * nodes_parsed covers @addr.
 */
static int __init find_node_by_addr(unsigned long addr)
{
	int ret = NUMA_NO_NODE;
	int i;

	for_each_node_mask(i, nodes_parsed) {
		/*
		 * Find the real node that this emulated node appears on.  For
		 * the sake of simplicity, the caller only passes an emulated
		 * node's starting address, so that address alone decides
		 * which real node the emulated node belongs to.
		 */
		if (addr >= nodes[i].start && addr < nodes[i].end) {
			ret = i;
			break;
		}
	}
	/*
	 * Return ret, not the loop cursor: when nothing matches, i is left
	 * at the iterator's terminal value (MAX_NUMNODES with the kernel's
	 * nodemask helpers), which callers would not recognize as
	 * NUMA_NO_NODE.
	 */
	return ret;
}

/*
 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment.  For each emulated node, we find which real node it appears on
 * and create PXM to NID mappings for those fake nodes which mirror that
 * locality.  SLIT will now represent the correct distances between emulated
 * nodes as a result of the real topology.
 *
 * @fake_nodes: address ranges of the emulated nodes, indexed by fake node ID
 * @num_nodes:  number of entries in @fake_nodes
 *
 * Side effects: rewrites the PXM <-> node mappings for node IDs
 * 0..num_nodes-1 and replaces nodes_parsed with the set of non-empty
 * emulated nodes.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
	int i;
	int fake_node_to_pxm_map[MAX_NUMNODES] = {
		[0 ... MAX_NUMNODES-1] = PXM_INVAL
	};

	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
			 "topology.\n");

	/*
	 * Pass 1: look up the PXM of the real node under each fake node.
	 * The results go into a scratch table first; presumably this is
	 * because the second pass overwrites the node -> PXM entries that
	 * node_to_pxm() reads here.
	 */
	for (i = 0; i < num_nodes; i++) {
		int nid, pxm;

		nid = find_node_by_addr(fake_nodes[i].start);
		if (nid == NUMA_NO_NODE)
			continue;
		pxm = node_to_pxm(nid);
		if (pxm == PXM_INVAL)
			continue;
		fake_node_to_pxm_map[i] = pxm;
	}

	/*
	 * Pass 2: install the new mappings.  Entries that are still
	 * PXM_INVAL are skipped: PXM_INVAL is -1 and
	 * __acpi_map_pxm_to_node() uses pxm as an array index.  Such a fake
	 * node keeps whatever mapping its node ID had before.
	 */
	for (i = 0; i < num_nodes; i++) {
		if (fake_node_to_pxm_map[i] == PXM_INVAL)
			continue;
		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
	}

	/* nodes_parsed now describes the emulated nodes, not the real ones. */
	nodes_clear(nodes_parsed);
	for (i = 0; i < num_nodes; i++)
		if (fake_nodes[i].start != fake_nodes[i].end)
			node_set(i, nodes_parsed);
	WARN_ON(!nodes_cover_memory(fake_nodes));
}

/*
 * Used when no SLIT is available: two nodes count as local to each other
 * when they map to the same proximity domain.
 */
static int null_slit_node_compare(int a, int b)
{
	return node_to_pxm(a) == node_to_pxm(b);
}
#else
/*
 * Used when no SLIT is available: without NUMA emulation a node is local
 * only to itself.
 */
static int null_slit_node_compare(int a, int b)
{
	return a == b;
}
#endif /* CONFIG_NUMA_EMU */
+7 -4
drivers/acpi/numa.c
··· 36 36 ACPI_MODULE_NAME("numa"); 37 37 38 38 static nodemask_t nodes_found_map = NODE_MASK_NONE; 39 - #define PXM_INVAL -1 40 - #define NID_INVAL -1 41 39 42 40 /* maps to convert between proximity domain and logical node ID */ 43 41 static int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS] ··· 57 59 return node_to_pxm_map[node]; 58 60 } 59 61 62 + void __acpi_map_pxm_to_node(int pxm, int node) 63 + { 64 + pxm_to_node_map[pxm] = node; 65 + node_to_pxm_map[node] = pxm; 66 + } 67 + 60 68 int acpi_map_pxm_to_node(int pxm) 61 69 { 62 70 int node = pxm_to_node_map[pxm]; ··· 71 67 if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) 72 68 return NID_INVAL; 73 69 node = first_unset_node(nodes_found_map); 74 - pxm_to_node_map[pxm] = node; 75 - node_to_pxm_map[node] = pxm; 70 + __acpi_map_pxm_to_node(pxm, node); 76 71 node_set(node, nodes_found_map); 77 72 } 78 73
+1
include/acpi/acpi_numa.h
··· 13 13 14 14 extern int pxm_to_node(int); 15 15 extern int node_to_pxm(int); 16 + extern void __acpi_map_pxm_to_node(int, int); 16 17 extern int acpi_map_pxm_to_node(int); 17 18 extern void __cpuinit acpi_unmap_pxm_to_node(int); 18 19
+11
include/asm-x86_64/acpi.h
··· 29 29 #ifdef __KERNEL__ 30 30 31 31 #include <acpi/pdc_intel.h> 32 + #include <asm/numa.h> 32 33 33 34 #define COMPILER_DEPENDENT_INT64 long long 34 35 #define COMPILER_DEPENDENT_UINT64 unsigned long long ··· 141 140 142 141 extern int acpi_skip_timer_override; 143 142 extern int acpi_use_timer_override; 143 + 144 + #ifdef CONFIG_ACPI_NUMA 145 + extern void __init acpi_fake_nodes(const struct bootnode *fake_nodes, 146 + int num_nodes); 147 + #else 148 + static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, 149 + int num_nodes) 150 + { 151 + } 152 + #endif 144 153 145 154 #endif /*__KERNEL__*/ 146 155
+3
include/linux/acpi.h
··· 231 231 232 232 extern int pnpacpi_disabled; 233 233 234 + #define PXM_INVAL (-1) 235 + #define NID_INVAL (-1) 236 + 234 237 #else /* CONFIG_ACPI */ 235 238 236 239 static inline int acpi_boot_init(void)