Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/fake-numa: allow later numa node hotplug

The current fake-NUMA implementation prevents new NUMA nodes from being
hot-plugged later by drivers. A common symptom of this limitation is the "node
<X> was absent from the node_possible_map" message, with an associated warning,
in mm/memory_hotplug.c: add_memory_resource().

This comes from the lack of remapping in both the pxm_to_node_map[] and
node_to_pxm_map[] tables to take fake-NUMA nodes into account, which
triggers collisions with the original, physical-nodes-only mapping that had
been determined from the BIOS tables.

This patch fixes this by doing the necessary node-ids translation in both
pxm_to_node_map[]/node_to_pxm_map[] tables. node_distance[] table has
also been fixed accordingly.


Details:

When trying to use the fake-NUMA feature on our system, where new NUMA nodes
are "hot-plugged" upon driver load, it fails with the following
type of message and a warning with a stack trace:

node 8 was absent from the node_possible_map WARNING: CPU: 61 PID: 4259 at
mm/memory_hotplug.c:1506 add_memory_resource+0x3dc/0x418

This issue prevents the use of the fake-NUMA debug feature with the
system's full configuration, when it has proven to be sometimes extremely
useful for performance testing of multi-tasked, memory-bound applications,
as it enables better isolation of processes/ranks compared to fat NUMA
nodes.

Usual numactl output after the driver has "hot-plugged"/unveiled some
new NUMA nodes, with and without memory:
$ numactl --hardware
available: 9 nodes (0-8)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 0 size: 490037 MB
node 0 free: 484432 MB
node 1 cpus:
node 1 size: 97280 MB
node 1 free: 97279 MB
node 2 cpus:
node 2 size: 0 MB
node 2 free: 0 MB
node 3 cpus:
node 3 size: 0 MB
node 3 free: 0 MB
node 4 cpus:
node 4 size: 0 MB
node 4 free: 0 MB
node 5 cpus:
node 5 size: 0 MB
node 5 free: 0 MB
node 6 cpus:
node 6 size: 0 MB
node 6 free: 0 MB
node 7 cpus:
node 7 size: 0 MB
node 7 free: 0 MB
node 8 cpus:
node 8 size: 0 MB
node 8 free: 0 MB
node distances:
node 0 1 2 3 4 5 6 7 8
0: 10 80 80 80 80 80 80 80 80
1: 80 10 255 255 255 255 255 255 255
2: 80 255 10 255 255 255 255 255 255
3: 80 255 255 10 255 255 255 255 255
4: 80 255 255 255 10 255 255 255 255
5: 80 255 255 255 255 10 255 255 255
6: 80 255 255 255 255 255 10 255 255
7: 80 255 255 255 255 255 255 10 255
8: 80 255 255 255 255 255 255 255 10


With M. Rapoport's recent set of fake-NUMA patches in mm-everything,
and using the numa=fake=4 boot parameter:
$ numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 0 size: 122518 MB
node 0 free: 117141 MB
node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 1 size: 219911 MB
node 1 free: 219751 MB
node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 2 size: 122599 MB
node 2 free: 122541 MB
node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 3 size: 122479 MB
node 3 free: 122408 MB
node distances:
node 0 1 2 3
0: 10 10 10 10
1: 10 10 10 10
2: 10 10 10 10
3: 10 10 10 10


With M. Rapoport's recent set of fake-NUMA patches in mm-everything,
with this patch on top, and using the numa=fake=4 boot parameter:
# numactl --hardware
available: 12 nodes (0-11)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 0 size: 122518 MB
node 0 free: 116429 MB
node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 1 size: 122631 MB
node 1 free: 122576 MB
node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 2 size: 122599 MB
node 2 free: 122544 MB
node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
65 66 67 68 69 70 71
node 3 size: 122479 MB
node 3 free: 122419 MB
node 4 cpus:
node 4 size: 97280 MB
node 4 free: 97279 MB
node 5 cpus:
node 5 size: 0 MB
node 5 free: 0 MB
node 6 cpus:
node 6 size: 0 MB
node 6 free: 0 MB
node 7 cpus:
node 7 size: 0 MB
node 7 free: 0 MB
node 8 cpus:
node 8 size: 0 MB
node 8 free: 0 MB
node 9 cpus:
node 9 size: 0 MB
node 9 free: 0 MB
node 10 cpus:
node 10 size: 0 MB
node 10 free: 0 MB
node 11 cpus:
node 11 size: 0 MB
node 11 free: 0 MB
node distances:
node 0 1 2 3 4 5 6 7 8 9 10 11
0: 10 10 10 10 80 80 80 80 80 80 80 80
1: 10 10 10 10 80 80 80 80 80 80 80 80
2: 10 10 10 10 80 80 80 80 80 80 80 80
3: 10 10 10 10 80 80 80 80 80 80 80 80
4: 80 80 80 80 10 255 255 255 255 255 255 255
5: 80 80 80 80 255 10 255 255 255 255 255 255
6: 80 80 80 80 255 255 10 255 255 255 255 255
7: 80 80 80 80 255 255 255 10 255 255 255 255
8: 80 80 80 80 255 255 255 255 10 255 255 255
9: 80 80 80 80 255 255 255 255 255 10 255 255
10: 80 80 80 80 255 255 255 255 255 255 10 255
11: 80 80 80 80 255 255 255 255 255 255 255 10

Link: https://lkml.kernel.org/r/20250106120659.359610-2-bfaccini@nvidia.com
Signed-off-by: Bruno Faccini <bfaccini@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Bruno Faccini and committed by
Andrew Morton
63db8170 5ec4333b

+133 -8
+86
drivers/acpi/numa/srat.c
··· 81 81 } 82 82 EXPORT_SYMBOL(acpi_map_pxm_to_node); 83 83 84 + #ifdef CONFIG_NUMA_EMU 85 + /* 86 + * Take max_nid - 1 fake-numa nodes into account in both 87 + * pxm_to_node_map()/node_to_pxm_map[] tables. 88 + */ 89 + int __init fix_pxm_node_maps(int max_nid) 90 + { 91 + static int pxm_to_node_map_copy[MAX_PXM_DOMAINS] __initdata 92 + = { [0 ... MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE }; 93 + static int node_to_pxm_map_copy[MAX_NUMNODES] __initdata 94 + = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL }; 95 + int i, j, index = -1, count = 0; 96 + nodemask_t nodes_to_enable; 97 + 98 + if (numa_off || srat_disabled()) 99 + return -1; 100 + 101 + /* find fake nodes PXM mapping */ 102 + for (i = 0; i < MAX_NUMNODES; i++) { 103 + if (node_to_pxm_map[i] != PXM_INVAL) { 104 + for (j = 0; j <= max_nid; j++) { 105 + if ((emu_nid_to_phys[j] == i) && 106 + WARN(node_to_pxm_map_copy[j] != PXM_INVAL, 107 + "Node %d is already binded to PXM %d\n", 108 + j, node_to_pxm_map_copy[j])) 109 + return -1; 110 + if (emu_nid_to_phys[j] == i) { 111 + node_to_pxm_map_copy[j] = 112 + node_to_pxm_map[i]; 113 + if (j > index) 114 + index = j; 115 + count++; 116 + } 117 + } 118 + } 119 + } 120 + if (WARN(index != max_nid, "%d max nid when expected %d\n", 121 + index, max_nid)) 122 + return -1; 123 + 124 + nodes_clear(nodes_to_enable); 125 + 126 + /* map phys nodes not used for fake nodes */ 127 + for (i = 0; i < MAX_NUMNODES; i++) { 128 + if (node_to_pxm_map[i] != PXM_INVAL) { 129 + for (j = 0; j <= max_nid; j++) 130 + if (emu_nid_to_phys[j] == i) 131 + break; 132 + /* fake nodes PXM mapping has been done */ 133 + if (j <= max_nid) 134 + continue; 135 + /* find first hole */ 136 + for (j = 0; 137 + j < MAX_NUMNODES && 138 + node_to_pxm_map_copy[j] != PXM_INVAL; 139 + j++) 140 + ; 141 + if (WARN(j == MAX_NUMNODES, 142 + "Number of nodes exceeds MAX_NUMNODES\n")) 143 + return -1; 144 + node_to_pxm_map_copy[j] = node_to_pxm_map[i]; 145 + node_set(j, nodes_to_enable); 146 + count++; 147 + } 148 + } 149 + 
150 + /* creating reverse mapping in pxm_to_node_map[] */ 151 + for (i = 0; i < MAX_NUMNODES; i++) 152 + if (node_to_pxm_map_copy[i] != PXM_INVAL && 153 + pxm_to_node_map_copy[node_to_pxm_map_copy[i]] == NUMA_NO_NODE) 154 + pxm_to_node_map_copy[node_to_pxm_map_copy[i]] = i; 155 + 156 + /* overwrite with new mapping */ 157 + for (i = 0; i < MAX_NUMNODES; i++) { 158 + node_to_pxm_map[i] = node_to_pxm_map_copy[i]; 159 + pxm_to_node_map[i] = pxm_to_node_map_copy[i]; 160 + } 161 + 162 + /* enable other nodes found in PXM for hotplug */ 163 + nodes_or(numa_nodes_parsed, nodes_to_enable, numa_nodes_parsed); 164 + 165 + pr_debug("found %d total number of nodes\n", count); 166 + return 0; 167 + } 168 + #endif 169 + 84 170 static void __init 85 171 acpi_table_print_srat_entry(struct acpi_subtable_header *header) 86 172 {
+5
include/acpi/acpi_numa.h
··· 17 17 extern int acpi_map_pxm_to_node(int); 18 18 extern unsigned char acpi_srat_revision; 19 19 extern void disable_srat(void); 20 + extern int fix_pxm_node_maps(int max_nid); 20 21 21 22 extern void bad_srat(void); 22 23 extern int srat_disabled(void); 23 24 24 25 #else /* CONFIG_ACPI_NUMA */ 26 + static inline int fix_pxm_node_maps(int max_nid) 27 + { 28 + return 0; 29 + } 25 30 static inline void disable_srat(void) 26 31 { 27 32 }
+3
include/linux/numa_memblks.h
··· 29 29 int __init numa_memblks_init(int (*init_func)(void), 30 30 bool memblock_force_top_down); 31 31 32 + extern int numa_distance_cnt; 33 + 32 34 #ifdef CONFIG_NUMA_EMU 35 + extern int emu_nid_to_phys[MAX_NUMNODES]; 33 36 int numa_emu_cmdline(char *str); 34 37 void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, 35 38 unsigned int nr_emu_nids);
+38 -7
mm/numa_emulation.c
··· 8 8 #include <linux/memblock.h> 9 9 #include <linux/numa_memblks.h> 10 10 #include <asm/numa.h> 11 + #include <acpi/acpi_numa.h> 11 12 12 13 #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 13 14 #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) 14 15 15 - static int emu_nid_to_phys[MAX_NUMNODES]; 16 + int emu_nid_to_phys[MAX_NUMNODES]; 16 17 static char *emu_cmdline __initdata; 17 18 18 19 int __init numa_emu_cmdline(char *str) ··· 380 379 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 381 380 int max_emu_nid, dfl_phys_nid; 382 381 int i, j, ret; 382 + nodemask_t physnode_mask = numa_nodes_parsed; 383 383 384 384 if (!emu_cmdline) 385 385 goto no_emu; ··· 397 395 * split the system RAM into N fake nodes. 398 396 */ 399 397 if (strchr(emu_cmdline, 'U')) { 400 - nodemask_t physnode_mask = numa_nodes_parsed; 401 398 unsigned long n; 402 399 int nid = 0; 403 400 ··· 466 465 */ 467 466 max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); 468 467 469 - /* commit */ 470 - *numa_meminfo = ei; 471 - 472 468 /* Make sure numa_nodes_parsed only contains emulated nodes */ 473 469 nodes_clear(numa_nodes_parsed); 474 470 for (i = 0; i < ARRAY_SIZE(ei.blk); i++) ··· 473 475 ei.blk[i].nid != NUMA_NO_NODE) 474 476 node_set(ei.blk[i].nid, numa_nodes_parsed); 475 477 476 - numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); 478 + /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision 479 + * with faked numa nodes, particularly during later memory hotplug 480 + * handling, and also update numa_nodes_parsed accordingly. 
481 + */ 482 + ret = fix_pxm_node_maps(max_emu_nid); 483 + if (ret < 0) 484 + goto no_emu; 485 + 486 + /* commit */ 487 + *numa_meminfo = ei; 488 + 489 + numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1); 477 490 478 491 /* make sure all emulated nodes are mapped to a physical node */ 479 - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 492 + for (i = 0; i < max_emu_nid + 1; i++) 480 493 if (emu_nid_to_phys[i] == NUMA_NO_NODE) 481 494 emu_nid_to_phys[i] = dfl_phys_nid; 482 495 ··· 510 501 numa_set_distance(i, j, dist); 511 502 } 512 503 } 504 + for (i = 0; i < numa_distance_cnt; i++) { 505 + for (j = 0; j < numa_distance_cnt; j++) { 506 + int physi, physj; 507 + u8 dist; 508 + 509 + /* distance between fake nodes is already ok */ 510 + if (emu_nid_to_phys[i] != NUMA_NO_NODE && 511 + emu_nid_to_phys[j] != NUMA_NO_NODE) 512 + continue; 513 + if (emu_nid_to_phys[i] != NUMA_NO_NODE) 514 + physi = emu_nid_to_phys[i]; 515 + else 516 + physi = i - max_emu_nid; 517 + if (emu_nid_to_phys[j] != NUMA_NO_NODE) 518 + physj = emu_nid_to_phys[j]; 519 + else 520 + physj = j - max_emu_nid; 521 + dist = phys_dist[physi * numa_dist_cnt + physj]; 522 + numa_set_distance(i, j, dist); 523 + } 524 + } 513 525 514 526 /* free the copied physical distance table */ 515 527 memblock_free(phys_dist, phys_size); 516 528 return; 517 529 518 530 no_emu: 531 + numa_nodes_parsed = physnode_mask; 519 532 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ 520 533 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 521 534 emu_nid_to_phys[i] = i;
+1 -1
mm/numa_memblks.c
··· 7 7 #include <linux/numa.h> 8 8 #include <linux/numa_memblks.h> 9 9 10 - static int numa_distance_cnt; 10 + int numa_distance_cnt; 11 11 static u8 *numa_distance; 12 12 13 13 nodemask_t numa_nodes_parsed __initdata;