Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86: Unify cpu/apicid <-> NUMA node mapping between 32 and 64bit

The mapping between cpu/apicid and node is done via
apicid_to_node[] on 64bit and apicid_2_node[] +
apic->x86_32_numa_cpu_node() on 32bit. This difference makes it
difficult to further unify 32 and 64bit NUMA handling.

This patch unifies it by replacing both apicid_to_node[] and
apicid_2_node[] with __apicid_to_node[] array, which is accessed
by two accessors - set_apicid_to_node() and numa_cpu_node(). On
64bit, numa_cpu_node() always consults __apicid_to_node[]
directly while 32bit goes through the
apic->x86_32_numa_cpu_node() method to allow apic
implementations to override it.

srat_detect_node() for AMD CPUs contains a workaround for a
broken NUMA configuration which assumes a relationship between
the APIC ID, the HT node ID and the NUMA topology. Leave it to
access __apicid_to_node[] directly, as mapping through the CPU
might result in an undesirable behavior change. The comment is
reformatted and updated to note the ugliness.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-14-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: David Rientjes <rientjes@google.com>

Authored by Tejun Heo and committed by Ingo Molnar
bbc9e2f4 89e5dc21

+101 -56
-1
arch/x86/include/asm/mpspec.h
··· 25 25 #define MAX_IRQ_SOURCES 256 26 26 27 27 extern unsigned int def_to_bigsmp; 28 - extern u8 apicid_2_node[]; 29 28 30 29 #ifdef CONFIG_X86_NUMAQ 31 30 extern int mp_bus_id_to_node[MAX_MP_BUSSES];
+28
arch/x86/include/asm/numa.h
··· 1 + #ifndef _ASM_X86_NUMA_H 2 + #define _ASM_X86_NUMA_H 3 + 4 + #include <asm/apicdef.h> 5 + 6 + #ifdef CONFIG_NUMA 7 + /* 8 + * __apicid_to_node[] stores the raw mapping between physical apicid and 9 + * node and is used to initialize cpu_to_node mapping. 10 + * 11 + * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus 12 + * should be accessed by the accessors - set_apicid_to_node() and 13 + * numa_cpu_node(). 14 + */ 15 + extern s16 __apicid_to_node[MAX_LOCAL_APIC]; 16 + 17 + static inline void set_apicid_to_node(int apicid, s16 node) 18 + { 19 + __apicid_to_node[apicid] = node; 20 + } 21 + #else /* CONFIG_NUMA */ 22 + static inline void set_apicid_to_node(int apicid, s16 node) 23 + { 24 + } 25 + #endif /* CONFIG_NUMA */ 26 + 1 27 #ifdef CONFIG_X86_32 2 28 # include "numa_32.h" 3 29 #else 4 30 # include "numa_64.h" 5 31 #endif 32 + 33 + #endif /* _ASM_X86_NUMA_H */
+6
arch/x86/include/asm/numa_32.h
··· 6 6 extern int pxm_to_nid(int pxm); 7 7 extern void numa_remove_cpu(int cpu); 8 8 9 + #ifdef CONFIG_NUMA 10 + extern int __cpuinit numa_cpu_node(int apicid); 11 + #else /* CONFIG_NUMA */ 12 + static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } 13 + #endif /* CONFIG_NUMA */ 14 + 9 15 #ifdef CONFIG_HIGHMEM 10 16 extern void set_highmem_pages_init(void); 11 17 #else
+2 -3
arch/x86/include/asm/numa_64.h
··· 2 2 #define _ASM_X86_NUMA_64_H 3 3 4 4 #include <linux/nodemask.h> 5 - #include <asm/apicdef.h> 6 5 7 6 struct bootnode { 8 7 u64 start; ··· 16 17 extern void numa_init_array(void); 17 18 extern int numa_off; 18 19 19 - extern s16 apicid_to_node[MAX_LOCAL_APIC]; 20 - 21 20 extern unsigned long numa_free_all_bootmem(void); 22 21 extern void setup_node_bootmem(int nodeid, unsigned long start, 23 22 unsigned long end); ··· 29 32 #define NODE_MIN_SIZE (4*1024*1024) 30 33 31 34 extern void __init init_cpu_to_node(void); 35 + extern int __cpuinit numa_cpu_node(int cpu); 32 36 extern void __cpuinit numa_set_node(int cpu, int node); 33 37 extern void __cpuinit numa_clear_node(int cpu); 34 38 extern void __cpuinit numa_add_cpu(int cpu); ··· 42 44 #endif /* CONFIG_NUMA_EMU */ 43 45 #else 44 46 static inline void init_cpu_to_node(void) { } 47 + static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } 45 48 static inline void numa_set_node(int cpu, int node) { } 46 49 static inline void numa_clear_node(int cpu) { } 47 50 static inline void numa_add_cpu(int cpu, int node) { }
+1 -2
arch/x86/kernel/acpi/boot.c
··· 589 589 nid = acpi_get_node(handle); 590 590 if (nid == -1 || !node_online(nid)) 591 591 return; 592 + set_apicid_to_node(physid, nid); 592 593 #ifdef CONFIG_X86_64 593 - apicid_to_node[physid] = nid; 594 594 numa_set_node(cpu, nid); 595 595 #else /* CONFIG_X86_32 */ 596 - apicid_2_node[physid] = nid; 597 596 cpu_to_node_map[cpu] = nid; 598 597 #endif 599 598
+1 -1
arch/x86/kernel/apic/apic.c
··· 2026 2026 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 2027 2027 2028 2028 if (apicid != BAD_APICID) 2029 - return apicid_2_node[apicid]; 2029 + return __apicid_to_node[apicid]; 2030 2030 return NUMA_NO_NODE; 2031 2031 #else 2032 2032 return 0;
+30 -17
arch/x86/kernel/cpu/amd.c
··· 234 234 #endif 235 235 236 236 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 237 + /* 238 + * To workaround broken NUMA config. Read the comment in 239 + * srat_detect_node(). 240 + */ 237 241 static int __cpuinit nearby_node(int apicid) 238 242 { 239 243 int i, node; 240 244 241 245 for (i = apicid - 1; i >= 0; i--) { 242 - node = apicid_to_node[i]; 246 + node = __apicid_to_node[i]; 243 247 if (node != NUMA_NO_NODE && node_online(node)) 244 248 return node; 245 249 } 246 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 247 - node = apicid_to_node[i]; 251 + node = __apicid_to_node[i]; 248 252 if (node != NUMA_NO_NODE && node_online(node)) 249 253 return node; 250 254 } ··· 343 339 int node; 344 340 unsigned apicid = c->apicid; 345 341 346 - node = per_cpu(cpu_llc_id, cpu); 342 + node = numa_cpu_node(cpu); 343 + if (node == NUMA_NO_NODE) 344 + node = per_cpu(cpu_llc_id, cpu); 347 345 348 - if (apicid_to_node[apicid] != NUMA_NO_NODE) 349 - node = apicid_to_node[apicid]; 350 346 if (!node_online(node)) { 351 - /* Two possibilities here: 352 - - The CPU is missing memory and no node was created. 353 - In that case try picking one from a nearby CPU 354 - - The APIC IDs differ from the HyperTransport node IDs 355 - which the K8 northbridge parsing fills in. 356 - Assume they are all increased by a constant offset, 357 - but in the same order as the HT nodeids. 358 - If that doesn't result in a usable node fall back to the 359 - path for the previous case. */ 360 - 347 + /* 348 + * Two possibilities here: 349 + * 350 + * - The CPU is missing memory and no node was created. In 351 + * that case try picking one from a nearby CPU. 352 + * 353 + * - The APIC IDs differ from the HyperTransport node IDs 354 + * which the K8 northbridge parsing fills in. Assume 355 + * they are all increased by a constant offset, but in 356 + * the same order as the HT nodeids. If that doesn't 357 + * result in a usable node fall back to the path for the 358 + * previous case. 
359 + * 360 + * This workaround operates directly on the mapping between 361 + * APIC ID and NUMA node, assuming certain relationship 362 + * between APIC ID, HT node ID and NUMA topology. As going 363 + * through CPU mapping may alter the outcome, directly 364 + * access __apicid_to_node[]. 365 + */ 361 366 int ht_nodeid = c->initial_apicid; 362 367 363 368 if (ht_nodeid >= 0 && 364 - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 365 - node = apicid_to_node[ht_nodeid]; 369 + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 370 + node = __apicid_to_node[ht_nodeid]; 366 371 /* Pick a nearby node */ 367 372 if (!node_online(node)) 368 373 node = nearby_node(apicid);
+1 -2
arch/x86/kernel/cpu/intel.c
··· 279 279 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 280 280 unsigned node; 281 281 int cpu = smp_processor_id(); 282 - int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; 283 282 284 283 /* Don't do the funky fallback heuristics the AMD version employs 285 284 for now. */ 286 - node = apicid_to_node[apicid]; 285 + node = numa_cpu_node(cpu); 287 286 if (node == NUMA_NO_NODE || !node_online(node)) { 288 287 /* reuse the value from init_cpu_to_node() */ 289 288 node = cpu_to_node(cpu);
+1 -5
arch/x86/kernel/smpboot.c
··· 71 71 #include <asm/smpboot_hooks.h> 72 72 #include <asm/i8259.h> 73 73 74 - #ifdef CONFIG_X86_32 75 - u8 apicid_2_node[MAX_LOCAL_APIC]; 76 - #endif 77 - 78 74 /* State of each CPU */ 79 75 DEFINE_PER_CPU(int, cpu_state) = { 0 }; 80 76 ··· 166 170 int cpu = smp_processor_id(); 167 171 int node; 168 172 169 - node = apic->x86_32_numa_cpu_node(cpu); 173 + node = numa_cpu_node(cpu); 170 174 if (!node_online(node)) 171 175 node = first_online_node; 172 176
+2 -2
arch/x86/mm/amdtopology_64.c
··· 247 247 __acpi_map_pxm_to_node(nid, i); 248 248 #endif 249 249 } 250 - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); 250 + memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); 251 251 } 252 252 #endif /* CONFIG_NUMA_EMU */ 253 253 ··· 285 285 nodes[i].start >> PAGE_SHIFT, 286 286 nodes[i].end >> PAGE_SHIFT); 287 287 for (j = apicid_base; j < cores + apicid_base; j++) 288 - apicid_to_node[(i << bits) + j] = i; 288 + set_apicid_to_node((i << bits) + j, i); 289 289 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 290 290 } 291 291
+5 -1
arch/x86/mm/numa.c
··· 26 26 early_param("numa", numa_setup); 27 27 28 28 /* 29 - * Which logical CPUs are on which nodes 29 + * apicid, cpu, node mappings 30 30 */ 31 + s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 32 + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 33 + }; 34 + 31 35 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 32 36 EXPORT_SYMBOL(node_to_cpumask_map); 33 37
+6
arch/x86/mm/numa_32.c
··· 110 110 111 111 static unsigned long kva_start_pfn; 112 112 static unsigned long kva_pages; 113 + 114 + int __cpuinit numa_cpu_node(int cpu) 115 + { 116 + return apic->x86_32_numa_cpu_node(cpu); 117 + } 118 + 113 119 /* 114 120 * FLAT - support for basic PC memory model with discontig enabled, essentially 115 121 * a single node with all available processors in it with a flat
+11 -15
arch/x86/mm/numa_64.c
··· 26 26 27 27 struct memnode memnode; 28 28 29 - s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 30 - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 31 - }; 32 - 33 29 static unsigned long __initdata nodemap_addr; 34 30 static unsigned long __initdata nodemap_size; 35 31 ··· 712 716 BUG_ON(cpu_to_apicid == NULL); 713 717 714 718 for_each_possible_cpu(cpu) { 715 - int node; 716 - u16 apicid = cpu_to_apicid[cpu]; 719 + int node = numa_cpu_node(cpu); 717 720 718 - if (apicid == BAD_APICID) 719 - continue; 720 - node = apicid_to_node[apicid]; 721 721 if (node == NUMA_NO_NODE) 722 722 continue; 723 723 if (!node_online(node)) ··· 723 731 } 724 732 #endif 725 733 734 + int __cpuinit numa_cpu_node(int cpu) 735 + { 736 + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 737 + 738 + if (apicid != BAD_APICID) 739 + return __apicid_to_node[apicid]; 740 + return NUMA_NO_NODE; 741 + } 726 742 727 743 void __cpuinit numa_set_node(int cpu, int node) 728 744 { ··· 776 776 void __cpuinit numa_add_cpu(int cpu) 777 777 { 778 778 unsigned long addr; 779 - u16 apicid; 780 - int physnid; 781 - int nid = NUMA_NO_NODE; 779 + int physnid, nid; 782 780 783 - apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 784 - if (apicid != BAD_APICID) 785 - nid = apicid_to_node[apicid]; 781 + nid = numa_cpu_node(cpu); 786 782 if (nid == NUMA_NO_NODE) 787 783 nid = early_cpu_to_node(cpu); 788 784 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+1 -1
arch/x86/mm/srat_32.c
··· 255 255 num_memory_chunks); 256 256 257 257 for (i = 0; i < MAX_LOCAL_APIC; i++) 258 - apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 258 + set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); 259 259 260 260 for (j = 0; j < num_memory_chunks; j++){ 261 261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
+6 -6
arch/x86/mm/srat_64.c
··· 79 79 printk(KERN_ERR "SRAT: SRAT not used.\n"); 80 80 acpi_numa = -1; 81 81 for (i = 0; i < MAX_LOCAL_APIC; i++) 82 - apicid_to_node[i] = NUMA_NO_NODE; 82 + set_apicid_to_node(i, NUMA_NO_NODE); 83 83 for (i = 0; i < MAX_NUMNODES; i++) { 84 84 nodes[i].start = nodes[i].end = 0; 85 85 nodes_add[i].start = nodes_add[i].end = 0; ··· 138 138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); 139 139 return; 140 140 } 141 - apicid_to_node[apic_id] = node; 141 + set_apicid_to_node(apic_id, node); 142 142 node_set(node, cpu_nodes_parsed); 143 143 acpi_numa = 1; 144 144 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", ··· 178 178 return; 179 179 } 180 180 181 - apicid_to_node[apic_id] = node; 181 + set_apicid_to_node(apic_id, node); 182 182 node_set(node, cpu_nodes_parsed); 183 183 acpi_numa = 1; 184 184 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", ··· 521 521 * node, it must now point to the fake node ID. 522 522 */ 523 523 for (j = 0; j < MAX_LOCAL_APIC; j++) 524 - if (apicid_to_node[j] == nid && 524 + if (__apicid_to_node[j] == nid && 525 525 fake_apicid_to_node[j] == NUMA_NO_NODE) 526 526 fake_apicid_to_node[j] = i; 527 527 } ··· 532 532 * value. 533 533 */ 534 534 for (i = 0; i < MAX_LOCAL_APIC; i++) 535 - if (apicid_to_node[i] != NUMA_NO_NODE && 535 + if (__apicid_to_node[i] != NUMA_NO_NODE && 536 536 fake_apicid_to_node[i] == NUMA_NO_NODE) 537 537 fake_apicid_to_node[i] = 0; 538 538 539 539 for (i = 0; i < num_nodes; i++) 540 540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); 541 - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); 541 + memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node)); 542 542 543 543 nodes_clear(nodes_parsed); 544 544 for (i = 0; i < num_nodes; i++)