Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask()

Leon [1] and Vinicius [2] noted a topology_span_sane() warning during
their testing starting from v6.16-rc1. The debugging that followed pointed to
the tl->mask() for the NODE domain being incorrectly resolved to that of
the highest NUMA domain.

tl->mask() for NODE is set to the sd_numa_mask() which depends on the
global "sched_domains_curr_level" hack. "sched_domains_curr_level" is
set to the "tl->numa_level" during tl traversal in build_sched_domains()
calling sd_init() but was not reset before topology_span_sane().

Since "tl->numa_level" still reflected the old value from
build_sched_domains(), topology_span_sane() for the NODE domain trips
when the span of the last NUMA domain overlaps.

Instead of replicating the "sched_domains_curr_level" hack, get rid of
it entirely and instead, pass the entire "sched_domain_topology_level"
object to tl->cpumask() function to prevent such mishap in the future.

sd_numa_mask() now directly references "tl->numa_level" instead of
relying on the global "sched_domains_curr_level" hack to index into
sched_domains_numa_masks[].

The original warning was reproducible on the following NUMA topology
reported by Leon:

$ sudo numactl -H
available: 5 nodes (0-4)
node 0 cpus: 0 1
node 0 size: 2927 MB
node 0 free: 1603 MB
node 1 cpus: 2 3
node 1 size: 3023 MB
node 1 free: 3008 MB
node 2 cpus: 4 5
node 2 size: 3023 MB
node 2 free: 3007 MB
node 3 cpus: 6 7
node 3 size: 3023 MB
node 3 free: 3002 MB
node 4 cpus: 8 9
node 4 size: 3022 MB
node 4 free: 2718 MB
node distances:
node 0 1 2 3 4
0: 10 39 38 37 36
1: 39 10 38 37 36
2: 38 38 10 37 36
3: 37 37 37 10 36
4: 36 36 36 36 10

The above topology can be mimicked using the following QEMU cmd that was
used to reproduce the warning and test the fix:

sudo qemu-system-x86_64 -enable-kvm -cpu host \
-m 20G -smp cpus=10,sockets=10 -machine q35 \
-object memory-backend-ram,size=4G,id=m0 \
-object memory-backend-ram,size=4G,id=m1 \
-object memory-backend-ram,size=4G,id=m2 \
-object memory-backend-ram,size=4G,id=m3 \
-object memory-backend-ram,size=4G,id=m4 \
-numa node,cpus=0-1,memdev=m0,nodeid=0 \
-numa node,cpus=2-3,memdev=m1,nodeid=1 \
-numa node,cpus=4-5,memdev=m2,nodeid=2 \
-numa node,cpus=6-7,memdev=m3,nodeid=3 \
-numa node,cpus=8-9,memdev=m4,nodeid=4 \
-numa dist,src=0,dst=1,val=39 \
-numa dist,src=0,dst=2,val=38 \
-numa dist,src=0,dst=3,val=37 \
-numa dist,src=0,dst=4,val=36 \
-numa dist,src=1,dst=0,val=39 \
-numa dist,src=1,dst=2,val=38 \
-numa dist,src=1,dst=3,val=37 \
-numa dist,src=1,dst=4,val=36 \
-numa dist,src=2,dst=0,val=38 \
-numa dist,src=2,dst=1,val=38 \
-numa dist,src=2,dst=3,val=37 \
-numa dist,src=2,dst=4,val=36 \
-numa dist,src=3,dst=0,val=37 \
-numa dist,src=3,dst=1,val=37 \
-numa dist,src=3,dst=2,val=37 \
-numa dist,src=3,dst=4,val=36 \
-numa dist,src=4,dst=0,val=36 \
-numa dist,src=4,dst=1,val=36 \
-numa dist,src=4,dst=2,val=36 \
-numa dist,src=4,dst=3,val=36 \
...

[ prateek: Moved common functions to include/linux/sched/topology.h,
reuse the common bits for s390 and ppc, commit message ]

Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1]
Fixes: ccf74128d66c ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da84cd, f55dac1dafb3
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Tested-by: Valentin Schneider <vschneid@redhat.com> # x86
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com> # powerpc
Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2]

+66 -53
+4
arch/powerpc/Kconfig
··· 971 971 when dealing with POWER5 cpus at a cost of slightly increased 972 972 overhead in some places. If unsure say N here. 973 973 974 + config SCHED_MC 975 + def_bool y 976 + depends on SMP 977 + 974 978 config PPC_DENORMALISATION 975 979 bool "PowerPC denormalisation exception handling" 976 980 depends on PPC_BOOK3S_64
+2
arch/powerpc/include/asm/topology.h
··· 131 131 #ifdef CONFIG_SMP 132 132 #include <asm/cputable.h> 133 133 134 + struct cpumask *cpu_coregroup_mask(int cpu); 135 + 134 136 #ifdef CONFIG_PPC64 135 137 #include <asm/smp.h> 136 138
+11 -16
arch/powerpc/kernel/smp.c
··· 1028 1028 * We can't just pass cpu_l2_cache_mask() directly because 1029 1029 * returns a non-const pointer and the compiler barfs on that. 1030 1030 */ 1031 - static const struct cpumask *shared_cache_mask(int cpu) 1031 + static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) 1032 1032 { 1033 1033 return per_cpu(cpu_l2_cache_map, cpu); 1034 1034 } 1035 1035 1036 1036 #ifdef CONFIG_SCHED_SMT 1037 - static const struct cpumask *smallcore_smt_mask(int cpu) 1037 + static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) 1038 1038 { 1039 1039 return cpu_smallcore_mask(cpu); 1040 1040 } 1041 1041 #endif 1042 1042 1043 - static struct cpumask *cpu_coregroup_mask(int cpu) 1043 + struct cpumask *cpu_coregroup_mask(int cpu) 1044 1044 { 1045 1045 return per_cpu(cpu_coregroup_map, cpu); 1046 1046 } ··· 1052 1052 return 0; 1053 1053 1054 1054 return coregroup_enabled; 1055 - } 1056 - 1057 - static const struct cpumask *cpu_mc_mask(int cpu) 1058 - { 1059 - return cpu_coregroup_mask(cpu); 1060 1055 } 1061 1056 1062 1057 static int __init init_big_cores(void) ··· 1443 1448 return false; 1444 1449 } 1445 1450 1446 - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); 1451 + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); 1447 1452 1448 1453 /* Update l2-cache mask with all the CPUs that are part of submask */ 1449 1454 or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); ··· 1533 1538 return; 1534 1539 } 1535 1540 1536 - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); 1541 + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); 1537 1542 1538 1543 /* Update coregroup mask with all the CPUs that are part of submask */ 1539 1544 or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); ··· 1596 1601 1597 1602 /* If chip_id is -1; limit the cpu_core_mask to within PKG */ 1598 1603 if (chip_id == -1) 1599 - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); 1604 + cpumask_and(mask, mask, cpu_node_mask(cpu)); 1600 1605 1601 1606 for_each_cpu(i, mask) { 1602 1607 if (chip_id == cpu_to_chip_id(i)) { ··· 1696 1701 if (has_big_cores) { 1697 1702 pr_info("Big cores detected but using small core scheduling\n"); 1698 1703 powerpc_topology[i++] = 1699 - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); 1704 + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); 1700 1705 } else { 1701 - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); 1706 + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); 1702 1707 } 1703 1708 #endif 1704 1709 if (shared_caches) { 1705 1710 powerpc_topology[i++] = 1706 - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); 1711 + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); 1707 1712 } 1708 1713 1709 1714 if (has_coregroup_support()) { 1710 1715 powerpc_topology[i++] = 1711 - SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); 1716 + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); 1712 1717 } 1713 1718 1714 - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); 1719 + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); 1715 1720 1716 1721 /* There must be one trailing NULL entry left. */ 1717 1722 BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
+7 -13
arch/s390/kernel/topology.c
··· 509 509 return rc; 510 510 } 511 511 512 - static const struct cpumask *cpu_thread_mask(int cpu) 513 - { 514 - return &cpu_topology[cpu].thread_mask; 515 - } 516 - 517 - 518 512 const struct cpumask *cpu_coregroup_mask(int cpu) 519 513 { 520 514 return &cpu_topology[cpu].core_mask; 521 515 } 522 516 523 - static const struct cpumask *cpu_book_mask(int cpu) 517 + static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) 524 518 { 525 519 return &cpu_topology[cpu].book_mask; 526 520 } 527 521 528 - static const struct cpumask *cpu_drawer_mask(int cpu) 522 + static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) 529 523 { 530 524 return &cpu_topology[cpu].drawer_mask; 531 525 } 532 526 533 527 static struct sched_domain_topology_level s390_topology[] = { 534 - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), 535 - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), 536 - SDTL_INIT(cpu_book_mask, NULL, BOOK), 537 - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), 538 - SDTL_INIT(cpu_cpu_mask, NULL, PKG), 528 + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), 529 + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), 530 + SDTL_INIT(tl_book_mask, NULL, BOOK), 531 + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), 532 + SDTL_INIT(tl_pkg_mask, NULL, PKG), 539 533 { NULL, }, 540 534 }; 541 535
+4 -4
arch/x86/kernel/smpboot.c
··· 479 479 static bool x86_has_numa_in_package; 480 480 481 481 static struct sched_domain_topology_level x86_topology[] = { 482 - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), 482 + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), 483 483 #ifdef CONFIG_SCHED_CLUSTER 484 - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), 484 + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), 485 485 #endif 486 486 #ifdef CONFIG_SCHED_MC 487 - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), 487 + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), 488 488 #endif 489 - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), 489 + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), 490 490 { NULL }, 491 491 }; 492 492
+27 -1
include/linux/sched/topology.h
··· 30 30 }; 31 31 extern const struct sd_flag_debug sd_flag_debug[]; 32 32 33 + struct sched_domain_topology_level; 34 + 33 35 #ifdef CONFIG_SCHED_SMT 34 36 static inline int cpu_smt_flags(void) 35 37 { 36 38 return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; 39 + } 40 + 41 + static inline const 42 + struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) 43 + { 44 + return cpu_smt_mask(cpu); 37 45 } 38 46 #endif 39 47 ··· 50 42 { 51 43 return SD_CLUSTER | SD_SHARE_LLC; 52 44 } 45 + 46 + static inline const 47 + struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) 48 + { 49 + return cpu_clustergroup_mask(cpu); 50 + } 53 51 #endif 54 52 55 53 #ifdef CONFIG_SCHED_MC ··· 63 49 { 64 50 return SD_SHARE_LLC; 65 51 } 52 + 53 + static inline const 54 + struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) 55 + { 56 + return cpu_coregroup_mask(cpu); 57 + } 66 58 #endif 59 + 60 + static inline const 61 + struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) 62 + { 63 + return cpu_node_mask(cpu); 64 + } 67 65 68 66 #ifdef CONFIG_NUMA 69 67 static inline int cpu_numa_flags(void) ··· 198 172 bool cpus_share_cache(int this_cpu, int that_cpu); 199 173 bool cpus_share_resources(int this_cpu, int that_cpu); 200 174 201 - typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 175 + typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); 202 176 typedef int (*sched_domain_flags_f)(void); 203 177 204 178 struct sd_data {
+1 -1
include/linux/topology.h
··· 260 260 261 261 #endif 262 262 263 - static inline const struct cpumask *cpu_cpu_mask(int cpu) 263 + static inline const struct cpumask *cpu_node_mask(int cpu) 264 264 { 265 265 return cpumask_of_node(cpu_to_node(cpu)); 266 266 }
+10 -18
kernel/sched/topology.c
··· 1591 1591 enum numa_topology_type sched_numa_topology_type; 1592 1592 1593 1593 static int sched_domains_numa_levels; 1594 - static int sched_domains_curr_level; 1595 1594 1596 1595 int sched_max_numa_distance; 1597 1596 static int *sched_domains_numa_distance; ··· 1631 1632 int sd_id, sd_weight, sd_flags = 0; 1632 1633 struct cpumask *sd_span; 1633 1634 1634 - #ifdef CONFIG_NUMA 1635 - /* 1636 - * Ugly hack to pass state to sd_numa_mask()... 1637 - */ 1638 - sched_domains_curr_level = tl->numa_level; 1639 - #endif 1640 - 1641 - sd_weight = cpumask_weight(tl->mask(cpu)); 1635 + sd_weight = cpumask_weight(tl->mask(tl, cpu)); 1642 1636 1643 1637 if (tl->sd_flags) 1644 1638 sd_flags = (*tl->sd_flags)(); ··· 1669 1677 }; 1670 1678 1671 1679 sd_span = sched_domain_span(sd); 1672 - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); 1680 + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); 1673 1681 sd_id = cpumask_first(sd_span); 1674 1682 1675 1683 sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); ··· 1729 1737 */ 1730 1738 static struct sched_domain_topology_level default_topology[] = { 1731 1739 #ifdef CONFIG_SCHED_SMT 1732 - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), 1740 + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), 1733 1741 #endif 1734 1742 1735 1743 #ifdef CONFIG_SCHED_CLUSTER 1736 - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), 1744 + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), 1737 1745 #endif 1738 1746 1739 1747 #ifdef CONFIG_SCHED_MC 1740 - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), 1748 + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), 1741 1749 #endif 1742 - SDTL_INIT(cpu_cpu_mask, NULL, PKG), 1750 + SDTL_INIT(tl_pkg_mask, NULL, PKG), 1743 1751 { NULL, }, 1744 1752 }; 1745 1753 ··· 1761 1769 1762 1770 #ifdef CONFIG_NUMA 1763 1771 1764 - static const struct cpumask *sd_numa_mask(int cpu) 1772 + static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) 1765 1773 { 1766 - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 1774 + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; 1767 1775 } 1768 1776 1769 1777 static void sched_numa_warn(const char *str) ··· 2403 2411 * breaks the linking done for an earlier span. 2404 2412 */ 2405 2413 for_each_cpu(cpu, cpu_map) { 2406 - const struct cpumask *tl_cpu_mask = tl->mask(cpu); 2414 + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); 2407 2415 int id; 2408 2416 2409 2417 /* lowest bit set in this mask is used as a unique id */ ··· 2411 2419 2412 2420 if (cpumask_test_cpu(id, id_seen)) { 2413 2421 /* First CPU has already been seen, ensure identical spans */ 2414 - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) 2422 + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) 2415 2423 return false; 2416 2424 } else { 2417 2425 /* First CPU hasn't been seen before, ensure it's a completely new span */