Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/topo: Fix SNC topology mess

Per 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in
SNC-3 mode"), the original crazy SNC-3 SLIT table was:

node distances:
node 0 1 2 3 4 5
0: 10 15 17 21 28 26
1: 15 10 15 23 26 23
2: 17 15 10 26 23 21
3: 21 28 26 10 15 17
4: 23 26 23 15 10 15
5: 26 23 21 17 15 10

And per:

https://lore.kernel.org/lkml/20250825075642.GQ3245006@noisy.programming.kicks-ass.net/

The suggestion was to average the off-trace clusters to restore sanity.

However, 4d6dd05d07d0 implements this under various assumptions:

- anything GNR/CWF with numa_in_package;
- there will never be more than 2 packages;
- the off-trace cluster will have distance >20

And then HPE shows up with a machine that matches the
Vendor-Family-Model checks but looks like this:

Here's an 8 socket (2 chassis) HPE system with SNC enabled:

node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0: 10 12 16 16 16 16 18 18 40 40 40 40 40 40 40 40
1: 12 10 16 16 16 16 18 18 40 40 40 40 40 40 40 40
2: 16 16 10 12 18 18 16 16 40 40 40 40 40 40 40 40
3: 16 16 12 10 18 18 16 16 40 40 40 40 40 40 40 40
4: 16 16 18 18 10 12 16 16 40 40 40 40 40 40 40 40
5: 16 16 18 18 12 10 16 16 40 40 40 40 40 40 40 40
6: 18 18 16 16 16 16 10 12 40 40 40 40 40 40 40 40
7: 18 18 16 16 16 16 12 10 40 40 40 40 40 40 40 40
8: 40 40 40 40 40 40 40 40 10 12 16 16 16 16 18 18
9: 40 40 40 40 40 40 40 40 12 10 16 16 16 16 18 18
10: 40 40 40 40 40 40 40 40 16 16 10 12 18 18 16 16
11: 40 40 40 40 40 40 40 40 16 16 12 10 18 18 16 16
12: 40 40 40 40 40 40 40 40 16 16 18 18 10 12 16 16
13: 40 40 40 40 40 40 40 40 16 16 18 18 12 10 16 16
14: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 10 12
15: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 12 10

10 = Same chassis and socket
12 = Same chassis and socket (SNC)
16 = Same chassis and adjacent socket
18 = Same chassis and non-adjacent socket
40 = Different chassis

Turns out, the 'max 2 packages' thing is only relevant to the SNC-3 parts, the
smaller parts do 8 sockets (like usual). The above SLIT table is sane, but
violates the previous assumptions and trips a WARN.

Now that the topology code has a sensible measure of nodes-per-package, we can
use that to divinate the SNC mode at hand, and only fix up SNC-3 topologies.

There is a 'healthy' amount of paranoia code validating the assumptions on the
SLIT table, a simple pr_err(FW_BUG) print on failure and a fallback to using
the regular table. Lets see how long this lasts :-)

Fixes: 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode")
Reported-by: Kyle Meyer <kyle.meyer@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Link: https://patch.msgid.link/20260303110100.238361290@infradead.org

+142 -46
+142 -46
arch/x86/kernel/smpboot.c
··· 506 506 } 507 507 508 508 #ifdef CONFIG_NUMA 509 - static int sched_avg_remote_distance; 510 - static int avg_remote_numa_distance(void) 509 + /* 510 + * Test if the on-trace cluster at (N,N) is symmetric. 511 + * Uses upper triangle iteration to avoid obvious duplicates. 512 + */ 513 + static bool slit_cluster_symmetric(int N) 511 514 { 512 - int i, j; 513 - int distance, nr_remote, total_distance; 515 + int u = topology_num_nodes_per_package(); 514 516 515 - if (sched_avg_remote_distance > 0) 516 - return sched_avg_remote_distance; 517 - 518 - nr_remote = 0; 519 - total_distance = 0; 520 - for_each_node_state(i, N_CPU) { 521 - for_each_node_state(j, N_CPU) { 522 - distance = node_distance(i, j); 523 - 524 - if (distance >= REMOTE_DISTANCE) { 525 - nr_remote++; 526 - total_distance += distance; 527 - } 517 + for (int k = 0; k < u; k++) { 518 + for (int l = k; l < u; l++) { 519 + if (node_distance(N + k, N + l) != 520 + node_distance(N + l, N + k)) 521 + return false; 528 522 } 529 523 } 530 - if (nr_remote) 531 - sched_avg_remote_distance = total_distance / nr_remote; 532 - else 533 - sched_avg_remote_distance = REMOTE_DISTANCE; 534 524 535 - return sched_avg_remote_distance; 525 + return true; 526 + } 527 + 528 + /* 529 + * Return the package-id of the cluster, or ~0 if indeterminate. 530 + * Each node in the on-trace cluster should have the same package-id. 531 + */ 532 + static u32 slit_cluster_package(int N) 533 + { 534 + int u = topology_num_nodes_per_package(); 535 + u32 pkg_id = ~0; 536 + 537 + for (int n = 0; n < u; n++) { 538 + const struct cpumask *cpus = cpumask_of_node(N + n); 539 + int cpu; 540 + 541 + for_each_cpu(cpu, cpus) { 542 + u32 id = topology_logical_package_id(cpu); 543 + 544 + if (pkg_id == ~0) 545 + pkg_id = id; 546 + if (pkg_id != id) 547 + return ~0; 548 + } 549 + } 550 + 551 + return pkg_id; 552 + } 553 + 554 + /* 555 + * Validate the SLIT table is of the form expected for SNC, specifically: 556 + * 557 + * - each on-trace cluster should be symmetric, 558 + * - each on-trace cluster should have a unique package-id. 559 + * 560 + * If you NUMA_EMU on top of SNC, you get to keep the pieces. 561 + */ 562 + static bool slit_validate(void) 563 + { 564 + int u = topology_num_nodes_per_package(); 565 + u32 pkg_id, prev_pkg_id = ~0; 566 + 567 + for (int pkg = 0; pkg < topology_max_packages(); pkg++) { 568 + int n = pkg * u; 569 + 570 + /* 571 + * Ensure the on-trace cluster is symmetric and each cluster 572 + * has a different package id. 573 + */ 574 + if (!slit_cluster_symmetric(n)) 575 + return false; 576 + pkg_id = slit_cluster_package(n); 577 + if (pkg_id == ~0) 578 + return false; 579 + if (pkg && pkg_id == prev_pkg_id) 580 + return false; 581 + 582 + prev_pkg_id = pkg_id; 583 + } 584 + 585 + return true; 586 + } 587 + 588 + /* 589 + * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with 590 + * asymmetric off-trace clusters, reflecting physical assymmetries. However 591 + * this leads to 'unfortunate' sched_domain configurations. 592 + * 593 + * For example dual socket GNR with SNC-3: 594 + * 595 + * node distances: 596 + * node 0 1 2 3 4 5 597 + * 0: 10 15 17 21 28 26 598 + * 1: 15 10 15 23 26 23 599 + * 2: 17 15 10 26 23 21 600 + * 3: 21 28 26 10 15 17 601 + * 4: 23 26 23 15 10 15 602 + * 5: 26 23 21 17 15 10 603 + * 604 + * Fix things up by averaging out the off-trace clusters; resulting in: 605 + * 606 + * node 0 1 2 3 4 5 607 + * 0: 10 15 17 24 24 24 608 + * 1: 15 10 15 24 24 24 609 + * 2: 17 15 10 24 24 24 610 + * 3: 24 24 24 10 15 17 611 + * 4: 24 24 24 15 10 15 612 + * 5: 24 24 24 17 15 10 613 + */ 614 + static int slit_cluster_distance(int i, int j) 615 + { 616 + static int slit_valid = -1; 617 + int u = topology_num_nodes_per_package(); 618 + long d = 0; 619 + int x, y; 620 + 621 + if (slit_valid < 0) { 622 + slit_valid = slit_validate(); 623 + if (!slit_valid) 624 + pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n"); 625 + else 626 + pr_info("Fixing up SNC SLIT table.\n"); 627 + } 628 + 629 + /* 630 + * Is this a unit cluster on the trace? 631 + */ 632 + if ((i / u) == (j / u) || !slit_valid) 633 + return node_distance(i, j); 634 + 635 + /* 636 + * Off-trace cluster. 637 + * 638 + * Notably average out the symmetric pair of off-trace clusters to 639 + * ensure the resulting SLIT table is symmetric. 640 + */ 641 + x = i - (i % u); 642 + y = j - (j % u); 643 + 644 + for (i = x; i < x + u; i++) { 645 + for (j = y; j < y + u; j++) { 646 + d += node_distance(i, j); 647 + d += node_distance(j, i); 648 + } 649 + } 650 + 651 + return d / (2*u*u); 536 652 } 537 653 538 654 int arch_sched_node_distance(int from, int to) ··· 658 542 switch (boot_cpu_data.x86_vfm) { 659 543 case INTEL_GRANITERAPIDS_X: 660 544 case INTEL_ATOM_DARKMONT_X: 661 - 662 - if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 || 663 - d < REMOTE_DISTANCE) 545 + if (topology_max_packages() == 1 || 546 + topology_num_nodes_per_package() < 3) 664 547 return d; 665 548 666 549 /* 667 - * With SNC enabled, there could be too many levels of remote 668 - * NUMA node distances, creating NUMA domain levels 669 - * including local nodes and partial remote nodes. 670 - * 671 - * Trim finer distance tuning for NUMA nodes in remote package 672 - * for the purpose of building sched domains. Group NUMA nodes 673 - * in the remote package in the same sched group. 674 - * Simplify NUMA domains and avoid extra NUMA levels including 675 - * different remote NUMA nodes and local nodes. 676 - * 677 - * GNR and CWF don't expect systems with more than 2 packages 678 - * and more than 2 hops between packages. Single average remote 679 - * distance won't be appropriate if there are more than 2 680 - * packages as average distance to different remote packages 681 - * could be different. 550 + * Handle SNC-3 asymmetries. 682 551 */ 683 - WARN_ONCE(topology_max_packages() > 2, 684 - "sched: Expect only up to 2 packages for GNR or CWF, " 685 - "but saw %d packages when building sched domains.", 686 - topology_max_packages()); 687 - 688 - d = avg_remote_numa_distance(); 552 + return slit_cluster_distance(from, to); 689 553 } 690 554 return d; 691 555 }