Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/numa: Rewrite the CONFIG_NUMA sched domain support

The current code groups up to 16 nodes in a level and then puts an
ALLNODES domain spanning the entire tree on top of that. This doesn't
reflect the numa topology, and especially for the smaller not-fully-connected
machines out there today this might make a difference.

Therefore, build a proper numa topology based on node_distance().

Since there are no fixed numa layers anymore, the static SD_NODE_INIT
and SD_ALLNODES_INIT aren't usable anymore; the new code tries to
construct something similar and scales some values either on the
number of cpus in the domain and/or the node_distance() ratio.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: linux-alpha@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-sh@vger.kernel.org
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: sparclinux@vger.kernel.org
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86@kernel.org
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Greg Pearson <greg.pearson@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: bob.picco@oracle.com
Cc: chris.mason@oracle.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-r74n3n8hhuc2ynbrnp3vt954@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Peter Zijlstra and committed by
Ingo Molnar
cb83b629 bd939f45

+185 -318
-25
arch/ia64/include/asm/topology.h
··· 70 70 .nr_balance_failed = 0, \ 71 71 } 72 72 73 - /* sched_domains SD_NODE_INIT for IA64 NUMA machines */ 74 - #define SD_NODE_INIT (struct sched_domain) { \ 75 - .parent = NULL, \ 76 - .child = NULL, \ 77 - .groups = NULL, \ 78 - .min_interval = 8, \ 79 - .max_interval = 8*(min(num_online_cpus(), 32U)), \ 80 - .busy_factor = 64, \ 81 - .imbalance_pct = 125, \ 82 - .cache_nice_tries = 2, \ 83 - .busy_idx = 3, \ 84 - .idle_idx = 2, \ 85 - .newidle_idx = 0, \ 86 - .wake_idx = 0, \ 87 - .forkexec_idx = 0, \ 88 - .flags = SD_LOAD_BALANCE \ 89 - | SD_BALANCE_NEWIDLE \ 90 - | SD_BALANCE_EXEC \ 91 - | SD_BALANCE_FORK \ 92 - | SD_SERIALIZE, \ 93 - .last_balance = jiffies, \ 94 - .balance_interval = 64, \ 95 - .nr_balance_failed = 0, \ 96 - } 97 - 98 73 #endif /* CONFIG_NUMA */ 99 74 100 75 #ifdef CONFIG_SMP
-17
arch/mips/include/asm/mach-ip27/topology.h
··· 36 36 37 37 #define node_distance(from, to) (__node_distances[(from)][(to)]) 38 38 39 - /* sched_domains SD_NODE_INIT for SGI IP27 machines */ 40 - #define SD_NODE_INIT (struct sched_domain) { \ 41 - .parent = NULL, \ 42 - .child = NULL, \ 43 - .groups = NULL, \ 44 - .min_interval = 8, \ 45 - .max_interval = 32, \ 46 - .busy_factor = 32, \ 47 - .imbalance_pct = 125, \ 48 - .cache_nice_tries = 1, \ 49 - .flags = SD_LOAD_BALANCE | \ 50 - SD_BALANCE_EXEC, \ 51 - .last_balance = jiffies, \ 52 - .balance_interval = 1, \ 53 - .nr_balance_failed = 0, \ 54 - } 55 - 56 39 #include <asm-generic/topology.h> 57 40 58 41 #endif /* _ASM_MACH_TOPOLOGY_H */
-36
arch/powerpc/include/asm/topology.h
··· 18 18 */ 19 19 #define RECLAIM_DISTANCE 10 20 20 21 - /* 22 - * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest 23 - * POWER7 boxes which have a maximum of 32 nodes. 24 - */ 25 - #define SD_NODES_PER_DOMAIN 32 26 - 27 21 #include <asm/mmzone.h> 28 22 29 23 static inline int cpu_to_node(int cpu) ··· 44 50 #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \ 45 51 cpu_all_mask : \ 46 52 cpumask_of_node(pcibus_to_node(bus))) 47 - 48 - /* sched_domains SD_NODE_INIT for PPC64 machines */ 49 - #define SD_NODE_INIT (struct sched_domain) { \ 50 - .min_interval = 8, \ 51 - .max_interval = 32, \ 52 - .busy_factor = 32, \ 53 - .imbalance_pct = 125, \ 54 - .cache_nice_tries = 1, \ 55 - .busy_idx = 3, \ 56 - .idle_idx = 1, \ 57 - .newidle_idx = 0, \ 58 - .wake_idx = 0, \ 59 - .forkexec_idx = 0, \ 60 - \ 61 - .flags = 1*SD_LOAD_BALANCE \ 62 - | 0*SD_BALANCE_NEWIDLE \ 63 - | 1*SD_BALANCE_EXEC \ 64 - | 1*SD_BALANCE_FORK \ 65 - | 0*SD_BALANCE_WAKE \ 66 - | 1*SD_WAKE_AFFINE \ 67 - | 0*SD_PREFER_LOCAL \ 68 - | 0*SD_SHARE_CPUPOWER \ 69 - | 0*SD_POWERSAVINGS_BALANCE \ 70 - | 0*SD_SHARE_PKG_RESOURCES \ 71 - | 1*SD_SERIALIZE \ 72 - | 0*SD_PREFER_SIBLING \ 73 - , \ 74 - .last_balance = jiffies, \ 75 - .balance_interval = 1, \ 76 - } 77 53 78 54 extern int __node_distance(int, int); 79 55 #define node_distance(a, b) __node_distance(a, b)
-25
arch/sh/include/asm/topology.h
··· 3 3 4 4 #ifdef CONFIG_NUMA 5 5 6 - /* sched_domains SD_NODE_INIT for sh machines */ 7 - #define SD_NODE_INIT (struct sched_domain) { \ 8 - .parent = NULL, \ 9 - .child = NULL, \ 10 - .groups = NULL, \ 11 - .min_interval = 8, \ 12 - .max_interval = 32, \ 13 - .busy_factor = 32, \ 14 - .imbalance_pct = 125, \ 15 - .cache_nice_tries = 2, \ 16 - .busy_idx = 3, \ 17 - .idle_idx = 2, \ 18 - .newidle_idx = 0, \ 19 - .wake_idx = 0, \ 20 - .forkexec_idx = 0, \ 21 - .flags = SD_LOAD_BALANCE \ 22 - | SD_BALANCE_FORK \ 23 - | SD_BALANCE_EXEC \ 24 - | SD_BALANCE_NEWIDLE \ 25 - | SD_SERIALIZE, \ 26 - .last_balance = jiffies, \ 27 - .balance_interval = 1, \ 28 - .nr_balance_failed = 0, \ 29 - } 30 - 31 6 #define cpu_to_node(cpu) ((void)(cpu),0) 32 7 #define parent_node(node) ((void)(node),0) 33 8
-19
arch/sparc/include/asm/topology_64.h
··· 31 31 cpu_all_mask : \ 32 32 cpumask_of_node(pcibus_to_node(bus))) 33 33 34 - #define SD_NODE_INIT (struct sched_domain) { \ 35 - .min_interval = 8, \ 36 - .max_interval = 32, \ 37 - .busy_factor = 32, \ 38 - .imbalance_pct = 125, \ 39 - .cache_nice_tries = 2, \ 40 - .busy_idx = 3, \ 41 - .idle_idx = 2, \ 42 - .newidle_idx = 0, \ 43 - .wake_idx = 0, \ 44 - .forkexec_idx = 0, \ 45 - .flags = SD_LOAD_BALANCE \ 46 - | SD_BALANCE_FORK \ 47 - | SD_BALANCE_EXEC \ 48 - | SD_SERIALIZE, \ 49 - .last_balance = jiffies, \ 50 - .balance_interval = 1, \ 51 - } 52 - 53 34 #else /* CONFIG_NUMA */ 54 35 55 36 #include <asm-generic/topology.h>
-26
arch/tile/include/asm/topology.h
··· 78 78 .balance_interval = 32, \ 79 79 } 80 80 81 - /* sched_domains SD_NODE_INIT for TILE architecture */ 82 - #define SD_NODE_INIT (struct sched_domain) { \ 83 - .min_interval = 16, \ 84 - .max_interval = 512, \ 85 - .busy_factor = 32, \ 86 - .imbalance_pct = 125, \ 87 - .cache_nice_tries = 1, \ 88 - .busy_idx = 3, \ 89 - .idle_idx = 1, \ 90 - .newidle_idx = 2, \ 91 - .wake_idx = 1, \ 92 - .flags = 1*SD_LOAD_BALANCE \ 93 - | 1*SD_BALANCE_NEWIDLE \ 94 - | 1*SD_BALANCE_EXEC \ 95 - | 1*SD_BALANCE_FORK \ 96 - | 0*SD_BALANCE_WAKE \ 97 - | 0*SD_WAKE_AFFINE \ 98 - | 0*SD_PREFER_LOCAL \ 99 - | 0*SD_SHARE_CPUPOWER \ 100 - | 0*SD_SHARE_PKG_RESOURCES \ 101 - | 1*SD_SERIALIZE \ 102 - , \ 103 - .last_balance = jiffies, \ 104 - .balance_interval = 128, \ 105 - } 106 - 107 81 /* By definition, we create nodes based on online memory. */ 108 82 #define node_has_online_mem(nid) 1 109 83
-38
arch/x86/include/asm/topology.h
··· 92 92 93 93 #define pcibus_to_node(bus) __pcibus_to_node(bus) 94 94 95 - #ifdef CONFIG_X86_32 96 - # define SD_CACHE_NICE_TRIES 1 97 - # define SD_IDLE_IDX 1 98 - #else 99 - # define SD_CACHE_NICE_TRIES 2 100 - # define SD_IDLE_IDX 2 101 - #endif 102 - 103 - /* sched_domains SD_NODE_INIT for NUMA machines */ 104 - #define SD_NODE_INIT (struct sched_domain) { \ 105 - .min_interval = 8, \ 106 - .max_interval = 32, \ 107 - .busy_factor = 32, \ 108 - .imbalance_pct = 125, \ 109 - .cache_nice_tries = SD_CACHE_NICE_TRIES, \ 110 - .busy_idx = 3, \ 111 - .idle_idx = SD_IDLE_IDX, \ 112 - .newidle_idx = 0, \ 113 - .wake_idx = 0, \ 114 - .forkexec_idx = 0, \ 115 - \ 116 - .flags = 1*SD_LOAD_BALANCE \ 117 - | 1*SD_BALANCE_NEWIDLE \ 118 - | 1*SD_BALANCE_EXEC \ 119 - | 1*SD_BALANCE_FORK \ 120 - | 0*SD_BALANCE_WAKE \ 121 - | 1*SD_WAKE_AFFINE \ 122 - | 0*SD_PREFER_LOCAL \ 123 - | 0*SD_SHARE_CPUPOWER \ 124 - | 0*SD_POWERSAVINGS_BALANCE \ 125 - | 0*SD_SHARE_PKG_RESOURCES \ 126 - | 1*SD_SERIALIZE \ 127 - | 0*SD_PREFER_SIBLING \ 128 - , \ 129 - .last_balance = jiffies, \ 130 - .balance_interval = 1, \ 131 - } 132 - 133 95 extern int __node_distance(int, int); 134 96 #define node_distance(a, b) __node_distance(a, b) 135 97
-37
include/linux/topology.h
··· 70 70 * Below are the 3 major initializers used in building sched_domains: 71 71 * SD_SIBLING_INIT, for SMT domains 72 72 * SD_CPU_INIT, for SMP domains 73 - * SD_NODE_INIT, for NUMA domains 74 73 * 75 74 * Any architecture that cares to do any tuning to these values should do so 76 75 * by defining their own arch-specific initializer in include/asm/topology.h. ··· 175 176 } 176 177 #endif 177 178 178 - /* sched_domains SD_ALLNODES_INIT for NUMA machines */ 179 - #define SD_ALLNODES_INIT (struct sched_domain) { \ 180 - .min_interval = 64, \ 181 - .max_interval = 64*num_online_cpus(), \ 182 - .busy_factor = 128, \ 183 - .imbalance_pct = 133, \ 184 - .cache_nice_tries = 1, \ 185 - .busy_idx = 3, \ 186 - .idle_idx = 3, \ 187 - .flags = 1*SD_LOAD_BALANCE \ 188 - | 1*SD_BALANCE_NEWIDLE \ 189 - | 0*SD_BALANCE_EXEC \ 190 - | 0*SD_BALANCE_FORK \ 191 - | 0*SD_BALANCE_WAKE \ 192 - | 0*SD_WAKE_AFFINE \ 193 - | 0*SD_SHARE_CPUPOWER \ 194 - | 0*SD_POWERSAVINGS_BALANCE \ 195 - | 0*SD_SHARE_PKG_RESOURCES \ 196 - | 1*SD_SERIALIZE \ 197 - | 0*SD_PREFER_SIBLING \ 198 - , \ 199 - .last_balance = jiffies, \ 200 - .balance_interval = 64, \ 201 - } 202 - 203 - #ifndef SD_NODES_PER_DOMAIN 204 - #define SD_NODES_PER_DOMAIN 16 205 - #endif 206 - 207 179 #ifdef CONFIG_SCHED_BOOK 208 180 #ifndef SD_BOOK_INIT 209 181 #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! 210 182 #endif 211 183 #endif /* CONFIG_SCHED_BOOK */ 212 - 213 - #ifdef CONFIG_NUMA 214 - #ifndef SD_NODE_INIT 215 - #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 216 - #endif 217 - 218 - #endif /* CONFIG_NUMA */ 219 184 220 185 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 221 186 DECLARE_PER_CPU(int, numa_node);
+185 -95
kernel/sched/core.c
··· 5560 5560 break; 5561 5561 } 5562 5562 5563 - if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5563 + if (!(sd->flags & SD_OVERLAP) && 5564 + cpumask_intersects(groupmask, sched_group_cpus(group))) { 5564 5565 printk(KERN_CONT "\n"); 5565 5566 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5566 5567 break; ··· 5899 5898 5900 5899 __setup("isolcpus=", isolated_cpu_setup); 5901 5900 5902 - #ifdef CONFIG_NUMA 5903 - 5904 - /** 5905 - * find_next_best_node - find the next node to include in a sched_domain 5906 - * @node: node whose sched_domain we're building 5907 - * @used_nodes: nodes already in the sched_domain 5908 - * 5909 - * Find the next node to include in a given scheduling domain. Simply 5910 - * finds the closest node not already in the @used_nodes map. 5911 - * 5912 - * Should use nodemask_t. 5913 - */ 5914 - static int find_next_best_node(int node, nodemask_t *used_nodes) 5915 - { 5916 - int i, n, val, min_val, best_node = -1; 5917 - 5918 - min_val = INT_MAX; 5919 - 5920 - for (i = 0; i < nr_node_ids; i++) { 5921 - /* Start at @node */ 5922 - n = (node + i) % nr_node_ids; 5923 - 5924 - if (!nr_cpus_node(n)) 5925 - continue; 5926 - 5927 - /* Skip already used nodes */ 5928 - if (node_isset(n, *used_nodes)) 5929 - continue; 5930 - 5931 - /* Simple min distance search */ 5932 - val = node_distance(node, n); 5933 - 5934 - if (val < min_val) { 5935 - min_val = val; 5936 - best_node = n; 5937 - } 5938 - } 5939 - 5940 - if (best_node != -1) 5941 - node_set(best_node, *used_nodes); 5942 - return best_node; 5943 - } 5944 - 5945 - /** 5946 - * sched_domain_node_span - get a cpumask for a node's sched_domain 5947 - * @node: node whose cpumask we're constructing 5948 - * @span: resulting cpumask 5949 - * 5950 - * Given a node, construct a good cpumask for its sched_domain to span. It 5951 - * should be one that prevents unnecessary balancing, but also spreads tasks 5952 - * out optimally. 
5953 - */ 5954 - static void sched_domain_node_span(int node, struct cpumask *span) 5955 - { 5956 - nodemask_t used_nodes; 5957 - int i; 5958 - 5959 - cpumask_clear(span); 5960 - nodes_clear(used_nodes); 5961 - 5962 - cpumask_or(span, span, cpumask_of_node(node)); 5963 - node_set(node, used_nodes); 5964 - 5965 - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 5966 - int next_node = find_next_best_node(node, &used_nodes); 5967 - if (next_node < 0) 5968 - break; 5969 - cpumask_or(span, span, cpumask_of_node(next_node)); 5970 - } 5971 - } 5972 - 5973 - static const struct cpumask *cpu_node_mask(int cpu) 5974 - { 5975 - lockdep_assert_held(&sched_domains_mutex); 5976 - 5977 - sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); 5978 - 5979 - return sched_domains_tmpmask; 5980 - } 5981 - 5982 - static const struct cpumask *cpu_allnodes_mask(int cpu) 5983 - { 5984 - return cpu_possible_mask; 5985 - } 5986 - #endif /* CONFIG_NUMA */ 5987 - 5988 5901 static const struct cpumask *cpu_cpu_mask(int cpu) 5989 5902 { 5990 5903 return cpumask_of_node(cpu_to_node(cpu)); ··· 5935 6020 sched_domain_init_f init; 5936 6021 sched_domain_mask_f mask; 5937 6022 int flags; 6023 + int numa_level; 5938 6024 struct sd_data data; 5939 6025 }; 5940 6026 ··· 6129 6213 } 6130 6214 6131 6215 SD_INIT_FUNC(CPU) 6132 - #ifdef CONFIG_NUMA 6133 - SD_INIT_FUNC(ALLNODES) 6134 - SD_INIT_FUNC(NODE) 6135 - #endif 6136 6216 #ifdef CONFIG_SCHED_SMT 6137 6217 SD_INIT_FUNC(SIBLING) 6138 6218 #endif ··· 6250 6338 { sd_init_BOOK, cpu_book_mask, }, 6251 6339 #endif 6252 6340 { sd_init_CPU, cpu_cpu_mask, }, 6253 - #ifdef CONFIG_NUMA 6254 - { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, 6255 - { sd_init_ALLNODES, cpu_allnodes_mask, }, 6256 - #endif 6257 6341 { NULL, }, 6258 6342 }; 6259 6343 6260 6344 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6345 + 6346 + #ifdef CONFIG_NUMA 6347 + 6348 + static int sched_domains_numa_levels; 6349 + static int 
sched_domains_numa_scale; 6350 + static int *sched_domains_numa_distance; 6351 + static struct cpumask ***sched_domains_numa_masks; 6352 + static int sched_domains_curr_level; 6353 + 6354 + static inline unsigned long numa_scale(unsigned long x, int level) 6355 + { 6356 + return x * sched_domains_numa_distance[level] / sched_domains_numa_scale; 6357 + } 6358 + 6359 + static inline int sd_local_flags(int level) 6360 + { 6361 + if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6362 + return 0; 6363 + 6364 + return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6365 + } 6366 + 6367 + static struct sched_domain * 6368 + sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6369 + { 6370 + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6371 + int level = tl->numa_level; 6372 + int sd_weight = cpumask_weight( 6373 + sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6374 + 6375 + *sd = (struct sched_domain){ 6376 + .min_interval = sd_weight, 6377 + .max_interval = 2*sd_weight, 6378 + .busy_factor = 32, 6379 + .imbalance_pct = 100 + numa_scale(25, level), 6380 + .cache_nice_tries = 2, 6381 + .busy_idx = 3, 6382 + .idle_idx = 2, 6383 + .newidle_idx = 0, 6384 + .wake_idx = 0, 6385 + .forkexec_idx = 0, 6386 + 6387 + .flags = 1*SD_LOAD_BALANCE 6388 + | 1*SD_BALANCE_NEWIDLE 6389 + | 0*SD_BALANCE_EXEC 6390 + | 0*SD_BALANCE_FORK 6391 + | 0*SD_BALANCE_WAKE 6392 + | 0*SD_WAKE_AFFINE 6393 + | 0*SD_PREFER_LOCAL 6394 + | 0*SD_SHARE_CPUPOWER 6395 + | 0*SD_POWERSAVINGS_BALANCE 6396 + | 0*SD_SHARE_PKG_RESOURCES 6397 + | 1*SD_SERIALIZE 6398 + | 0*SD_PREFER_SIBLING 6399 + | sd_local_flags(level) 6400 + , 6401 + .last_balance = jiffies, 6402 + .balance_interval = sd_weight, 6403 + }; 6404 + SD_INIT_NAME(sd, NUMA); 6405 + sd->private = &tl->data; 6406 + 6407 + /* 6408 + * Ugly hack to pass state to sd_numa_mask()... 
6409 + */ 6410 + sched_domains_curr_level = tl->numa_level; 6411 + 6412 + return sd; 6413 + } 6414 + 6415 + static const struct cpumask *sd_numa_mask(int cpu) 6416 + { 6417 + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6418 + } 6419 + 6420 + static void sched_init_numa(void) 6421 + { 6422 + int next_distance, curr_distance = node_distance(0, 0); 6423 + struct sched_domain_topology_level *tl; 6424 + int level = 0; 6425 + int i, j, k; 6426 + 6427 + sched_domains_numa_scale = curr_distance; 6428 + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6429 + if (!sched_domains_numa_distance) 6430 + return; 6431 + 6432 + /* 6433 + * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6434 + * unique distances in the node_distance() table. 6435 + * 6436 + * Assumes node_distance(0,j) includes all distances in 6437 + * node_distance(i,j) in order to avoid cubic time. 6438 + * 6439 + * XXX: could be optimized to O(n log n) by using sort() 6440 + */ 6441 + next_distance = curr_distance; 6442 + for (i = 0; i < nr_node_ids; i++) { 6443 + for (j = 0; j < nr_node_ids; j++) { 6444 + int distance = node_distance(0, j); 6445 + if (distance > curr_distance && 6446 + (distance < next_distance || 6447 + next_distance == curr_distance)) 6448 + next_distance = distance; 6449 + } 6450 + if (next_distance != curr_distance) { 6451 + sched_domains_numa_distance[level++] = next_distance; 6452 + sched_domains_numa_levels = level; 6453 + curr_distance = next_distance; 6454 + } else break; 6455 + } 6456 + /* 6457 + * 'level' contains the number of unique distances, excluding the 6458 + * identity distance node_distance(i,i). 6459 + * 6460 + * The sched_domains_nume_distance[] array includes the actual distance 6461 + * numbers. 
6462 + */ 6463 + 6464 + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6465 + if (!sched_domains_numa_masks) 6466 + return; 6467 + 6468 + /* 6469 + * Now for each level, construct a mask per node which contains all 6470 + * cpus of nodes that are that many hops away from us. 6471 + */ 6472 + for (i = 0; i < level; i++) { 6473 + sched_domains_numa_masks[i] = 6474 + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6475 + if (!sched_domains_numa_masks[i]) 6476 + return; 6477 + 6478 + for (j = 0; j < nr_node_ids; j++) { 6479 + struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); 6480 + if (!mask) 6481 + return; 6482 + 6483 + sched_domains_numa_masks[i][j] = mask; 6484 + 6485 + for (k = 0; k < nr_node_ids; k++) { 6486 + if (node_distance(cpu_to_node(j), k) > 6487 + sched_domains_numa_distance[i]) 6488 + continue; 6489 + 6490 + cpumask_or(mask, mask, cpumask_of_node(k)); 6491 + } 6492 + } 6493 + } 6494 + 6495 + tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6496 + sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6497 + if (!tl) 6498 + return; 6499 + 6500 + /* 6501 + * Copy the default topology bits.. 6502 + */ 6503 + for (i = 0; default_topology[i].init; i++) 6504 + tl[i] = default_topology[i]; 6505 + 6506 + /* 6507 + * .. and append 'j' levels of NUMA goodness. 
6508 + */ 6509 + for (j = 0; j < level; i++, j++) { 6510 + tl[i] = (struct sched_domain_topology_level){ 6511 + .init = sd_numa_init, 6512 + .mask = sd_numa_mask, 6513 + .flags = SDTL_OVERLAP, 6514 + .numa_level = j, 6515 + }; 6516 + } 6517 + 6518 + sched_domain_topology = tl; 6519 + } 6520 + #else 6521 + static inline void sched_init_numa(void) 6522 + { 6523 + } 6524 + #endif /* CONFIG_NUMA */ 6261 6525 6262 6526 static int __sdt_alloc(const struct cpumask *cpu_map) 6263 6527 { ··· 6927 6839 6928 6840 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6929 6841 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6842 + 6843 + sched_init_numa(); 6930 6844 6931 6845 get_online_cpus(); 6932 6846 mutex_lock(&sched_domains_mutex);