Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched_ext: Use the NUMA scheduling domain for NUMA optimizations

Rely on the NUMA scheduling domain topology, instead of accessing NUMA
topology information directly.

There is basically no functional change, but in this way we ensure
consistent use of the same topology information determined by the
scheduling subsystem.

Fixes: f6ce6b949304 ("sched_ext: Do not enable LLC/NUMA optimizations when domains overlap")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Authored by Andrea Righi and committed by Tejun Heo
45725418 f24d1929

+86 -28
+86 -28
kernel/sched/ext.c
··· 3216 3216 } 3217 3217 3218 3218 /* 3219 + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC 3220 + * domain is not defined). 3221 + */ 3222 + static unsigned int llc_weight(s32 cpu) 3223 + { 3224 + struct sched_domain *sd; 3225 + 3226 + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 3227 + if (!sd) 3228 + return 0; 3229 + 3230 + return sd->span_weight; 3231 + } 3232 + 3233 + /* 3234 + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC 3235 + * domain is not defined). 3236 + */ 3237 + static struct cpumask *llc_span(s32 cpu) 3238 + { 3239 + struct sched_domain *sd; 3240 + 3241 + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 3242 + if (!sd) 3243 + return 0; 3244 + 3245 + return sched_domain_span(sd); 3246 + } 3247 + 3248 + /* 3249 + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the 3250 + * NUMA domain is not defined). 3251 + */ 3252 + static unsigned int numa_weight(s32 cpu) 3253 + { 3254 + struct sched_domain *sd; 3255 + struct sched_group *sg; 3256 + 3257 + sd = rcu_dereference(per_cpu(sd_numa, cpu)); 3258 + if (!sd) 3259 + return 0; 3260 + sg = sd->groups; 3261 + if (!sg) 3262 + return 0; 3263 + 3264 + return sg->group_weight; 3265 + } 3266 + 3267 + /* 3268 + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA 3269 + * domain is not defined). 3270 + */ 3271 + static struct cpumask *numa_span(s32 cpu) 3272 + { 3273 + struct sched_domain *sd; 3274 + struct sched_group *sg; 3275 + 3276 + sd = rcu_dereference(per_cpu(sd_numa, cpu)); 3277 + if (!sd) 3278 + return NULL; 3279 + sg = sd->groups; 3280 + if (!sg) 3281 + return NULL; 3282 + 3283 + return sched_group_span(sg); 3284 + } 3285 + 3286 + /* 3219 3287 * Return true if the LLC domains do not perfectly overlap with the NUMA 3220 3288 * domains, false otherwise. 3221 3289 */ ··· 3314 3246 * overlapping, which is incorrect (as NUMA 1 has two distinct LLC 3315 3247 * domains). 
3316 3248 */ 3317 - for_each_online_cpu(cpu) { 3318 - const struct cpumask *numa_cpus; 3319 - struct sched_domain *sd; 3320 - 3321 - sd = rcu_dereference(per_cpu(sd_llc, cpu)); 3322 - if (!sd) 3249 + for_each_online_cpu(cpu) 3250 + if (llc_weight(cpu) != numa_weight(cpu)) 3323 3251 return true; 3324 - 3325 - numa_cpus = cpumask_of_node(cpu_to_node(cpu)); 3326 - if (sd->span_weight != cpumask_weight(numa_cpus)) 3327 - return true; 3328 - } 3329 3252 3330 3253 return false; 3331 3254 } ··· 3335 3276 static void update_selcpu_topology(void) 3336 3277 { 3337 3278 bool enable_llc = false, enable_numa = false; 3338 - struct sched_domain *sd; 3339 - const struct cpumask *cpus; 3279 + unsigned int nr_cpus; 3340 3280 s32 cpu = cpumask_first(cpu_online_mask); 3341 3281 3342 3282 /* ··· 3349 3291 * CPUs. 3350 3292 */ 3351 3293 rcu_read_lock(); 3352 - sd = rcu_dereference(per_cpu(sd_llc, cpu)); 3353 - if (sd) { 3354 - if (sd->span_weight < num_online_cpus()) 3294 + nr_cpus = llc_weight(cpu); 3295 + if (nr_cpus > 0) { 3296 + if (nr_cpus < num_online_cpus()) 3355 3297 enable_llc = true; 3298 + pr_debug("sched_ext: LLC=%*pb weight=%u\n", 3299 + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); 3356 3300 } 3357 3301 3358 3302 /* ··· 3366 3306 * enabling both NUMA and LLC optimizations is unnecessary, as checking 3367 3307 * for an idle CPU in the same domain twice is redundant. 
3368 3308 */ 3369 - cpus = cpumask_of_node(cpu_to_node(cpu)); 3370 - if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch()) 3371 - enable_numa = true; 3309 + nr_cpus = numa_weight(cpu); 3310 + if (nr_cpus > 0) { 3311 + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) 3312 + enable_numa = true; 3313 + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", 3314 + cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); 3315 + } 3372 3316 rcu_read_unlock(); 3373 3317 3374 3318 pr_debug("sched_ext: LLC idle selection %s\n", ··· 3424 3360 3425 3361 *found = false; 3426 3362 3427 - 3428 3363 /* 3429 3364 * This is necessary to protect llc_cpus. 3430 3365 */ ··· 3442 3379 */ 3443 3380 if (p->nr_cpus_allowed >= num_possible_cpus()) { 3444 3381 if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) 3445 - numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu)); 3382 + numa_cpus = numa_span(prev_cpu); 3446 3383 3447 - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { 3448 - struct sched_domain *sd; 3449 - 3450 - sd = rcu_dereference(per_cpu(sd_llc, prev_cpu)); 3451 - if (sd) 3452 - llc_cpus = sched_domain_span(sd); 3453 - } 3384 + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) 3385 + llc_cpus = llc_span(prev_cpu); 3454 3386 } 3455 3387 3456 3388 /*