Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/topology: Change behaviour of the 'sched_energy_aware' sysctl, based on the platform

The 'sched_energy_aware' sysctl is available for the admin to disable/enable
energy aware scheduling (EAS). EAS is enabled only if a few conditions are
met by the platform: asymmetric CPU capacity, no SMT, the schedutil
CPUfreq governor, frequency-invariant load tracking, etc.
A platform may boot without EAS capability, but could gain such
capability at runtime, for example by changing/registering the cpufreq
governor to schedutil.

At present, even when the platform doesn't support EAS, this sysctl returns 1;
a write of 1 ends up calling build_perf_domains() and a write of 0 is a
NOP. That is confusing and unnecessary.

The desired behavior is for this sysctl to enable/disable EAS on
supported platforms. On unsupported platforms, a write to the sysctl
returns a "not supported" error and a read of the sysctl returns
empty. So:
sched_energy_aware returns empty - EAS is not possible at this moment.
This includes EAS capable platforms which have at least one EAS
condition false during startup, e.g. not using the schedutil cpufreq governor.
sched_energy_aware returns 0 - EAS is supported but disabled by admin.
sched_energy_aware returns 1 - EAS is supported and enabled.

The user can find out the reason why EAS is not possible by checking
the info messages. sched_is_eas_possible() returns true if the platform
can do EAS at this moment.

Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lore.kernel.org/r/20231009060037.170765-3-sshegde@linux.vnet.ibm.com

authored by

Shrikanth Hegde and committed by
Ingo Molnar
8f833c82 e03dc9fa

+76 -39
+2 -1
Documentation/admin-guide/sysctl/kernel.rst
··· 1182 1182 platforms with asymmetric CPU topologies and having an Energy 1183 1183 Model available). If your platform happens to meet the 1184 1184 requirements for EAS but you do not want to use it, change 1185 - this value to 0. 1185 + this value to 0. On Non-EAS platforms, write operation fails and 1186 + read doesn't return anything. 1186 1187 1187 1188 task_delayacct 1188 1189 ===============
+74 -38
kernel/sched/topology.c
··· 212 212 static DEFINE_MUTEX(sched_energy_mutex); 213 213 static bool sched_energy_update; 214 214 215 + extern struct cpufreq_governor schedutil_gov; 216 + static bool sched_is_eas_possible(const struct cpumask *cpu_mask) 217 + { 218 + bool any_asym_capacity = false; 219 + struct cpufreq_policy *policy; 220 + struct cpufreq_governor *gov; 221 + int i; 222 + 223 + /* EAS is enabled for asymmetric CPU capacity topologies. */ 224 + for_each_cpu(i, cpu_mask) { 225 + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { 226 + any_asym_capacity = true; 227 + break; 228 + } 229 + } 230 + if (!any_asym_capacity) { 231 + if (sched_debug()) { 232 + pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n", 233 + cpumask_pr_args(cpu_mask)); 234 + } 235 + return false; 236 + } 237 + 238 + /* EAS definitely does *not* handle SMT */ 239 + if (sched_smt_active()) { 240 + if (sched_debug()) { 241 + pr_info("rd %*pbl: Checking EAS, SMT is not supported\n", 242 + cpumask_pr_args(cpu_mask)); 243 + } 244 + return false; 245 + } 246 + 247 + if (!arch_scale_freq_invariant()) { 248 + if (sched_debug()) { 249 + pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported", 250 + cpumask_pr_args(cpu_mask)); 251 + } 252 + return false; 253 + } 254 + 255 + /* Do not attempt EAS if schedutil is not being used. 
*/ 256 + for_each_cpu(i, cpu_mask) { 257 + policy = cpufreq_cpu_get(i); 258 + if (!policy) { 259 + if (sched_debug()) { 260 + pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", 261 + cpumask_pr_args(cpu_mask), i); 262 + } 263 + return false; 264 + } 265 + gov = policy->governor; 266 + cpufreq_cpu_put(policy); 267 + if (gov != &schedutil_gov) { 268 + if (sched_debug()) { 269 + pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", 270 + cpumask_pr_args(cpu_mask)); 271 + } 272 + return false; 273 + } 274 + } 275 + 276 + return true; 277 + } 278 + 215 279 void rebuild_sched_domains_energy(void) 216 280 { 217 281 mutex_lock(&sched_energy_mutex); ··· 293 229 294 230 if (write && !capable(CAP_SYS_ADMIN)) 295 231 return -EPERM; 232 + 233 + if (!sched_is_eas_possible(cpu_active_mask)) { 234 + if (write) { 235 + return -EOPNOTSUPP; 236 + } else { 237 + *lenp = 0; 238 + return 0; 239 + } 240 + } 296 241 297 242 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 298 243 if (!ret && write) { ··· 424 351 * 4. schedutil is driving the frequency of all CPUs of the rd; 425 352 * 5. frequency invariance support is present; 426 353 */ 427 - extern struct cpufreq_governor schedutil_gov; 428 354 static bool build_perf_domains(const struct cpumask *cpu_map) 429 355 { 430 356 int i; 431 357 struct perf_domain *pd = NULL, *tmp; 432 358 int cpu = cpumask_first(cpu_map); 433 359 struct root_domain *rd = cpu_rq(cpu)->rd; 434 - struct cpufreq_policy *policy; 435 - struct cpufreq_governor *gov; 436 360 437 361 if (!sysctl_sched_energy_aware) 438 362 goto free; 439 363 440 - /* EAS is enabled for asymmetric CPU capacity topologies. 
*/ 441 - if (!per_cpu(sd_asym_cpucapacity, cpu)) { 442 - if (sched_debug()) { 443 - pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", 444 - cpumask_pr_args(cpu_map)); 445 - } 364 + if (!sched_is_eas_possible(cpu_map)) 446 365 goto free; 447 - } 448 - 449 - /* EAS definitely does *not* handle SMT */ 450 - if (sched_smt_active()) { 451 - pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", 452 - cpumask_pr_args(cpu_map)); 453 - goto free; 454 - } 455 - 456 - if (!arch_scale_freq_invariant()) { 457 - if (sched_debug()) { 458 - pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", 459 - cpumask_pr_args(cpu_map)); 460 - } 461 - goto free; 462 - } 463 366 464 367 for_each_cpu(i, cpu_map) { 465 368 /* Skip already covered CPUs. */ 466 369 if (find_pd(pd, i)) 467 370 continue; 468 - 469 - /* Do not attempt EAS if schedutil is not being used. */ 470 - policy = cpufreq_cpu_get(i); 471 - if (!policy) 472 - goto free; 473 - gov = policy->governor; 474 - cpufreq_cpu_put(policy); 475 - if (gov != &schedutil_gov) { 476 - if (rd->pd) 477 - pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", 478 - cpumask_pr_args(cpu_map)); 479 - goto free; 480 - } 481 371 482 372 /* Create the new pd and add it to the local list. */ 483 373 tmp = pd_init(i);