Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branches 'pm-cpuidle' and 'pm-powercap'

Merge cpuidle and power capping updates for 6.19-rc1:

- Use residency threshold in polling state override decisions in the
menu cpuidle governor (Aboorva Devarajan)

- Add sanity check for exit latency and target residency in the cpuidle
core (Rafael Wysocki)

- Use this_cpu_ptr() where possible in the teo governor (Christian
Loehle)

- Rework the handling of tick wakeups in the teo cpuidle governor to
increase the likelihood of stopping the scheduler tick in the cases
when tick wakeups can be counted as non-timer ones (Rafael Wysocki)

- Fix a reverse condition in the teo cpuidle governor and drop a
misguided target residency check from it (Rafael Wysocki)

- Clean up multiple minor defects in the teo cpuidle governor (Rafael
Wysocki)

- Update header inclusion to make it follow the Include What You Use
principle (Andy Shevchenko)

- Enable MSR-based RAPL PMU support in the intel_rapl power capping
driver and arrange for using it on the Panther Lake and Wildcat Lake
processors (Kuppuswamy Sathyanarayanan)

- Add support for Nova Lake and Wildcat Lake processors to the
intel_rapl power capping driver (Kaushlendra Kumar, Srinivas
Pandruvada)

* pm-cpuidle:
cpuidle: Warn instead of bailing out if target residency check fails
cpuidle: Update header inclusion
cpuidle: governors: teo: Add missing space to the description
cpuidle: governors: teo: Simplify intercepts-based state lookup
cpuidle: governors: teo: Fix tick_intercepts handling in teo_update()
cpuidle: governors: teo: Rework the handling of tick wakeups
cpuidle: governors: teo: Decay metrics below DECAY_SHIFT threshold
cpuidle: governors: teo: Use s64 consistently in teo_update()
cpuidle: governors: teo: Drop redundant function parameter
cpuidle: governors: teo: Drop misguided target residency check
cpuidle: teo: Use this_cpu_ptr() where possible
cpuidle: Add sanity check for exit latency and target residency
cpuidle: menu: Use residency threshold in polling state override decisions

* pm-powercap:
powercap: intel_rapl: Enable MSR-based RAPL PMU support
powercap: intel_rapl: Prepare read_raw() interface for atomic-context callers
powercap: intel_rapl: Add support for Nova Lake processors
powercap: intel_rapl: Add support for Wildcat Lake platform

+156 -114
+10
drivers/cpuidle/driver.c
··· 8 8 * This code is licenced under the GPL. 9 9 */ 10 10 11 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 + 11 13 #include <linux/mutex.h> 12 14 #include <linux/module.h> 13 15 #include <linux/sched.h> ··· 195 193 s->exit_latency_ns = 0; 196 194 else 197 195 s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); 196 + 197 + /* 198 + * Warn if the exit latency of a CPU idle state exceeds its 199 + * target residency which is assumed to never happen in cpuidle 200 + * in multiple places. 201 + */ 202 + if (s->exit_latency_ns > s->target_residency_ns) 203 + pr_warn("Idle state %d target residency too low\n", i); 198 204 } 199 205 } 200 206
+5 -4
drivers/cpuidle/governors/menu.c
··· 317 317 } 318 318 319 319 /* 320 - * Use a physical idle state, not busy polling, unless a timer 321 - * is going to trigger soon enough or the exit latency of the 322 - * idle state in question is greater than the predicted idle 323 - * duration. 320 + * Use a physical idle state instead of busy polling so long as 321 + * its target residency is below the residency threshold, its 322 + * exit latency is not greater than the predicted idle duration, 323 + * and the next timer doesn't expire soon. 324 324 */ 325 325 if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && 326 + s->target_residency_ns < RESIDENCY_THRESHOLD_NS && 326 327 s->target_residency_ns <= data->next_timer_ns && 327 328 s->exit_latency_ns <= predicted_ns) { 328 329 predicted_ns = s->target_residency_ns;
+72 -87
drivers/cpuidle/governors/teo.c
··· 76 76 * likely woken up by a non-timer wakeup source). 77 77 * 78 78 * 2. If the second sum computed in step 1 is greater than a half of the sum of 79 - * both metrics for the candidate state bin and all subsequent bins(if any), 79 + * both metrics for the candidate state bin and all subsequent bins (if any), 80 80 * a shallower idle state is likely to be more suitable, so look for it. 81 81 * 82 82 * - Traverse the enabled idle states shallower than the candidate one in the ··· 133 133 * @sleep_length_ns: Time till the closest timer event (at the selection time). 134 134 * @state_bins: Idle state data bins for this CPU. 135 135 * @total: Grand total of the "intercepts" and "hits" metrics for all bins. 136 + * @total_tick: Wakeups by the scheduler tick. 136 137 * @tick_intercepts: "Intercepts" before TICK_NSEC. 137 138 * @short_idles: Wakeups after short idle periods. 138 - * @artificial_wakeup: Set if the wakeup has been triggered by a safety net. 139 + * @tick_wakeup: Set if the last wakeup was by the scheduler tick. 139 140 */ 140 141 struct teo_cpu { 141 142 s64 sleep_length_ns; 142 143 struct teo_bin state_bins[CPUIDLE_STATE_MAX]; 143 144 unsigned int total; 145 + unsigned int total_tick; 144 146 unsigned int tick_intercepts; 145 147 unsigned int short_idles; 146 - bool artificial_wakeup; 148 + bool tick_wakeup; 147 149 }; 148 150 149 151 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); 152 + 153 + static void teo_decay(unsigned int *metric) 154 + { 155 + unsigned int delta = *metric >> DECAY_SHIFT; 156 + 157 + if (delta) 158 + *metric -= delta; 159 + else 160 + *metric = 0; 161 + } 150 162 151 163 /** 152 164 * teo_update - Update CPU metrics after wakeup. 
··· 167 155 */ 168 156 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) 169 157 { 170 - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 158 + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); 171 159 int i, idx_timer = 0, idx_duration = 0; 172 - s64 target_residency_ns; 173 - u64 measured_ns; 160 + s64 target_residency_ns, measured_ns; 161 + unsigned int total = 0; 174 162 175 - cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; 163 + teo_decay(&cpu_data->short_idles); 176 164 177 - if (cpu_data->artificial_wakeup) { 165 + if (dev->poll_time_limit) { 166 + dev->poll_time_limit = false; 178 167 /* 179 - * If one of the safety nets has triggered, assume that this 168 + * Polling state timeout has triggered, so assume that this 180 169 * might have been a long sleep. 181 170 */ 182 - measured_ns = U64_MAX; 171 + measured_ns = S64_MAX; 183 172 } else { 184 - u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; 173 + s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; 185 174 186 175 measured_ns = dev->last_residency_ns; 187 176 /* ··· 209 196 for (i = 0; i < drv->state_count; i++) { 210 197 struct teo_bin *bin = &cpu_data->state_bins[i]; 211 198 212 - bin->hits -= bin->hits >> DECAY_SHIFT; 213 - bin->intercepts -= bin->intercepts >> DECAY_SHIFT; 199 + teo_decay(&bin->hits); 200 + total += bin->hits; 201 + teo_decay(&bin->intercepts); 202 + total += bin->intercepts; 214 203 215 204 target_residency_ns = drv->states[i].target_residency_ns; 216 205 ··· 223 208 } 224 209 } 225 210 226 - cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT; 211 + cpu_data->total = total + PULSE; 212 + 213 + teo_decay(&cpu_data->tick_intercepts); 214 + 215 + teo_decay(&cpu_data->total_tick); 216 + if (cpu_data->tick_wakeup) { 217 + cpu_data->total_tick += PULSE; 218 + /* 219 + * If tick wakeups dominate the wakeup pattern, count this one 220 + * as a hit on the deepest available idle state to 
increase the 221 + * likelihood of stopping the tick. 222 + */ 223 + if (3 * cpu_data->total_tick > 2 * cpu_data->total) { 224 + cpu_data->state_bins[drv->state_count-1].hits += PULSE; 225 + return; 226 + } 227 + } 228 + 227 229 /* 228 230 * If the measured idle duration falls into the same bin as the sleep 229 231 * length, this is a "hit", so update the "hits" metric for that bin. ··· 251 219 cpu_data->state_bins[idx_timer].hits += PULSE; 252 220 } else { 253 221 cpu_data->state_bins[idx_duration].intercepts += PULSE; 254 - if (TICK_NSEC <= measured_ns) 222 + if (measured_ns <= TICK_NSEC) 255 223 cpu_data->tick_intercepts += PULSE; 256 224 } 257 - 258 - cpu_data->total -= cpu_data->total >> DECAY_SHIFT; 259 - cpu_data->total += PULSE; 260 - } 261 - 262 - static bool teo_state_ok(int i, struct cpuidle_driver *drv) 263 - { 264 - return !tick_nohz_tick_stopped() || 265 - drv->states[i].target_residency_ns >= TICK_NSEC; 266 225 } 267 226 268 227 /** ··· 262 239 * @dev: Target CPU. 263 240 * @state_idx: Index of the capping idle state. 264 241 * @duration_ns: Idle duration value to match. 265 - * @no_poll: Don't consider polling states. 
266 242 */ 267 243 static int teo_find_shallower_state(struct cpuidle_driver *drv, 268 244 struct cpuidle_device *dev, int state_idx, 269 - s64 duration_ns, bool no_poll) 245 + s64 duration_ns) 270 246 { 271 247 int i; 272 248 273 249 for (i = state_idx - 1; i >= 0; i--) { 274 - if (dev->states_usage[i].disable || 275 - (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) 250 + if (dev->states_usage[i].disable) 276 251 continue; 277 252 278 253 state_idx = i; ··· 289 268 static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, 290 269 bool *stop_tick) 291 270 { 292 - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 271 + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); 293 272 s64 latency_req = cpuidle_governor_latency_req(dev->cpu); 294 273 ktime_t delta_tick = TICK_NSEC / 2; 295 274 unsigned int idx_intercept_sum = 0; ··· 377 356 * better choice. 378 357 */ 379 358 if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { 380 - int first_suitable_idx = idx; 359 + int min_idx = idx0; 360 + 361 + if (tick_nohz_tick_stopped()) { 362 + /* 363 + * Look for the shallowest idle state below the current 364 + * candidate one whose target residency is at least 365 + * equal to the tick period length. 366 + */ 367 + while (min_idx < idx && 368 + drv->states[min_idx].target_residency_ns < TICK_NSEC) 369 + min_idx++; 370 + } 381 371 382 372 /* 383 373 * Look for the deepest idle state whose target residency had ··· 398 366 * Take the possible duration limitation present if the tick 399 367 * has been stopped already into account. 400 368 */ 401 - intercept_sum = 0; 402 - 403 - for (i = idx - 1; i >= 0; i--) { 404 - struct teo_bin *bin = &cpu_data->state_bins[i]; 405 - 406 - intercept_sum += bin->intercepts; 407 - 408 - if (2 * intercept_sum > idx_intercept_sum) { 409 - /* 410 - * Use the current state unless it is too 411 - * shallow or disabled, in which case take the 412 - * first enabled state that is deep enough. 
413 - */ 414 - if (teo_state_ok(i, drv) && 415 - !dev->states_usage[i].disable) { 416 - idx = i; 417 - break; 418 - } 419 - idx = first_suitable_idx; 420 - break; 421 - } 369 + for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) { 370 + intercept_sum += cpu_data->state_bins[i].intercepts; 422 371 423 372 if (dev->states_usage[i].disable) 424 373 continue; 425 374 426 - if (teo_state_ok(i, drv)) { 427 - /* 428 - * The current state is deep enough, but still 429 - * there may be a better one. 430 - */ 431 - first_suitable_idx = i; 432 - continue; 433 - } 434 - 435 - /* 436 - * The current state is too shallow, so if no suitable 437 - * states other than the initial candidate have been 438 - * found, give up (the remaining states to check are 439 - * shallower still), but otherwise the first suitable 440 - * state other than the initial candidate may turn out 441 - * to be preferable. 442 - */ 443 - if (first_suitable_idx == idx) 375 + idx = i; 376 + if (2 * intercept_sum > idx_intercept_sum) 444 377 break; 445 378 } 446 379 } ··· 455 458 * If the closest expected timer is before the target residency of the 456 459 * candidate state, a shallower one needs to be found. 
457 460 */ 458 - if (drv->states[idx].target_residency_ns > duration_ns) { 459 - i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); 460 - if (teo_state_ok(i, drv)) 461 - idx = i; 462 - } 461 + if (drv->states[idx].target_residency_ns > duration_ns) 462 + idx = teo_find_shallower_state(drv, dev, idx, duration_ns); 463 463 464 464 /* 465 465 * If the selected state's target residency is below the tick length ··· 484 490 */ 485 491 if (idx > idx0 && 486 492 drv->states[idx].target_residency_ns > delta_tick) 487 - idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); 493 + idx = teo_find_shallower_state(drv, dev, idx, delta_tick); 488 494 489 495 out_tick: 490 496 *stop_tick = false; ··· 498 504 */ 499 505 static void teo_reflect(struct cpuidle_device *dev, int state) 500 506 { 501 - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 507 + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); 508 + 509 + cpu_data->tick_wakeup = tick_nohz_idle_got_tick(); 502 510 503 511 dev->last_state_idx = state; 504 - if (dev->poll_time_limit || 505 - (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { 506 - /* 507 - * The wakeup was not "genuine", but triggered by one of the 508 - * safety nets. 509 - */ 510 - dev->poll_time_limit = false; 511 - cpu_data->artificial_wakeup = true; 512 - } else { 513 - cpu_data->artificial_wakeup = false; 514 - } 515 512 } 516 513 517 514 /**
+4
drivers/cpuidle/poll_state.c
··· 4 4 */ 5 5 6 6 #include <linux/cpuidle.h> 7 + #include <linux/export.h> 8 + #include <linux/irqflags.h> 7 9 #include <linux/sched.h> 8 10 #include <linux/sched/clock.h> 9 11 #include <linux/sched/idle.h> 12 + #include <linux/sprintf.h> 13 + #include <linux/types.h> 10 14 11 15 #define POLL_IDLE_RELAX_COUNT 200 12 16
+22 -17
drivers/powercap/intel_rapl_common.c
··· 253 253 static void rapl_init_domains(struct rapl_package *rp); 254 254 static int rapl_read_data_raw(struct rapl_domain *rd, 255 255 enum rapl_primitives prim, 256 - bool xlate, u64 *data); 256 + bool xlate, u64 *data, 257 + bool atomic); 257 258 static int rapl_write_data_raw(struct rapl_domain *rd, 258 259 enum rapl_primitives prim, 259 260 unsigned long long value); ··· 290 289 cpus_read_lock(); 291 290 rd = power_zone_to_rapl_domain(power_zone); 292 291 293 - if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { 292 + if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now, false)) { 294 293 *energy_raw = energy_now; 295 294 cpus_read_unlock(); 296 295 ··· 831 830 * 63-------------------------- 31--------------------------- 0 832 831 */ 833 832 static int rapl_read_data_raw(struct rapl_domain *rd, 834 - enum rapl_primitives prim, bool xlate, u64 *data) 833 + enum rapl_primitives prim, bool xlate, u64 *data, 834 + bool atomic) 835 835 { 836 836 u64 value; 837 837 enum rapl_primitives prim_fixed = prim_fixups(rd, prim); ··· 854 852 855 853 ra.mask = rpi->mask; 856 854 857 - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { 855 + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, atomic)) { 858 856 pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); 859 857 return -EIO; 860 858 } ··· 906 904 if (!is_pl_valid(rd, pl)) 907 905 return -EINVAL; 908 906 909 - return rapl_read_data_raw(rd, prim, xlate, data); 907 + return rapl_read_data_raw(rd, prim, xlate, data, false); 910 908 } 911 909 912 910 static int rapl_write_pl_data(struct rapl_domain *rd, int pl, ··· 943 941 944 942 ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; 945 943 ra.mask = ~0; 946 - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { 944 + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { 947 945 pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", 948 946 ra.reg.val, rd->rp->name, rd->name); 949 947 return -ENODEV; ··· 971 969 
972 970 ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; 973 971 ra.mask = ~0; 974 - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { 972 + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { 975 973 pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", 976 974 ra.reg.val, rd->rp->name, rd->name); 977 975 return -ENODEV; ··· 1158 1156 1159 1157 ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; 1160 1158 ra.mask = ~0; 1161 - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { 1159 + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { 1162 1160 pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", 1163 1161 ra.reg.val, rd->rp->name, rd->name); 1164 1162 return -ENODEV; ··· 1286 1284 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), 1287 1285 X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), 1288 1286 X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), 1287 + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), 1288 + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), 1289 + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), 1289 1290 X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), 1290 1291 X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), 1291 1292 X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), ··· 1330 1325 struct rapl_primitive_info *rpi = get_rpi(rp, prim); 1331 1326 1332 1327 if (!rapl_read_data_raw(&rp->domains[dmn], prim, 1333 - rpi->unit, &val)) 1328 + rpi->unit, &val, false)) 1334 1329 rp->domains[dmn].rdd.primitives[prim] = val; 1335 1330 } 1336 1331 } ··· 1430 1425 */ 1431 1426 1432 1427 ra.mask = ENERGY_STATUS_MASK; 1433 - if (rp->priv->read_raw(get_rid(rp), &ra) || !ra.value) 1428 + if (rp->priv->read_raw(get_rid(rp), &ra, false) || !ra.value) 1434 1429 return -ENODEV; 1435 1430 1436 1431 return 0; ··· 1597 1592 if (!rp->has_pmu) 1598 1593 return nr_cpu_ids; 1599 1594 1600 - /* Only TPMI RAPL is supported for now */ 1601 - if (rp->priv->type != RAPL_IF_TPMI) 1595 + /* Only TPMI & 
MSR RAPL are supported for now */ 1596 + if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR) 1602 1597 return nr_cpu_ids; 1603 1598 1604 - /* TPMI RAPL uses any CPU in the package for PMU */ 1599 + /* TPMI/MSR RAPL uses any CPU in the package for PMU */ 1605 1600 for_each_online_cpu(cpu) 1606 1601 if (topology_physical_package_id(cpu) == rp->id) 1607 1602 return cpu; ··· 1614 1609 if (!rp->has_pmu) 1615 1610 return false; 1616 1611 1617 - /* Only TPMI RAPL is supported for now */ 1618 - if (rp->priv->type != RAPL_IF_TPMI) 1612 + /* Only TPMI & MSR RAPL are supported for now */ 1613 + if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR) 1619 1614 return false; 1620 1615 1621 - /* TPMI RAPL uses any CPU in the package for PMU */ 1616 + /* TPMI/MSR RAPL uses any CPU in the package for PMU */ 1622 1617 return topology_physical_package_id(cpu) == rp->id; 1623 1618 } 1624 1619 ··· 1641 1636 if (event->hw.idx < 0) 1642 1637 return 0; 1643 1638 1644 - ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); 1639 + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val, true); 1645 1640 1646 1641 /* Return 0 for failed read */ 1647 1642 if (ret)
+40 -3
drivers/powercap/intel_rapl_msr.c
··· 33 33 /* private data for RAPL MSR Interface */ 34 34 static struct rapl_if_priv *rapl_msr_priv; 35 35 36 + static bool rapl_msr_pmu __ro_after_init; 37 + 36 38 static struct rapl_if_priv rapl_msr_priv_intel = { 37 39 .type = RAPL_IF_MSR, 38 40 .reg_unit.msr = MSR_RAPL_POWER_UNIT, ··· 81 79 rp = rapl_add_package_cpuslocked(cpu, rapl_msr_priv, true); 82 80 if (IS_ERR(rp)) 83 81 return PTR_ERR(rp); 82 + if (rapl_msr_pmu) 83 + rapl_package_add_pmu(rp); 84 84 } 85 85 cpumask_set_cpu(cpu, &rp->cpumask); 86 86 return 0; ··· 99 95 100 96 cpumask_clear_cpu(cpu, &rp->cpumask); 101 97 lead_cpu = cpumask_first(&rp->cpumask); 102 - if (lead_cpu >= nr_cpu_ids) 98 + if (lead_cpu >= nr_cpu_ids) { 99 + if (rapl_msr_pmu) 100 + rapl_package_remove_pmu(rp); 103 101 rapl_remove_package_cpuslocked(rp); 104 - else if (rp->lead_cpu == cpu) 102 + } else if (rp->lead_cpu == cpu) { 105 103 rp->lead_cpu = lead_cpu; 104 + } 105 + 106 106 return 0; 107 107 } 108 108 109 - static int rapl_msr_read_raw(int cpu, struct reg_action *ra) 109 + static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool atomic) 110 110 { 111 + /* 112 + * When called from atomic-context (eg PMU event handler) 113 + * perform MSR read directly using rdmsrq(). 
114 + */ 115 + if (atomic) { 116 + if (unlikely(smp_processor_id() != cpu)) 117 + return -EIO; 118 + 119 + rdmsrq(ra->reg.msr, ra->value); 120 + goto out; 121 + } 122 + 111 123 if (rdmsrq_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { 112 124 pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); 113 125 return -EIO; 114 126 } 127 + 128 + out: 115 129 ra->value &= ra->mask; 116 130 return 0; 117 131 } ··· 173 151 X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), 174 152 X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), 175 153 X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), 154 + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), 155 + X86_MATCH_VFM(INTEL_NOVALAKE, NULL), 156 + X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), 157 + {} 158 + }; 159 + 160 + /* List of MSR-based RAPL PMU support CPUs */ 161 + static const struct x86_cpu_id pmu_support_ids[] = { 162 + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), 163 + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), 176 164 {} 177 165 }; 178 166 ··· 211 179 rapl_msr_priv->regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PL4].msr = 212 180 MSR_VR_CURRENT_CONFIG; 213 181 pr_info("PL4 support detected.\n"); 182 + } 183 + 184 + if (x86_match_cpu(pmu_support_ids)) { 185 + rapl_msr_pmu = true; 186 + pr_info("MSR-based RAPL PMU support enabled\n"); 214 187 } 215 188 216 189 rapl_msr_priv->control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
+1 -1
drivers/powercap/intel_rapl_tpmi.c
··· 60 60 61 61 static struct powercap_control_type *tpmi_control_type; 62 62 63 - static int tpmi_rapl_read_raw(int id, struct reg_action *ra) 63 + static int tpmi_rapl_read_raw(int id, struct reg_action *ra, bool atomic) 64 64 { 65 65 if (!ra->reg.mmio) 66 66 return -EINVAL;
+1 -1
drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c
··· 19 19 .limits[RAPL_DOMAIN_DRAM] = BIT(POWER_LIMIT2), 20 20 }; 21 21 22 - static int rapl_mmio_read_raw(int cpu, struct reg_action *ra) 22 + static int rapl_mmio_read_raw(int cpu, struct reg_action *ra, bool atomic) 23 23 { 24 24 if (!ra->reg.mmio) 25 25 return -EINVAL;
+1 -1
include/linux/intel_rapl.h
··· 152 152 union rapl_reg reg_unit; 153 153 union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; 154 154 int limits[RAPL_DOMAIN_MAX]; 155 - int (*read_raw)(int id, struct reg_action *ra); 155 + int (*read_raw)(int id, struct reg_action *ra, bool atomic); 156 156 int (*write_raw)(int id, struct reg_action *ra); 157 157 void *defaults; 158 158 void *rpi;