Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cpuidle: Fix ct_idle_*() usage

The whole disable-RCU, enable-IRQs dance is very intricate since
changing IRQ state is traced, which depends on RCU.

Add two helpers for the cpuidle case that mirror the entry code:

ct_cpuidle_enter()
ct_cpuidle_exit()

And fix all the cases where the enter/exit dance was buggy.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Tony Lindgren <tony@atomide.com>
Tested-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20230112195540.130014793@infradead.org

Authored by Peter Zijlstra; committed by Ingo Molnar.
a01353cf 0c5ffc3d

+86 -66
+2 -2
arch/arm/mach-imx/cpuidle-imx6q.c
··· 25 25 imx6_set_lpm(WAIT_UNCLOCKED); 26 26 raw_spin_unlock(&cpuidle_lock); 27 27 28 - ct_idle_enter(); 28 + ct_cpuidle_enter(); 29 29 cpu_do_idle(); 30 - ct_idle_exit(); 30 + ct_cpuidle_exit(); 31 31 32 32 raw_spin_lock(&cpuidle_lock); 33 33 if (num_idle_cpus-- == num_online_cpus())
+2 -2
arch/arm/mach-imx/cpuidle-imx6sx.c
··· 47 47 cpu_pm_enter(); 48 48 cpu_cluster_pm_enter(); 49 49 50 - ct_idle_enter(); 50 + ct_cpuidle_enter(); 51 51 cpu_suspend(0, imx6sx_idle_finish); 52 - ct_idle_exit(); 52 + ct_cpuidle_exit(); 53 53 54 54 cpu_cluster_pm_exit(); 55 55 cpu_pm_exit();
+2 -2
arch/arm/mach-omap2/cpuidle34xx.c
··· 133 133 } 134 134 135 135 /* Execute ARM wfi */ 136 - ct_idle_enter(); 136 + ct_cpuidle_enter(); 137 137 omap_sram_idle(); 138 - ct_idle_exit(); 138 + ct_cpuidle_exit(); 139 139 140 140 /* 141 141 * Call idle CPU PM enter notifier chain to restore
+4 -4
arch/arm/mach-omap2/cpuidle44xx.c
··· 105 105 } 106 106 raw_spin_unlock_irqrestore(&mpu_lock, flag); 107 107 108 - ct_idle_enter(); 108 + ct_cpuidle_enter(); 109 109 omap4_enter_lowpower(dev->cpu, cx->cpu_state); 110 - ct_idle_exit(); 110 + ct_cpuidle_exit(); 111 111 112 112 raw_spin_lock_irqsave(&mpu_lock, flag); 113 113 if (cx->mpu_state_vote == num_online_cpus()) ··· 186 186 } 187 187 } 188 188 189 - ct_idle_enter(); 189 + ct_cpuidle_enter(); 190 190 omap4_enter_lowpower(dev->cpu, cx->cpu_state); 191 191 cpu_done[dev->cpu] = true; 192 - ct_idle_exit(); 192 + ct_cpuidle_exit(); 193 193 194 194 /* Wakeup CPU1 only if it is not offlined */ 195 195 if (dev->cpu == 0 && cpumask_test_cpu(1, cpu_online_mask)) {
+6 -2
drivers/acpi/processor_idle.c
··· 642 642 */ 643 643 bool dis_bm = pr->flags.bm_control; 644 644 645 + instrumentation_begin(); 646 + 645 647 /* If we can skip BM, demote to a safe state. */ 646 648 if (!cx->bm_sts_skip && acpi_idle_bm_check()) { 647 649 dis_bm = false; ··· 665 663 raw_spin_unlock(&c3_lock); 666 664 } 667 665 668 - ct_idle_enter(); 666 + ct_cpuidle_enter(); 669 667 670 668 acpi_idle_do_entry(cx); 671 669 672 - ct_idle_exit(); 670 + ct_cpuidle_exit(); 673 671 674 672 /* Re-enable bus master arbitration */ 675 673 if (dis_bm) { ··· 678 676 c3_cpu_count--; 679 677 raw_spin_unlock(&c3_lock); 680 678 } 679 + 680 + instrumentation_end(); 681 681 682 682 return index; 683 683 }
+2 -2
drivers/cpuidle/cpuidle-big_little.c
··· 126 126 struct cpuidle_driver *drv, int idx) 127 127 { 128 128 cpu_pm_enter(); 129 - ct_idle_enter(); 129 + ct_cpuidle_enter(); 130 130 131 131 cpu_suspend(0, bl_powerdown_finisher); 132 132 133 133 /* signals the MCPM core that CPU is out of low power state */ 134 134 mcpm_cpu_powered_up(); 135 - ct_idle_exit(); 135 + ct_cpuidle_exit(); 136 136 137 137 cpu_pm_exit(); 138 138
+2 -2
drivers/cpuidle/cpuidle-mvebu-v7.c
··· 36 36 if (drv->states[index].flags & MVEBU_V7_FLAG_DEEP_IDLE) 37 37 deepidle = true; 38 38 39 - ct_idle_enter(); 39 + ct_cpuidle_enter(); 40 40 ret = mvebu_v7_cpu_suspend(deepidle); 41 - ct_idle_exit(); 41 + ct_cpuidle_exit(); 42 42 43 43 cpu_pm_exit(); 44 44
+2 -2
drivers/cpuidle/cpuidle-psci.c
··· 74 74 else 75 75 pm_runtime_put_sync_suspend(pd_dev); 76 76 77 - ct_idle_enter(); 77 + ct_cpuidle_enter(); 78 78 79 79 state = psci_get_domain_state(); 80 80 if (!state) ··· 82 82 83 83 ret = psci_cpu_suspend_enter(state) ? -1 : idx; 84 84 85 - ct_idle_exit(); 85 + ct_cpuidle_exit(); 86 86 87 87 if (s2idle) 88 88 dev_pm_genpd_resume(pd_dev);
+2 -2
drivers/cpuidle/cpuidle-riscv-sbi.c
··· 126 126 else 127 127 pm_runtime_put_sync_suspend(pd_dev); 128 128 129 - ct_idle_enter(); 129 + ct_cpuidle_enter(); 130 130 131 131 if (sbi_is_domain_state_available()) 132 132 state = sbi_get_domain_state(); ··· 135 135 136 136 ret = sbi_suspend(state) ? -1 : idx; 137 137 138 - ct_idle_exit(); 138 + ct_cpuidle_exit(); 139 139 140 140 if (s2idle) 141 141 dev_pm_genpd_resume(pd_dev);
+4 -4
drivers/cpuidle/cpuidle-tegra.c
··· 183 183 tegra_pm_set_cpu_in_lp2(); 184 184 cpu_pm_enter(); 185 185 186 - ct_idle_enter(); 186 + ct_cpuidle_enter(); 187 187 188 188 switch (index) { 189 189 case TEGRA_C7: ··· 199 199 break; 200 200 } 201 201 202 - ct_idle_exit(); 202 + ct_cpuidle_exit(); 203 203 204 204 cpu_pm_exit(); 205 205 tegra_pm_clear_cpu_in_lp2(); ··· 240 240 241 241 if (index == TEGRA_C1) { 242 242 if (do_rcu) 243 - ct_idle_enter(); 243 + ct_cpuidle_enter(); 244 244 ret = arm_cpuidle_simple_enter(dev, drv, index); 245 245 if (do_rcu) 246 - ct_idle_exit(); 246 + ct_cpuidle_exit(); 247 247 } else 248 248 ret = tegra_cpuidle_state_enter(dev, index, cpu); 249 249
+6 -5
drivers/cpuidle/cpuidle.c
··· 14 14 #include <linux/mutex.h> 15 15 #include <linux/sched.h> 16 16 #include <linux/sched/clock.h> 17 + #include <linux/sched/idle.h> 17 18 #include <linux/notifier.h> 18 19 #include <linux/pm_qos.h> 19 20 #include <linux/cpu.h> ··· 153 152 */ 154 153 stop_critical_timings(); 155 154 if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) 156 - ct_idle_enter(); 155 + ct_cpuidle_enter(); 157 156 target_state->enter_s2idle(dev, drv, index); 158 157 if (WARN_ON_ONCE(!irqs_disabled())) 159 - local_irq_disable(); 158 + raw_local_irq_disable(); 160 159 if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) 161 - ct_idle_exit(); 160 + ct_cpuidle_exit(); 162 161 tick_unfreeze(); 163 162 start_critical_timings(); 164 163 ··· 236 235 237 236 stop_critical_timings(); 238 237 if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) 239 - ct_idle_enter(); 238 + ct_cpuidle_enter(); 240 239 241 240 entered_state = target_state->enter(dev, drv, index); 242 241 if (WARN_ONCE(!irqs_disabled(), "%ps leaked IRQ state", target_state->enter)) 243 242 raw_local_irq_disable(); 244 243 245 244 if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) 246 - ct_idle_exit(); 245 + ct_cpuidle_exit(); 247 246 start_critical_timings(); 248 247 249 248 sched_clock_idle_wakeup_event();
+2 -2
include/linux/clockchips.h
··· 211 211 extern void tick_setup_hrtimer_broadcast(void); 212 212 extern int tick_check_broadcast_expired(void); 213 213 # else 214 - static inline int tick_check_broadcast_expired(void) { return 0; } 214 + static __always_inline int tick_check_broadcast_expired(void) { return 0; } 215 215 static inline void tick_setup_hrtimer_broadcast(void) { } 216 216 # endif 217 217 ··· 219 219 220 220 static inline void clockevents_suspend(void) { } 221 221 static inline void clockevents_resume(void) { } 222 - static inline int tick_check_broadcast_expired(void) { return 0; } 222 + static __always_inline int tick_check_broadcast_expired(void) { return 0; } 223 223 static inline void tick_setup_hrtimer_broadcast(void) { } 224 224 225 225 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
+32 -2
include/linux/cpuidle.h
··· 14 14 #include <linux/percpu.h> 15 15 #include <linux/list.h> 16 16 #include <linux/hrtimer.h> 17 + #include <linux/context_tracking.h> 17 18 18 19 #define CPUIDLE_STATE_MAX 10 19 20 #define CPUIDLE_NAME_LEN 16 ··· 115 114 116 115 DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); 117 116 DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); 117 + 118 + static __always_inline void ct_cpuidle_enter(void) 119 + { 120 + lockdep_assert_irqs_disabled(); 121 + /* 122 + * Idle is allowed to (temporary) enable IRQs. It 123 + * will return with IRQs disabled. 124 + * 125 + * Trace IRQs enable here, then switch off RCU, and have 126 + * arch_cpu_idle() use raw_local_irq_enable(). Note that 127 + * ct_idle_enter() relies on lockdep IRQ state, so switch that 128 + * last -- this is very similar to the entry code. 129 + */ 130 + trace_hardirqs_on_prepare(); 131 + lockdep_hardirqs_on_prepare(); 132 + instrumentation_end(); 133 + ct_idle_enter(); 134 + lockdep_hardirqs_on(_RET_IP_); 135 + } 136 + 137 + static __always_inline void ct_cpuidle_exit(void) 138 + { 139 + /* 140 + * Carefully undo the above. 141 + */ 142 + lockdep_hardirqs_off(_RET_IP_); 143 + ct_idle_exit(); 144 + instrumentation_begin(); 145 + } 118 146 119 147 /**************************** 120 148 * CPUIDLE DRIVER INTERFACE * ··· 319 289 if (!is_retention) \ 320 290 __ret = cpu_pm_enter(); \ 321 291 if (!__ret) { \ 322 - ct_idle_enter(); \ 292 + ct_cpuidle_enter(); \ 323 293 __ret = low_level_idle_enter(state); \ 324 - ct_idle_exit(); \ 294 + ct_cpuidle_exit(); \ 325 295 if (!is_retention) \ 326 296 cpu_pm_exit(); \ 327 297 } \
+13 -32
kernel/sched/idle.c
··· 51 51 52 52 static noinline int __cpuidle cpu_idle_poll(void) 53 53 { 54 + instrumentation_begin(); 54 55 trace_cpu_idle(0, smp_processor_id()); 55 56 stop_critical_timings(); 56 - ct_idle_enter(); 57 - local_irq_enable(); 57 + ct_cpuidle_enter(); 58 58 59 + raw_local_irq_enable(); 59 60 while (!tif_need_resched() && 60 61 (cpu_idle_force_poll || tick_check_broadcast_expired())) 61 62 cpu_relax(); 63 + raw_local_irq_disable(); 62 64 63 - ct_idle_exit(); 65 + ct_cpuidle_exit(); 64 66 start_critical_timings(); 65 67 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 68 + local_irq_enable(); 69 + instrumentation_end(); 66 70 67 71 return 1; 68 72 } ··· 89 85 */ 90 86 void __cpuidle default_idle_call(void) 91 87 { 92 - if (current_clr_polling_and_test()) { 93 - local_irq_enable(); 94 - } else { 95 - 88 + instrumentation_begin(); 89 + if (!current_clr_polling_and_test()) { 96 90 trace_cpu_idle(1, smp_processor_id()); 97 91 stop_critical_timings(); 98 92 99 - /* 100 - * arch_cpu_idle() is supposed to enable IRQs, however 101 - * we can't do that because of RCU and tracing. 102 - * 103 - * Trace IRQs enable here, then switch off RCU, and have 104 - * arch_cpu_idle() use raw_local_irq_enable(). Note that 105 - * ct_idle_enter() relies on lockdep IRQ state, so switch that 106 - * last -- this is very similar to the entry code. 107 - */ 108 - trace_hardirqs_on_prepare(); 109 - lockdep_hardirqs_on_prepare(); 110 - ct_idle_enter(); 111 - lockdep_hardirqs_on(_THIS_IP_); 112 - 93 + ct_cpuidle_enter(); 113 94 arch_cpu_idle(); 114 - 115 - /* 116 - * OK, so IRQs are enabled here, but RCU needs them disabled to 117 - * turn itself back on.. funny thing is that disabling IRQs 118 - * will cause tracing, which needs RCU. Jump through hoops to 119 - * make it 'work'. 
120 - */ 121 95 raw_local_irq_disable(); 122 - lockdep_hardirqs_off(_THIS_IP_); 123 - ct_idle_exit(); 124 - lockdep_hardirqs_on(_THIS_IP_); 125 - raw_local_irq_enable(); 96 + ct_cpuidle_exit(); 126 97 127 98 start_critical_timings(); 128 99 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 129 100 } 101 + local_irq_enable(); 102 + instrumentation_end(); 130 103 } 131 104 132 105 static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
+5 -1
kernel/time/tick-broadcast.c
··· 622 622 * to avoid a deep idle transition as we are about to get the 623 623 * broadcast IPI right away. 624 624 */ 625 - int tick_check_broadcast_expired(void) 625 + noinstr int tick_check_broadcast_expired(void) 626 626 { 627 + #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H 628 + return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); 629 + #else 627 630 return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); 631 + #endif 628 632 } 629 633 630 634 /*