Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug

Since sched_delayed tasks remain queued even after blocking, the load
balancer can migrate them between runqueues while PSI considers them
to be asleep. As a result, it misreads the migration requeue followed
by a wakeup as a double queue:

psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4

First, call psi_enqueue() after p->sched_class->enqueue_task(). A
wakeup will clear p->se.sched_delayed while a migration will not, so
psi can use that flag to tell them apart.

Then teach psi to migrate any "sleep" state when delayed-dequeue tasks
are being migrated.

Delayed-dequeue tasks can be revived by ttwu_runnable(), which will
call down with a new ENQUEUE_DELAYED. Instead of further complicating
the wakeup conditional in enqueue_task(), identify migration contexts
and default to wakeup handling for all other cases.

It's not just the warning in dmesg: the task state corruption causes a
permanent CPU pressure indication, which interferes with workload/machine
health monitoring.

Debugged-by-and-original-fix-by: K Prateek Nayak <kprateek.nayak@amd.com>
Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20241010193712.GC181795@cmpxchg.org

2 files changed: +39 -21

kernel/sched/core.c (+6 -6)

```diff
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
-		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
-	}
-
 	p->sched_class->enqueue_task(rq, p, flags);
 	/*
 	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
 	 * ->sched_delayed.
 	 */
 	uclamp_rq_inc(rq, p);
+
+	if (!(flags & ENQUEUE_RESTORE)) {
+		sched_info_enqueue(rq, p);
+		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
+	}
 
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
···
 
 	if (!(flags & DEQUEUE_SAVE)) {
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, flags & DEQUEUE_SLEEP);
+		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
 	}
 
 	/*
```
kernel/sched/stats.h (+33 -15)

```diff
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
- * where a task's runnable state changes, and requeues, where a task
- * and its state are being moved between CPUs and runqueues.
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
  */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+static inline void psi_enqueue(struct task_struct *p, bool migrate)
 {
-	int clear = 0, set = TSK_RUNNING;
+	int clear = 0, set = 0;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	if (p->in_memstall)
-		set |= TSK_MEMSTALL_RUNNING;
-
-	if (!wakeup) {
+	if (p->se.sched_delayed) {
+		/* CPU migration of "sleeping" task */
+		SCHED_WARN_ON(!migrate);
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
+		if (p->in_iowait)
+			set |= TSK_IOWAIT;
+	} else if (migrate) {
+		/* CPU migration of runnable task */
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
 	} else {
+		/* Wakeup of new or sleeping task */
 		if (p->in_iowait)
 			clear |= TSK_IOWAIT;
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL_RUNNING;
 	}
 
 	psi_task_change(p, clear, set);
 }
 
-static inline void psi_dequeue(struct task_struct *p, bool sleep)
+static inline void psi_dequeue(struct task_struct *p, bool migrate)
 {
 	if (static_branch_likely(&psi_disabled))
 		return;
+
+	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	if (migrate)
+		psi_task_change(p, p->psi_flags, 0);
 
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
···
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
 	 */
-	if (sleep)
-		return;
-
-	psi_task_change(p, p->psi_flags, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
···
 }
 
 #else /* CONFIG_PSI */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
-static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
```