Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/tracing: Don't re-read p->state when emitting sched_switch event

As of commit

c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

the following sequence becomes possible:

p->__state = TASK_INTERRUPTIBLE;
__schedule()
deactivate_task(p);
ttwu()
READ !p->on_rq
p->__state=TASK_WAKING
trace_sched_switch()
__trace_sched_switch_state()
task_state_index()
return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.

Prevent this by pushing the value read from __schedule() down the trace
event.

Reported-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220120162520.570782-2-valentin.schneider@arm.com

Authored by Valentin Schneider; committed by Peter Zijlstra.
fa2c3254 49bef33e

+34 -14
+8 -3
include/linux/sched.h
··· 1620 1620 #define TASK_REPORT_IDLE (TASK_REPORT + 1) 1621 1621 #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) 1622 1622 1623 - static inline unsigned int task_state_index(struct task_struct *tsk) 1623 + static inline unsigned int __task_state_index(unsigned int tsk_state, 1624 + unsigned int tsk_exit_state) 1624 1625 { 1625 - unsigned int tsk_state = READ_ONCE(tsk->__state); 1626 - unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; 1626 + unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; 1627 1627 1628 1628 BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); 1629 1629 ··· 1631 1631 state = TASK_REPORT_IDLE; 1632 1632 1633 1633 return fls(state); 1634 + } 1635 + 1636 + static inline unsigned int task_state_index(struct task_struct *tsk) 1637 + { 1638 + return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); 1634 1639 } 1635 1640 1636 1641 static inline char task_index_to_char(unsigned int state)
+7 -4
include/trace/events/sched.h
··· 187 187 TP_ARGS(p)); 188 188 189 189 #ifdef CREATE_TRACE_POINTS 190 - static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) 190 + static inline long __trace_sched_switch_state(bool preempt, 191 + unsigned int prev_state, 192 + struct task_struct *p) 191 193 { 192 194 unsigned int state; 193 195 ··· 210 208 * it for left shift operation to get the correct task->state 211 209 * mapping. 212 210 */ 213 - state = task_state_index(p); 211 + state = __task_state_index(prev_state, p->exit_state); 214 212 215 213 return state ? (1 << (state - 1)) : state; 216 214 } ··· 222 220 TRACE_EVENT(sched_switch, 223 221 224 222 TP_PROTO(bool preempt, 223 + unsigned int prev_state, 225 224 struct task_struct *prev, 226 225 struct task_struct *next), 227 226 228 - TP_ARGS(preempt, prev, next), 227 + TP_ARGS(preempt, prev_state, prev, next), 229 228 230 229 TP_STRUCT__entry( 231 230 __array( char, prev_comm, TASK_COMM_LEN ) ··· 242 239 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); 243 240 __entry->prev_pid = prev->pid; 244 241 __entry->prev_prio = prev->prio; 245 - __entry->prev_state = __trace_sched_switch_state(preempt, prev); 242 + __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); 246 243 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); 247 244 __entry->next_pid = next->pid; 248 245 __entry->next_prio = next->prio;
+2 -2
kernel/sched/core.c
··· 4836 4836 { 4837 4837 struct rq *rq = this_rq(); 4838 4838 struct mm_struct *mm = rq->prev_mm; 4839 - long prev_state; 4839 + unsigned int prev_state; 4840 4840 4841 4841 /* 4842 4842 * The previous task will have left us with a preempt_count of 2 ··· 6300 6300 migrate_disable_switch(rq, prev); 6301 6301 psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 6302 6302 6303 - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); 6303 + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); 6304 6304 6305 6305 /* Also unlocks the rq: */ 6306 6306 rq = context_switch(rq, prev, next, &rf);
+3 -1
kernel/trace/fgraph.c
··· 415 415 416 416 static void 417 417 ftrace_graph_probe_sched_switch(void *ignore, bool preempt, 418 - struct task_struct *prev, struct task_struct *next) 418 + unsigned int prev_state, 419 + struct task_struct *prev, 420 + struct task_struct *next) 419 421 { 420 422 unsigned long long timestamp; 421 423 int index;
+3 -1
kernel/trace/ftrace.c
··· 7347 7347 7348 7348 static void 7349 7349 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, 7350 - struct task_struct *prev, struct task_struct *next) 7350 + unsigned int prev_state, 7351 + struct task_struct *prev, 7352 + struct task_struct *next) 7351 7353 { 7352 7354 struct trace_array *tr = data; 7353 7355 struct trace_pid_list *pid_list;
+6 -2
kernel/trace/trace_events.c
··· 759 759 760 760 static void 761 761 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, 762 - struct task_struct *prev, struct task_struct *next) 762 + unsigned int prev_state, 763 + struct task_struct *prev, 764 + struct task_struct *next) 763 765 { 764 766 struct trace_array *tr = data; 765 767 struct trace_pid_list *no_pid_list; ··· 785 783 786 784 static void 787 785 event_filter_pid_sched_switch_probe_post(void *data, bool preempt, 788 - struct task_struct *prev, struct task_struct *next) 786 + unsigned int prev_state, 787 + struct task_struct *prev, 788 + struct task_struct *next) 789 789 { 790 790 struct trace_array *tr = data; 791 791 struct trace_pid_list *no_pid_list;
+3 -1
kernel/trace/trace_osnoise.c
··· 1167 1167 * used to record the beginning and to report the end of a thread noise window. 1168 1168 */ 1169 1169 static void 1170 - trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, 1170 + trace_sched_switch_callback(void *data, bool preempt, 1171 + unsigned int prev_state, 1172 + struct task_struct *p, 1171 1173 struct task_struct *n) 1172 1174 { 1173 1175 struct osnoise_variables *osn_var = this_cpu_osn_var();
+1
kernel/trace/trace_sched_switch.c
··· 22 22 23 23 static void 24 24 probe_sched_switch(void *ignore, bool preempt, 25 + unsigned int prev_state, 25 26 struct task_struct *prev, struct task_struct *next) 26 27 { 27 28 int flags;
+1
kernel/trace/trace_sched_wakeup.c
··· 426 426 427 427 static void notrace 428 428 probe_wakeup_sched_switch(void *ignore, bool preempt, 429 + unsigned int prev_state, 429 430 struct task_struct *prev, struct task_struct *next) 430 431 { 431 432 struct trace_array_cpu *data;