Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rv: Retry when da monitor detects race conditions

DA monitor can be accessed from multiple cores simultaneously, this is
likely, for instance when dealing with per-task monitors reacting on
events that do not always occur on the CPU where the task is running.
This can cause race conditions where two events change the next state
and we see inconsistent values. E.g.:

[62] event_srs: 27: sleepable x sched_wakeup -> running (final)
[63] event_srs: 27: sleepable x sched_set_state_sleepable -> sleepable
[63] error_srs: 27: event sched_switch_suspend not expected in the state running

In this case the monitor fails because the event on CPU 62 wins against
the one on CPU 63, although the correct state should have been
sleepable, since the task get suspended.

Detect if the current state was modified by using try_cmpxchg while
storing the next value. If it was, try again reading the current state.
After a maximum number of failed retries, react by calling a special
tracepoint, print on the console and reset the monitor.

Remove the functions da_monitor_curr_state() and da_monitor_set_state()
as they only hide the underlying implementation in this case.

Monitors where this type of condition can occur must be able to account
for racing events in any possible order, as we cannot know the winner.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Tomas Glozar <tglozar@redhat.com>
Cc: Juri Lelli <jlelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/20250728135022.255578-6-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

authored by

Gabriele Monaco and committed by
Steven Rostedt (Google)
9d475d80 79de6617

+83 -52
+2 -1
include/linux/rv.h
··· 10 10 #include <linux/types.h> 11 11 #include <linux/list.h> 12 12 13 - #define MAX_DA_NAME_LEN 32 13 + #define MAX_DA_NAME_LEN 32 14 + #define MAX_DA_RETRY_RACING_EVENTS 3 14 15 15 16 #ifdef CONFIG_RV 16 17 #include <linux/bitops.h>
+52 -51
include/rv/da_monitor.h
··· 55 55 } \ 56 56 \ 57 57 /* \ 58 - * da_monitor_curr_state_##name - return the current state \ 59 - */ \ 60 - static inline type da_monitor_curr_state_##name(struct da_monitor *da_mon) \ 61 - { \ 62 - return da_mon->curr_state; \ 63 - } \ 64 - \ 65 - /* \ 66 - * da_monitor_set_state_##name - set the new current state \ 67 - */ \ 68 - static inline void \ 69 - da_monitor_set_state_##name(struct da_monitor *da_mon, enum states_##name state) \ 70 - { \ 71 - da_mon->curr_state = state; \ 72 - } \ 73 - \ 74 - /* \ 75 58 * da_monitor_start_##name - start monitoring \ 76 59 * \ 77 60 * The monitor will ignore all events until monitoring is set to true. This \ ··· 110 127 * Event handler for implicit monitors. Implicit monitor is the one which the 111 128 * handler does not need to specify which da_monitor to manipulate. Examples 112 129 * of implicit monitor are the per_cpu or the global ones. 130 + * 131 + * Retry in case there is a race between getting and setting the next state, 132 + * warn and reset the monitor if it runs out of retries. The monitor should be 133 + * able to handle various orders. 113 134 */ 114 135 #define DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type) \ 115 136 \ 116 137 static inline bool \ 117 138 da_event_##name(struct da_monitor *da_mon, enum events_##name event) \ 118 139 { \ 119 - type curr_state = da_monitor_curr_state_##name(da_mon); \ 120 - type next_state = model_get_next_state_##name(curr_state, event); \ 140 + enum states_##name curr_state, next_state; \ 121 141 \ 122 - if (next_state != INVALID_STATE) { \ 123 - da_monitor_set_state_##name(da_mon, next_state); \ 124 - \ 125 - trace_event_##name(model_get_state_name_##name(curr_state), \ 126 - model_get_event_name_##name(event), \ 127 - model_get_state_name_##name(next_state), \ 128 - model_is_final_state_##name(next_state)); \ 129 - \ 130 - return true; \ 142 + curr_state = READ_ONCE(da_mon->curr_state); \ 143 + for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ 144 + next_state = model_get_next_state_##name(curr_state, event); \ 145 + if (next_state == INVALID_STATE) { \ 146 + cond_react_##name(curr_state, event); \ 147 + trace_error_##name(model_get_state_name_##name(curr_state), \ 148 + model_get_event_name_##name(event)); \ 149 + return false; \ 150 + } \ 151 + if (likely(try_cmpxchg(&da_mon->curr_state, &curr_state, next_state))) { \ 152 + trace_event_##name(model_get_state_name_##name(curr_state), \ 153 + model_get_event_name_##name(event), \ 154 + model_get_state_name_##name(next_state), \ 155 + model_is_final_state_##name(next_state)); \ 156 + return true; \ 157 + } \ 131 158 } \ 132 159 \ 133 - cond_react_##name(curr_state, event); \ 134 - \ 135 - trace_error_##name(model_get_state_name_##name(curr_state), \ 136 - model_get_event_name_##name(event)); \ 137 - \ 160 + trace_rv_retries_error(#name, model_get_event_name_##name(event)); \ 161 + pr_warn("rv: " __stringify(MAX_DA_RETRY_RACING_EVENTS) \ 162 + " retries reached for event %s, resetting monitor %s", \ 163 + model_get_event_name_##name(event), #name); \ 138 164 return false; \ 139 165 } \ 140 166 141 167 /* 142 168 * Event handler for per_task monitors. 169 + * 170 + * Retry in case there is a race between getting and setting the next state, 171 + * warn and reset the monitor if it runs out of retries. The monitor should be 172 + * able to handle various orders. 143 173 */ 144 174 #define DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type) \ 145 175 \ 146 176 static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct *tsk, \ 147 177 enum events_##name event) \ 148 178 { \ 149 - type curr_state = da_monitor_curr_state_##name(da_mon); \ 150 - type next_state = model_get_next_state_##name(curr_state, event); \ 179 + enum states_##name curr_state, next_state; \ 151 180 \ 152 - if (next_state != INVALID_STATE) { \ 153 - da_monitor_set_state_##name(da_mon, next_state); \ 154 - \ 155 - trace_event_##name(tsk->pid, \ 156 - model_get_state_name_##name(curr_state), \ 157 - model_get_event_name_##name(event), \ 158 - model_get_state_name_##name(next_state), \ 159 - model_is_final_state_##name(next_state)); \ 160 - \ 161 - return true; \ 181 + curr_state = READ_ONCE(da_mon->curr_state); \ 182 + for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ 183 + next_state = model_get_next_state_##name(curr_state, event); \ 184 + if (next_state == INVALID_STATE) { \ 185 + cond_react_##name(curr_state, event); \ 186 + trace_error_##name(tsk->pid, \ 187 + model_get_state_name_##name(curr_state), \ 188 + model_get_event_name_##name(event)); \ 189 + return false; \ 190 + } \ 191 + if (likely(try_cmpxchg(&da_mon->curr_state, &curr_state, next_state))) { \ 192 + trace_event_##name(tsk->pid, \ 193 + model_get_state_name_##name(curr_state), \ 194 + model_get_event_name_##name(event), \ 195 + model_get_state_name_##name(next_state), \ 196 + model_is_final_state_##name(next_state)); \ 197 + return true; \ 198 + } \ 162 199 } \ 163 200 \ 164 - cond_react_##name(curr_state, event); \ 165 - \ 166 - trace_error_##name(tsk->pid, \ 167 - model_get_state_name_##name(curr_state), \ 168 - model_get_event_name_##name(event)); \ 169 - \ 201 + trace_rv_retries_error(#name, model_get_event_name_##name(event)); \ 202 + pr_warn("rv: " __stringify(MAX_DA_RETRY_RACING_EVENTS) \ 203 + " retries reached for event %s, resetting monitor %s", \ 204 + model_get_event_name_##name(event), #name); \ 170 205 return false; \ 171 206 } 172 207
+5
kernel/trace/rv/Kconfig
··· 3 3 config RV_MON_EVENTS 4 4 bool 5 5 6 + config RV_MON_MAINTENANCE_EVENTS 7 + bool 8 + 6 9 config DA_MON_EVENTS_IMPLICIT 7 10 select RV_MON_EVENTS 11 + select RV_MON_MAINTENANCE_EVENTS 8 12 bool 9 13 10 14 config DA_MON_EVENTS_ID 11 15 select RV_MON_EVENTS 16 + select RV_MON_MAINTENANCE_EVENTS 12 17 bool 13 18 14 19 config LTL_MON_EVENTS_ID
+24
kernel/trace/rv/rv_trace.h
··· 176 176 #include <monitors/sleep/sleep_trace.h> 177 177 // Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here 178 178 #endif /* CONFIG_LTL_MON_EVENTS_ID */ 179 + 180 + #ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS 181 + /* Tracepoint useful for monitors development, currenly only used in DA */ 182 + TRACE_EVENT(rv_retries_error, 183 + 184 + TP_PROTO(char *name, char *event), 185 + 186 + TP_ARGS(name, event), 187 + 188 + TP_STRUCT__entry( 189 + __string( name, name ) 190 + __string( event, event ) 191 + ), 192 + 193 + TP_fast_assign( 194 + __assign_str(name); 195 + __assign_str(event); 196 + ), 197 + 198 + TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS) 199 + " retries reached for event %s, resetting monitor %s", 200 + __get_str(event), __get_str(name)) 201 + ); 202 + #endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */ 179 203 #endif /* _TRACE_RV_H */ 180 204 181 205 /* This part must be outside protection */