Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf sched timehist: Add pre-migration wait time option

pre-migration wait time is the time that a task unnecessarily spends
on the runqueue of a CPU but doesn't get switched-in there. In terms
of tracepoints, it is the time between sched:sched_wakeup and
sched:sched_migrate_task.

Let's say a task woke up on CPU2, then it got migrated to CPU4 and
then it's switched-in to CPU4. So, here pre-migration wait time is
time that it was waiting on runqueue of CPU2 after it is woken up.

The general pattern for pre-migration to occur is:
sched:sched_wakeup
sched:sched_migrate_task
sched:sched_switch

The sched:sched_waking event is used to capture the wakeup time,
as it aligns with the existing code and only introduces a negligible
time difference.

pre-migrations are generally not useful and it increases migrations.
This metric would be helpful in testing patches mainly related to wakeup
and load-balancer code paths as better wakeup logic would choose an
optimal CPU where task would be switched-in and thereby reducing pre-
migrations.

The sample output(s) when -P or --pre-migrations is used:
=================
time cpu task name wait time sch delay run time pre-mig time
[tid/pid] (msec) (msec) (msec) (msec)
--------------- ------ ------------------------------ --------- --------- --------- ---------
38456.720806 [0001] schbench[28634/28574] 4.917 4.768 1.004 0.000
38456.720810 [0001] rcu_preempt[18] 3.919 0.003 0.004 0.000
38456.721800 [0006] schbench[28779/28574] 23.465 23.465 1.999 0.000
38456.722800 [0002] schbench[28773/28574] 60.371 60.237 3.955 60.197
38456.722806 [0001] schbench[28634/28574] 0.004 0.004 1.996 0.000
38456.722811 [0001] rcu_preempt[18] 1.996 0.005 0.005 0.000
38456.723800 [0000] schbench[28833/28574] 4.000 4.000 3.999 0.000
38456.723800 [0004] schbench[28762/28574] 42.951 42.839 3.999 39.867
38456.723802 [0007] schbench[28812/28574] 43.947 43.817 3.999 40.866
38456.723804 [0001] schbench[28587/28574] 7.935 7.822 0.993 0.000

Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Link: https://lore.kernel.org/r/20241004170756.18064-1-vineethr@linux.ibm.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>

authored by

Madadi Vineeth Reddy and committed by
Namhyung Kim
cd912ab3 af3902bf

+67 -39
+8
tools/perf/Documentation/perf-sched.txt
··· 221 221 priorities are specified with -: 120-129. A combination of both can also be 222 222 provided: 0,120-129. 223 223 224 + -P:: 225 + --pre-migrations:: 226 + Show pre-migration wait time. pre-migration wait time is the time spent 227 + by a task waiting on a runqueue but not getting the chance to run there 228 + and is migrated to a different runqueue where it is finally run. This 229 + time between sched_wakeup and migrate_task is the pre-migration wait 230 + time. 231 + 224 232 OPTIONS for 'perf sched replay' 225 233 ------------------------------ 226 234
+59 -39
tools/perf/builtin-sched.c
··· 225 225 bool show_wakeups; 226 226 bool show_next; 227 227 bool show_migrations; 228 + bool pre_migrations; 228 229 bool show_state; 229 230 bool show_prio; 230 231 u64 skipped_samples; ··· 245 244 u64 dt_iowait; /* time between CPU access by iowait (off cpu) */ 246 245 u64 dt_preempt; /* time between CPU access by preempt (off cpu) */ 247 246 u64 dt_delay; /* time between wakeup and sched-in */ 247 + u64 dt_pre_mig; /* time between migration and wakeup */ 248 248 u64 ready_to_run; /* time of wakeup */ 249 + u64 migrated; /* time when a thread is migrated */ 249 250 250 251 struct stats run_stats; 251 252 u64 total_run_time; ··· 255 252 u64 total_iowait_time; 256 253 u64 total_preempt_time; 257 254 u64 total_delay_time; 255 + u64 total_pre_mig_time; 258 256 259 257 char last_state; 260 258 ··· 2077 2073 printf(" "); 2078 2074 } 2079 2075 2080 - if (sched->show_prio) { 2081 - printf(" %-*s %-*s %9s %9s %9s", 2082 - comm_width, "task name", MAX_PRIO_STR_LEN, "prio", 2083 - "wait time", "sch delay", "run time"); 2084 - } else { 2085 - printf(" %-*s %9s %9s %9s", comm_width, 2086 - "task name", "wait time", "sch delay", "run time"); 2087 - } 2076 + printf(" %-*s", comm_width, "task name"); 2077 + 2078 + if (sched->show_prio) 2079 + printf(" %-*s", MAX_PRIO_STR_LEN, "prio"); 2080 + 2081 + printf(" %9s %9s %9s", "wait time", "sch delay", "run time"); 2082 + 2083 + if (sched->pre_migrations) 2084 + printf(" %9s", "pre-mig time"); 2088 2085 2089 2086 if (sched->show_state) 2090 2087 printf(" %s", "state"); ··· 2100 2095 if (sched->show_cpu_visual) 2101 2096 printf(" %*s ", ncpus, ""); 2102 2097 2103 - if (sched->show_prio) { 2104 - printf(" %-*s %-*s %9s %9s %9s", 2105 - comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "", 2106 - "(msec)", "(msec)", "(msec)"); 2107 - } else { 2108 - printf(" %-*s %9s %9s %9s", comm_width, 2109 - "[tid/pid]", "(msec)", "(msec)", "(msec)"); 2110 - } 2098 + printf(" %-*s", comm_width, "[tid/pid]"); 2111 2099 2112 - if (sched->show_state) 2113 - printf(" %5s", ""); 2100 + if (sched->show_prio) 2101 + printf(" %-*s", MAX_PRIO_STR_LEN, ""); 2102 + 2103 + printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)"); 2104 + 2105 + if (sched->pre_migrations) 2106 + printf(" %9s", "(msec)"); 2114 2107 2115 2108 printf("\n"); 2116 2109 ··· 2120 2117 if (sched->show_cpu_visual) 2121 2118 printf(" %.*s ", ncpus, graph_dotted_line); 2122 2119 2123 - if (sched->show_prio) { 2124 - printf(" %.*s %.*s %.9s %.9s %.9s", 2125 - comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line, 2126 - graph_dotted_line, graph_dotted_line, graph_dotted_line); 2127 - } else { 2128 - printf(" %.*s %.9s %.9s %.9s", comm_width, 2129 - graph_dotted_line, graph_dotted_line, graph_dotted_line, 2130 - graph_dotted_line); 2131 - } 2120 + printf(" %.*s", comm_width, graph_dotted_line); 2121 + 2122 + if (sched->show_prio) 2123 + printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line); 2124 + 2125 + printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line); 2126 + 2127 + if (sched->pre_migrations) 2128 + printf(" %.9s", graph_dotted_line); 2132 2129 2133 2130 if (sched->show_state) 2134 2131 printf(" %.5s", graph_dotted_line); ··· 2183 2180 2184 2181 print_sched_time(tr->dt_delay, 6); 2185 2182 print_sched_time(tr->dt_run, 6); 2183 + if (sched->pre_migrations) 2184 + print_sched_time(tr->dt_pre_mig, 6); 2186 2185 2187 2186 if (sched->show_state) 2188 2187 printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state); ··· 2222 2217 * last_time = time of last sched change event for current task 2223 2218 * (i.e, time process was last scheduled out) 2224 2219 * ready_to_run = time of wakeup for current task 2220 + * migrated = time of task migration to another CPU 2225 2221 * 2226 - * -----|------------|------------|------------|------ 2227 - * last ready tprev t 2222 + * -----|-------------|-------------|-------------|-------------|----- 2223 + * last ready migrated tprev t 2228 2224 * time to run 2229 2225 * 2230 - * |-------- dt_wait --------| 2231 - * |- dt_delay -|-- dt_run --| 2226 + * |---------------- dt_wait ----------------| 2227 + * |--------- dt_delay ---------|-- dt_run --| 2228 + * |- dt_pre_mig -| 2232 2229 * 2233 - * dt_run = run time of current task 2234 - * dt_wait = time between last schedule out event for task and tprev 2235 - * represents time spent off the cpu 2236 - * dt_delay = time between wakeup and schedule-in of task 2230 + * dt_run = run time of current task 2231 + * dt_wait = time between last schedule out event for task and tprev 2232 + * represents time spent off the cpu 2233 + * dt_delay = time between wakeup and schedule-in of task 2234 + * dt_pre_mig = time between wakeup and migration to another CPU 2237 2235 */ 2238 2236 2239 2237 static void timehist_update_runtime_stats(struct thread_runtime *r, ··· 2247 2239 r->dt_iowait = 0; 2248 2240 r->dt_preempt = 0; 2249 2241 r->dt_run = 0; 2242 + r->dt_pre_mig = 0; 2250 2243 2251 2244 if (tprev) { 2252 2245 r->dt_run = t - tprev; ··· 2256 2247 pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); 2257 2248 else 2258 2249 r->dt_delay = tprev - r->ready_to_run; 2250 + 2251 + if ((r->migrated > r->ready_to_run) && (r->migrated < tprev)) 2252 + r->dt_pre_mig = r->migrated - r->ready_to_run; 2259 2253 } 2260 2254 2261 2255 if (r->last_time > tprev) ··· 2282 2270 r->total_sleep_time += r->dt_sleep; 2283 2271 r->total_iowait_time += r->dt_iowait; 2284 2272 r->total_preempt_time += r->dt_preempt; 2273 + r->total_pre_mig_time += r->dt_pre_mig; 2285 2274 } 2286 2275 2287 2276 static bool is_idle_sample(struct perf_sample *sample, ··· 2696 2683 return -1; 2697 2684 2698 2685 tr->migrations++; 2686 + tr->migrated = sample->time; 2699 2687 2700 2688 /* show migrations if requested */ 2701 - timehist_print_migration_event(sched, evsel, sample, machine, thread); 2689 + if (sched->show_migrations) { 2690 + timehist_print_migration_event(sched, evsel, sample, 2691 + machine, thread); 2692 + } 2702 2693 2703 2694 return 0; 2704 2695 } ··· 2853 2836 /* last state is used to determine where to account wait time */ 2854 2837 tr->last_state = state; 2855 2838 2856 - /* sched out event for task so reset ready to run time */ 2839 + /* sched out event for task so reset ready to run time and migrated time */ 2857 2840 if (state == 'R') 2858 2841 tr->ready_to_run = t; 2859 2842 else 2860 2843 tr->ready_to_run = 0; 2844 + 2845 + tr->migrated = 0; 2861 2846 } 2862 2847 2863 2848 evsel__save_time(evsel, sample->time, sample->cpu); ··· 3299 3280 goto out; 3300 3281 } 3301 3282 3302 - if (sched->show_migrations && 3303 - perf_session__set_tracepoints_handlers(session, migrate_handlers)) 3283 + if ((sched->show_migrations || sched->pre_migrations) && 3284 + perf_session__set_tracepoints_handlers(session, migrate_handlers)) 3304 3285 goto out; 3305 3286 3306 3287 /* pre-allocate struct for per-CPU idle stats */ ··· 3842 3823 OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"), 3843 3824 OPT_STRING(0, "prio", &sched.prio_str, "prio", 3844 3825 "analyze events only for given task priority(ies)"), 3826 + OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"), 3845 3827 OPT_PARENT(sched_options) 3846 3828 }; 3847 3829