Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: Add a tracepoint to track rq->nr_running

Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
the ->nr_running count of a CPU's rq. This is used to accurately trace this data and
provide a visualization of scheduler imbalances in, for example, the
form of a heat map. The tracepoint is accessed by loading an external
kernel module. An example module (forked from Qais' module and including
the pelt related tracepoints) can be found at:

https://github.com/auldp/tracepoints-helpers.git

A script to turn the trace-cmd report output into a heatmap plot can be
found at:

https://github.com/jirvoz/plot-nr-running

The tracepoints are added to add_nr_running() and sub_nr_running() which
are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in
the header a wrapper call is used and the trace/events/sched.h include
is moved before sched.h in kernel/sched/core.

Signed-off-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200629192303.GC120228@lorien.usersys.redhat.com

Authored by Phil Auld; committed by Peter Zijlstra
9d246053 07bbecb3

+30 -8
+1
include/linux/sched.h
··· 2044 2044 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); 2045 2045 2046 2046 int sched_trace_rq_cpu(struct rq *rq); 2047 + int sched_trace_rq_nr_running(struct rq *rq); 2047 2048 2048 2049 const struct cpumask *sched_trace_rd_span(struct root_domain *rd); 2049 2050
+4
include/trace/events/sched.h
··· 642 642 TP_PROTO(struct sched_entity *se), 643 643 TP_ARGS(se)); 644 644 645 + DECLARE_TRACE(sched_update_nr_running_tp, 646 + TP_PROTO(struct rq *rq, int change), 647 + TP_ARGS(rq, change)); 648 + 645 649 #endif /* _TRACE_SCHED_H */ 646 650 647 651 /* This part must be outside protection */
+9 -4
kernel/sched/core.c
··· 6 6 * 7 7 * Copyright (C) 1991-2002 Linus Torvalds 8 8 */ 9 + #define CREATE_TRACE_POINTS 10 + #include <trace/events/sched.h> 11 + #undef CREATE_TRACE_POINTS 12 + 9 13 #include "sched.h" 10 14 11 15 #include <linux/nospec.h> ··· 27 23 #include "pelt.h" 28 24 #include "smp.h" 29 25 30 - #define CREATE_TRACE_POINTS 31 - #include <trace/events/sched.h> 32 - 33 26 /* 34 27 * Export tracepoints that act as a bare tracehook (ie: have no trace event 35 28 * associated with them) to allow external modules to probe them. ··· 39 38 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); 40 39 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); 41 40 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); 41 + EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); 42 42 43 43 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 44 44 ··· 8197 8195 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 8198 8196 }; 8199 8197 8200 - #undef CREATE_TRACE_POINTS 8198 + void call_trace_sched_update_nr_running(struct rq *rq, int count) 8199 + { 8200 + trace_sched_update_nr_running_tp(rq, count); 8201 + }
+6 -2
kernel/sched/fair.c
··· 22 22 */ 23 23 #include "sched.h" 24 24 25 - #include <trace/events/sched.h> 26 - 27 25 /* 28 26 * Targeted preemption latency for CPU-bound tasks: 29 27 * ··· 11294 11296 #endif 11295 11297 } 11296 11298 EXPORT_SYMBOL_GPL(sched_trace_rd_span); 11299 + 11300 + int sched_trace_rq_nr_running(struct rq *rq) 11301 + { 11302 + return rq ? rq->nr_running : -1; 11303 + } 11304 + EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
-2
kernel/sched/pelt.c
··· 28 28 #include "sched.h" 29 29 #include "pelt.h" 30 30 31 - #include <trace/events/sched.h> 32 - 33 31 /* 34 32 * Approximate: 35 33 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+10
kernel/sched/sched.h
··· 76 76 #include "cpupri.h" 77 77 #include "cpudeadline.h" 78 78 79 + #include <trace/events/sched.h> 80 + 79 81 #ifdef CONFIG_SCHED_DEBUG 80 82 # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) 81 83 #else ··· 99 97 extern void calc_global_load_tick(struct rq *this_rq); 100 98 extern long calc_load_fold_active(struct rq *this_rq, long adjust); 101 99 100 + extern void call_trace_sched_update_nr_running(struct rq *rq, int count); 102 101 /* 103 102 * Helpers for converting nanosecond timing to jiffy resolution 104 103 */ ··· 1976 1973 unsigned prev_nr = rq->nr_running; 1977 1974 1978 1975 rq->nr_running = prev_nr + count; 1976 + if (trace_sched_update_nr_running_tp_enabled()) { 1977 + call_trace_sched_update_nr_running(rq, count); 1978 + } 1979 1979 1980 1980 #ifdef CONFIG_SMP 1981 1981 if (prev_nr < 2 && rq->nr_running >= 2) { ··· 1993 1987 static inline void sub_nr_running(struct rq *rq, unsigned count) 1994 1988 { 1995 1989 rq->nr_running -= count; 1990 + if (trace_sched_update_nr_running_tp_enabled()) { 1991 + call_trace_sched_update_nr_running(rq, count); 1992 + } 1993 + 1996 1994 /* Check if we still need preemption */ 1997 1995 sched_update_tick_dependency(rq); 1998 1996 }