Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tracepoint: Make rcuidle tracepoint callers use SRCU

In recent tests with IRQ on/off tracepoints, a large performance
overhead of ~10% was noticed when running hackbench. This is root caused to
calls to rcu_irq_enter_irqson and rcu_irq_exit_irqson from the
tracepoint code. Following a long discussion on the list [1] about this,
we concluded that SRCU is a better alternative for use during rcu idle.
Although it does involve extra barriers, it is lighter than the sched-rcu
version which has to do additional RCU calls to notify RCU idle about
entry into RCU sections.

In this patch, we change the underlying implementation of the
trace_*_rcuidle API to use SRCU. This has been shown to improve performance
a lot for the high frequency irq enable/disable tracepoints.

Test: Tested idle and preempt/irq tracepoints.

Here are some performance numbers:

With a run of the following 30 times on a single core x86 Qemu instance
with 1GB memory:
hackbench -g 4 -f 2 -l 3000

Completion times in seconds. CONFIG_PROVE_LOCKING=y.

No patches (without this series)
Mean: 3.048
Median: 3.025
Std Dev: 0.064

With Lockdep using irq tracepoints with RCU implementation:
Mean: 3.451 (-11.66 %)
Median: 3.447 (-12.22%)
Std Dev: 0.049

With Lockdep using irq tracepoints with SRCU implementation (this series):
Mean: 3.020 (I would consider the improvement against the "without
this series" case as just noise).
Median: 3.013
Std Dev: 0.033

[1] https://patchwork.kernel.org/patch/10344297/

[remove rcu_read_lock_sched_notrace as it is the equivalent of
preempt_disable_notrace and is unnecessary to call in tracepoint code]
Link: http://lkml.kernel.org/r/20180730222423.196630-3-joel@joelfernandes.org

Cleaned-up-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
[ Simplified WARN_ON_ONCE() ]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

authored by

Joel Fernandes (Google) and committed by
Steven Rostedt (VMware)
e6753f23 01f38497

+47 -9
+32 -8
include/linux/tracepoint.h
··· 15 15 */ 16 16 17 17 #include <linux/smp.h> 18 + #include <linux/srcu.h> 18 19 #include <linux/errno.h> 19 20 #include <linux/types.h> 20 21 #include <linux/cpumask.h> ··· 33 32 }; 34 33 35 34 #define TRACEPOINT_DEFAULT_PRIO 10 35 + 36 + extern struct srcu_struct tracepoint_srcu; 36 37 37 38 extern int 38 39 tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); ··· 78 75 * probe unregistration and the end of module exit to make sure there is no 79 76 * caller executing a probe when it is freed. 80 77 */ 78 + #ifdef CONFIG_TRACEPOINTS 81 79 static inline void tracepoint_synchronize_unregister(void) 82 80 { 81 + synchronize_srcu(&tracepoint_srcu); 83 82 synchronize_sched(); 84 83 } 84 + #else 85 + static inline void tracepoint_synchronize_unregister(void) 86 + { } 87 + #endif 85 88 86 89 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 87 90 extern int syscall_regfunc(void); ··· 138 129 * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just 139 130 * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". 140 131 */ 141 - #define __DO_TRACE(tp, proto, args, cond, rcucheck) \ 132 + #define __DO_TRACE(tp, proto, args, cond, rcuidle) \ 142 133 do { \ 143 134 struct tracepoint_func *it_func_ptr; \ 144 135 void *it_func; \ 145 136 void *__data; \ 137 + int __maybe_unused idx = 0; \ 146 138 \ 147 139 if (!(cond)) \ 148 140 return; \ 149 - if (rcucheck) \ 150 - rcu_irq_enter_irqson(); \ 151 - rcu_read_lock_sched_notrace(); \ 152 - it_func_ptr = rcu_dereference_sched((tp)->funcs); \ 141 + \ 142 + /* srcu can't be used from NMI */ \ 143 + WARN_ON_ONCE(rcuidle && in_nmi()); \ 144 + \ 145 + /* keep srcu and sched-rcu usage consistent */ \ 146 + preempt_disable_notrace(); \ 147 + \ 148 + /* \ 149 + * For rcuidle callers, use srcu since sched-rcu \ 150 + * doesn't work from the idle path. 
\ 151 + */ \ 152 + if (rcuidle) \ 153 + idx = srcu_read_lock_notrace(&tracepoint_srcu); \ 154 + \ 155 + it_func_ptr = rcu_dereference_raw((tp)->funcs); \ 156 + \ 153 157 if (it_func_ptr) { \ 154 158 do { \ 155 159 it_func = (it_func_ptr)->func; \ ··· 170 148 ((void(*)(proto))(it_func))(args); \ 171 149 } while ((++it_func_ptr)->func); \ 172 150 } \ 173 - rcu_read_unlock_sched_notrace(); \ 174 - if (rcucheck) \ 175 - rcu_irq_exit_irqson(); \ 151 + \ 152 + if (rcuidle) \ 153 + srcu_read_unlock_notrace(&tracepoint_srcu, idx);\ 154 + \ 155 + preempt_enable_notrace(); \ 176 156 } while (0) 177 157 178 158 #ifndef MODULE
+15 -1
kernel/tracepoint.c
··· 31 31 extern struct tracepoint * const __start___tracepoints_ptrs[]; 32 32 extern struct tracepoint * const __stop___tracepoints_ptrs[]; 33 33 34 + DEFINE_SRCU(tracepoint_srcu); 35 + EXPORT_SYMBOL_GPL(tracepoint_srcu); 36 + 34 37 /* Set to 1 to enable tracepoint debug output */ 35 38 static const int tracepoint_debug; 36 39 ··· 70 67 return p == NULL ? NULL : p->probes; 71 68 } 72 69 73 - static void rcu_free_old_probes(struct rcu_head *head) 70 + static void srcu_free_old_probes(struct rcu_head *head) 74 71 { 75 72 kfree(container_of(head, struct tp_probes, rcu)); 73 + } 74 + 75 + static void rcu_free_old_probes(struct rcu_head *head) 76 + { 77 + call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); 76 78 } 77 79 78 80 static inline void release_probes(struct tracepoint_func *old) ··· 85 77 if (old) { 86 78 struct tp_probes *tp_probes = container_of(old, 87 79 struct tp_probes, probes[0]); 80 + /* 81 + * Tracepoint probes are protected by both sched RCU and SRCU, 82 + * by calling the SRCU callback in the sched RCU callback we 83 + * cover both cases. So let us chain the SRCU and sched RCU 84 + * callbacks to wait for both grace periods. 85 + */ 88 86 call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); 89 87 } 90 88 }