Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rcu/nocb: Unify timers

Now that ->nocb_timer and ->nocb_bypass_timer have become quite similar,
this commit merges them together. A new RCU_NOCB_WAKE_BYPASS wake level
is introduced. As a result, timers perform all kinds of deferred
wakeups, but other deferred wakeup callsites only handle non-bypass
wakeups in order not to wake up rcuo too early.

The timer also unconditionally executes a full barrier so as to order
timer_pending() and callback enqueue, although the usefulness of the
path performing RCU_NOCB_WAKE_FORCE that relies on it is debatable.
That path should also test against the rdp leader instead of the
current rdp.

This unconditional full barrier shouldn't bring visible overhead since
these timers almost never fire.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

authored by

Frederic Weisbecker and committed by
Paul E. McKenney
e75bcd48 87090516

+43 -56
+1
include/trace/events/rcu.h
··· 278 278 * "WakeNot": Don't wake rcuo kthread. 279 279 * "WakeNotPoll": Don't wake rcuo kthread because it is polling. 280 280 * "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge. 281 + * "WakeBypassIsDeferred": Wake rcuo kthread later, bypass list is contended. 281 282 * "WokeEmpty": rcuo CB kthread woke to find empty list. 282 283 */ 283 284 TRACE_EVENT_RCU(rcu_nocb_wake,
+3 -3
kernel/rcu/tree.h
··· 218 218 219 219 /* The following fields are used by GP kthread, hence own cacheline. */ 220 220 raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; 221 - struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */ 222 221 u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */ 223 222 u8 nocb_gp_bypass; /* Found a bypass on last scan? */ 224 223 u8 nocb_gp_gp; /* GP to wait for on last scan? */ ··· 257 258 258 259 /* Values for nocb_defer_wakeup field in struct rcu_data. */ 259 260 #define RCU_NOCB_WAKE_NOT 0 260 - #define RCU_NOCB_WAKE 1 261 - #define RCU_NOCB_WAKE_FORCE 2 261 + #define RCU_NOCB_WAKE_BYPASS 1 262 + #define RCU_NOCB_WAKE 2 263 + #define RCU_NOCB_WAKE_FORCE 3 262 264 263 265 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) 264 266 /* For jiffies_till_first_fqs and */
+39 -53
kernel/rcu/tree_plugin.h
··· 1701 1701 del_timer(&rdp_gp->nocb_timer); 1702 1702 } 1703 1703 1704 - del_timer(&rdp_gp->nocb_bypass_timer); 1705 - 1706 1704 if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { 1707 1705 WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); 1708 1706 needwake = true; ··· 1738 1740 1739 1741 raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); 1740 1742 1741 - if (rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) 1742 - mod_timer(&rdp_gp->nocb_timer, jiffies + 1); 1743 - if (rdp_gp->nocb_defer_wakeup < waketype) 1743 + /* 1744 + * Bypass wakeup overrides previous deferments. In case 1745 + * of callback storm, no need to wake up too early. 1746 + */ 1747 + if (waketype == RCU_NOCB_WAKE_BYPASS) { 1748 + mod_timer(&rdp_gp->nocb_timer, jiffies + 2); 1744 1749 WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); 1750 + } else { 1751 + if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) 1752 + mod_timer(&rdp_gp->nocb_timer, jiffies + 1); 1753 + if (rdp_gp->nocb_defer_wakeup < waketype) 1754 + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); 1755 + } 1745 1756 1746 1757 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); 1747 1758 ··· 2002 1995 smp_mb(); /* Enqueue before timer_pending(). */ 2003 1996 if ((rdp->nocb_cb_sleep || 2004 1997 !rcu_segcblist_ready_cbs(&rdp->cblist)) && 2005 - !timer_pending(&rdp->nocb_bypass_timer)) { 1998 + !timer_pending(&rdp->nocb_timer)) { 2006 1999 rcu_nocb_unlock_irqrestore(rdp, flags); 2007 2000 wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, 2008 2001 TPS("WakeOvfIsDeferred")); ··· 2015 2008 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); 2016 2009 } 2017 2010 return; 2018 - } 2019 - 2020 - /* Wake up the no-CBs GP kthread to flush ->nocb_bypass. 
*/ 2021 - static void do_nocb_bypass_wakeup_timer(struct timer_list *t) 2022 - { 2023 - unsigned long flags; 2024 - struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer); 2025 - 2026 - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); 2027 - 2028 - raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); 2029 - smp_mb__after_spinlock(); /* Timer expire before wakeup. */ 2030 - __wake_nocb_gp(rdp, rdp, false, flags); 2031 2011 } 2032 2012 2033 2013 /* ··· 2169 2175 my_rdp->nocb_gp_bypass = bypass; 2170 2176 my_rdp->nocb_gp_gp = needwait_gp; 2171 2177 my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; 2172 - if (bypass) { 2173 - if (!rcu_nocb_poll) { 2174 - raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); 2175 - // Avoid race with first bypass CB. 2176 - if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { 2177 - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2178 - del_timer(&my_rdp->nocb_timer); 2179 - } 2180 - // At least one child with non-empty ->nocb_bypass, so set 2181 - // timer in order to avoid stranding its callbacks. 2182 - mod_timer(&my_rdp->nocb_bypass_timer, j + 2); 2183 - raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); 2184 - } 2178 + 2179 + if (bypass && !rcu_nocb_poll) { 2180 + // At least one child with non-empty ->nocb_bypass, so set 2181 + // timer in order to avoid stranding its callbacks. 2182 + wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, 2183 + TPS("WakeBypassIsDeferred")); 2185 2184 } 2186 2185 if (rcu_nocb_poll) { 2187 2186 /* Polling, so trace if first poll in the series. */ ··· 2198 2211 } 2199 2212 if (!rcu_nocb_poll) { 2200 2213 raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); 2201 - if (bypass) 2202 - del_timer(&my_rdp->nocb_bypass_timer); 2203 2214 if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { 2204 2215 WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2205 2216 del_timer(&my_rdp->nocb_timer); ··· 2343 2358 } 2344 2359 2345 2360 /* Do a deferred wakeup of rcu_nocb_kthread(). 
*/ 2346 - static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp, 2347 - int level) 2361 + static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, 2362 + struct rcu_data *rdp, int level, 2363 + unsigned long flags) 2364 + __releases(rdp_gp->nocb_gp_lock) 2348 2365 { 2349 - unsigned long flags; 2350 2366 int ndw; 2351 - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; 2352 2367 int ret; 2353 - 2354 - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); 2355 2368 2356 2369 if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { 2357 2370 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); ··· 2366 2383 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ 2367 2384 static void do_nocb_deferred_wakeup_timer(struct timer_list *t) 2368 2385 { 2386 + unsigned long flags; 2369 2387 struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); 2370 2388 2371 - do_nocb_deferred_wakeup_common(rdp, RCU_NOCB_WAKE); 2389 + WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); 2390 + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); 2391 + 2392 + raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); 2393 + smp_mb__after_spinlock(); /* Timer expire before wakeup. 
*/ 2394 + do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); 2372 2395 } 2373 2396 2374 2397 /* ··· 2384 2395 */ 2385 2396 static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) 2386 2397 { 2387 - if (!rdp->nocb_gp_rdp) 2398 + unsigned long flags; 2399 + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; 2400 + 2401 + if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) 2388 2402 return false; 2389 2403 2390 - if (rcu_nocb_need_deferred_wakeup(rdp->nocb_gp_rdp, RCU_NOCB_WAKE)) 2391 - return do_nocb_deferred_wakeup_common(rdp, RCU_NOCB_WAKE); 2392 - return false; 2404 + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); 2405 + return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); 2393 2406 } 2394 2407 2395 2408 void rcu_nocb_flush_deferred_wakeup(void) ··· 2635 2644 raw_spin_lock_init(&rdp->nocb_bypass_lock); 2636 2645 raw_spin_lock_init(&rdp->nocb_gp_lock); 2637 2646 timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); 2638 - timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0); 2639 2647 rcu_cblist_init(&rdp->nocb_bypass); 2640 2648 } 2641 2649 ··· 2793 2803 { 2794 2804 struct rcu_node *rnp = rdp->mynode; 2795 2805 2796 - pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", 2806 + pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", 2797 2807 rdp->cpu, 2798 2808 "kK"[!!rdp->nocb_gp_kthread], 2799 2809 "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], 2800 2810 "dD"[!!rdp->nocb_defer_wakeup], 2801 2811 "tT"[timer_pending(&rdp->nocb_timer)], 2802 - "bB"[timer_pending(&rdp->nocb_bypass_timer)], 2803 2812 "sS"[!!rdp->nocb_gp_sleep], 2804 2813 ".W"[swait_active(&rdp->nocb_gp_wq)], 2805 2814 ".W"[swait_active(&rnp->nocb_gp_wq[0])], ··· 2819 2830 char bufr[20]; 2820 2831 struct rcu_segcblist *rsclp = &rdp->cblist; 2821 2832 bool waslocked; 2822 - bool wastimer; 2823 2833 bool wassleep; 2824 2834 2825 2835 if (rdp->nocb_gp_rdp == rdp) ··· 
2855 2867 return; 2856 2868 2857 2869 waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); 2858 - wastimer = timer_pending(&rdp->nocb_bypass_timer); 2859 2870 wassleep = swait_active(&rdp->nocb_gp_wq); 2860 - if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) 2871 + if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) 2861 2872 return; /* Nothing untowards. */ 2862 2873 2863 - pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", 2874 + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", 2864 2875 "lL"[waslocked], 2865 2876 "dD"[!!rdp->nocb_defer_wakeup], 2866 - "tT"[wastimer], 2867 2877 "sS"[!!rdp->nocb_gp_sleep], 2868 2878 ".W"[wassleep]); 2869 2879 }