Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rcu: Enable tick for nohz_full CPUs slow to provide expedited QS

An expedited grace period can be stalled by a nohz_full CPU looping
in kernel context. This possibility is currently handled by some
carefully crafted checks in rcu_read_unlock_special() that enlist help
from ksoftirqd when permitted by the scheduler. However, it is exactly
these checks that require that the scheduler avoid holding any of its rq
or pi locks across rcu_read_unlock() without also having held them
across the entire RCU read-side critical section.

It would therefore be very nice if expedited grace periods could
handle nohz_full CPUs looping in kernel context without such checks.
This commit therefore adds code to the expedited grace period's wait
and cleanup code that forces the scheduler-clock interrupt on for CPUs
that fail to quickly supply a quiescent state. "Quickly" is currently
a hard-coded single-jiffy delay.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

+50 -8
+4 -1
include/linux/tick.h
··· 109 109 TICK_DEP_BIT_PERF_EVENTS = 1, 110 110 TICK_DEP_BIT_SCHED = 2, 111 111 TICK_DEP_BIT_CLOCK_UNSTABLE = 3, 112 - TICK_DEP_BIT_RCU = 4 112 + TICK_DEP_BIT_RCU = 4, 113 + TICK_DEP_BIT_RCU_EXP = 5 113 114 }; 115 + #define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP 114 116 115 117 #define TICK_DEP_MASK_NONE 0 116 118 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) ··· 120 118 #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) 121 119 #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) 122 120 #define TICK_DEP_MASK_RCU (1 << TICK_DEP_BIT_RCU) 121 + #define TICK_DEP_MASK_RCU_EXP (1 << TICK_DEP_BIT_RCU_EXP) 123 122 124 123 #ifdef CONFIG_NO_HZ_COMMON 125 124 extern bool tick_nohz_enabled;
+1
kernel/rcu/tree.h
··· 182 182 bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ 183 183 bool rcu_urgent_qs; /* GP old need light quiescent state. */ 184 184 bool rcu_forced_tick; /* Forced tick to provide QS. */ 185 + bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ 185 186 #ifdef CONFIG_RCU_FAST_NO_HZ 186 187 bool all_lazy; /* All CPU's CBs lazy at idle start? */ 187 188 unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
+45 -7
kernel/rcu/tree_exp.h
··· 230 230 static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, 231 231 unsigned long mask, bool wake) 232 232 { 233 + int cpu; 233 234 unsigned long flags; 235 + struct rcu_data *rdp; 234 236 235 237 raw_spin_lock_irqsave_rcu_node(rnp, flags); 236 238 if (!(rnp->expmask & mask)) { ··· 240 238 return; 241 239 } 242 240 WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); 241 + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { 242 + rdp = per_cpu_ptr(&rcu_data, cpu); 243 + if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp) 244 + continue; 245 + rdp->rcu_forced_tick_exp = false; 246 + tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP); 247 + } 243 248 __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ 244 249 } 245 250 ··· 459 450 } 460 451 461 452 /* 453 + * Wait for the expedited grace period to elapse, within time limit. 454 + * If the time limit is exceeded without the grace period elapsing, 455 + * return false, otherwise return true. 456 + */ 457 + static bool synchronize_rcu_expedited_wait_once(long tlimit) 458 + { 459 + int t; 460 + struct rcu_node *rnp_root = rcu_get_root(); 461 + 462 + t = swait_event_timeout_exclusive(rcu_state.expedited_wq, 463 + sync_rcu_exp_done_unlocked(rnp_root), 464 + tlimit); 465 + // Workqueues should not be signaled. 466 + if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root)) 467 + return true; 468 + WARN_ON(t < 0); /* workqueues should not be signaled. */ 469 + return false; 470 + } 471 + 472 + /* 462 473 * Wait for the expedited grace period to elapse, issuing any needed 463 474 * RCU CPU stall warnings along the way. 
464 475 */ ··· 489 460 unsigned long jiffies_start; 490 461 unsigned long mask; 491 462 int ndetected; 463 + struct rcu_data *rdp; 492 464 struct rcu_node *rnp; 493 465 struct rcu_node *rnp_root = rcu_get_root(); 494 - int ret; 495 466 496 467 trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); 497 468 jiffies_stall = rcu_jiffies_till_stall_check(); 498 469 jiffies_start = jiffies; 470 + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { 471 + if (synchronize_rcu_expedited_wait_once(1)) 472 + return; 473 + rcu_for_each_leaf_node(rnp) { 474 + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { 475 + rdp = per_cpu_ptr(&rcu_data, cpu); 476 + if (rdp->rcu_forced_tick_exp) 477 + continue; 478 + rdp->rcu_forced_tick_exp = true; 479 + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); 480 + } 481 + } 482 + WARN_ON_ONCE(1); 483 + } 499 484 500 485 for (;;) { 501 - ret = swait_event_timeout_exclusive( 502 - rcu_state.expedited_wq, 503 - sync_rcu_exp_done_unlocked(rnp_root), 504 - jiffies_stall); 505 - if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root)) 486 + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) 506 487 return; 507 - WARN_ON(ret < 0); /* workqueues should not be signaled. */ 508 488 if (rcu_cpu_stall_suppress) 509 489 continue; 510 490 panic_on_rcu_stall();