Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rcu: RCU-based detection of stalled CPUs for Classic RCU

This patch adds stalled-CPU detection to Classic RCU. This capability
is enabled by a new config variable, CONFIG_RCU_CPU_STALL_DETECTOR, which
is disabled by default.

This is a debugging feature to detect infinite loops in kernel code, not
something that non-kernel-hackers would be expected to care about.

This feature can detect looping CPUs in !PREEMPT builds and looping CPUs
with preemption disabled in PREEMPT builds. This is essentially a port of
this functionality from the treercu patch, replacing the stall debug patch
that is already in tip/core/rcu (commit 67182ae1c4).

The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by

Paul E. McKenney and committed by
Ingo Molnar
2133b5d7 b5259d94

+96 -84
+9 -3
include/linux/rcuclassic.h
··· 40 40 #include <linux/cpumask.h> 41 41 #include <linux/seqlock.h> 42 42 43 + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR 44 + #define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ 45 + #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ 46 + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 43 47 44 48 /* Global control variables for rcupdate callback mechanism. */ 45 49 struct rcu_ctrlblk { 46 50 long cur; /* Current batch number. */ 47 51 long completed; /* Number of the last completed batch */ 48 52 long pending; /* Number of the last pending batch */ 49 - #ifdef CONFIG_DEBUG_RCU_STALL 50 - unsigned long gp_check; /* Time grace period should end, in seconds. */ 51 - #endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ 53 + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR 54 + unsigned long gp_start; /* Time at which GP started in jiffies. */ 55 + unsigned long jiffies_stall; 56 + /* Time at which to check for CPU stalls. */ 57 + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 52 58 53 59 int signaled; 54 60
+86 -80
kernel/rcuclassic.c
··· 164 164 } 165 165 } 166 166 167 + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR 168 + 169 + static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) 170 + { 171 + rcp->gp_start = jiffies; 172 + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; 173 + } 174 + 175 + static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) 176 + { 177 + int cpu; 178 + long delta; 179 + unsigned long flags; 180 + 181 + /* Only let one CPU complain about others per time interval. */ 182 + 183 + spin_lock_irqsave(&rcp->lock, flags); 184 + delta = jiffies - rcp->jiffies_stall; 185 + if (delta < 2 || rcp->cur != rcp->completed) { 186 + spin_unlock_irqrestore(&rcp->lock, flags); 187 + return; 188 + } 189 + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 190 + spin_unlock_irqrestore(&rcp->lock, flags); 191 + 192 + /* OK, time to rat on our buddy... */ 193 + 194 + printk(KERN_ERR "RCU detected CPU stalls:"); 195 + for_each_possible_cpu(cpu) { 196 + if (cpu_isset(cpu, rcp->cpumask)) 197 + printk(" %d", cpu); 198 + } 199 + printk(" (detected by %d, t=%ld jiffies)\n", 200 + smp_processor_id(), (long)(jiffies - rcp->gp_start)); 201 + } 202 + 203 + static void print_cpu_stall(struct rcu_ctrlblk *rcp) 204 + { 205 + unsigned long flags; 206 + 207 + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 208 + smp_processor_id(), jiffies, 209 + jiffies - rcp->gp_start); 210 + dump_stack(); 211 + spin_lock_irqsave(&rcp->lock, flags); 212 + if ((long)(jiffies - rcp->jiffies_stall) >= 0) 213 + rcp->jiffies_stall = 214 + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 215 + spin_unlock_irqrestore(&rcp->lock, flags); 216 + set_need_resched(); /* kick ourselves to get things going. */ 217 + } 218 + 219 + static void check_cpu_stall(struct rcu_ctrlblk *rcp) 220 + { 221 + long delta; 222 + 223 + delta = jiffies - rcp->jiffies_stall; 224 + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { 225 + 226 + /* We haven't checked in, so go dump stack. 
*/ 227 + print_cpu_stall(rcp); 228 + 229 + } else if (rcp->cur != rcp->completed && delta >= 2) { 230 + 231 + /* They had two seconds to dump stack, so complain. */ 232 + print_other_cpu_stall(rcp); 233 + } 234 + } 235 + 236 + #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 237 + 238 + static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) 239 + { 240 + } 241 + 242 + static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 243 + { 244 + } 245 + 246 + #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 247 + 167 248 /** 168 249 * call_rcu - Queue an RCU callback for invocation after a grace period. 169 250 * @head: structure to be used for queueing the RCU updates. ··· 374 293 * period (if necessary). 375 294 */ 376 295 377 - #ifdef CONFIG_DEBUG_RCU_STALL 378 - 379 - static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) 380 - { 381 - rcp->gp_check = get_seconds() + 3; 382 - } 383 - 384 - static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) 385 - { 386 - int cpu; 387 - long delta; 388 - unsigned long flags; 389 - 390 - /* Only let one CPU complain about others per time interval. */ 391 - 392 - spin_lock_irqsave(&rcp->lock, flags); 393 - delta = get_seconds() - rcp->gp_check; 394 - if (delta < 2L || cpus_empty(rcp->cpumask)) { 395 - spin_unlock(&rcp->lock); 396 - return; 397 - } 398 - rcp->gp_check = get_seconds() + 30; 399 - spin_unlock_irqrestore(&rcp->lock, flags); 400 - 401 - /* OK, time to rat on our buddy... 
*/ 402 - 403 - printk(KERN_ERR "RCU detected CPU stalls:"); 404 - for_each_cpu_mask(cpu, rcp->cpumask) 405 - printk(" %d", cpu); 406 - printk(" (detected by %d, t=%lu/%lu)\n", 407 - smp_processor_id(), get_seconds(), rcp->gp_check); 408 - } 409 - 410 - static void print_cpu_stall(struct rcu_ctrlblk *rcp) 411 - { 412 - unsigned long flags; 413 - 414 - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", 415 - smp_processor_id(), get_seconds(), rcp->gp_check); 416 - dump_stack(); 417 - spin_lock_irqsave(&rcp->lock, flags); 418 - if ((long)(get_seconds() - rcp->gp_check) >= 0L) 419 - rcp->gp_check = get_seconds() + 30; 420 - spin_unlock_irqrestore(&rcp->lock, flags); 421 - } 422 - 423 - static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 424 - { 425 - long delta; 426 - 427 - delta = get_seconds() - rcp->gp_check; 428 - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { 429 - 430 - /* We haven't checked in, so go dump stack. */ 431 - 432 - print_cpu_stall(rcp); 433 - 434 - } else { 435 - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { 436 - /* They had two seconds to dump stack, so complain. */ 437 - print_other_cpu_stall(rcp); 438 - } 439 - } 440 - } 441 - 442 - #else /* #ifdef CONFIG_DEBUG_RCU_STALL */ 443 - 444 - static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) 445 - { 446 - } 447 - 448 - static inline void 449 - check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 450 - { 451 - } 452 - 453 - #endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ 454 - 455 296 /* 456 297 * Register a new batch of callbacks, and start it up if there is currently no 457 298 * active batch and the batch to be registered has not already occurred. 
··· 384 381 if (rcp->cur != rcp->pending && 385 382 rcp->completed == rcp->cur) { 386 383 rcp->cur++; 387 - record_gp_check_time(rcp); 384 + record_gp_stall_check_time(rcp); 388 385 389 386 /* 390 387 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a ··· 606 603 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 607 604 { 608 605 /* Check for CPU stalls, if enabled. */ 609 - check_cpu_stall(rcp, rdp); 606 + check_cpu_stall(rcp); 610 607 611 608 if (rdp->nxtlist) { 612 609 long completed_snap = ACCESS_ONCE(rcp->completed); ··· 772 769 */ 773 770 void __init __rcu_init(void) 774 771 { 772 + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR 773 + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 774 + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 775 775 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 776 776 (void *)(long)smp_processor_id()); 777 777 /* Register notifier for non-boot CPUs */
+1 -1
lib/Kconfig.debug
··· 597 597 Say N here if you want the RCU torture tests to start only 598 598 after being manually enabled via /proc. 599 599 600 - config RCU_CPU_STALL 600 + config RCU_CPU_STALL_DETECTOR 601 601 bool "Check for stalled CPUs delaying RCU grace periods" 602 602 depends on CLASSIC_RCU 603 603 default n