Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup

In many cases of hardlockup reports, it's actually not possible to know
why it triggered, because the CPU that got stuck is usually waiting on a
resource (with IRQs disabled) that some other CPU is holding.

IOW, we are often looking at the stacktrace of the victim and not the
actual offender.

Introduce a sysctl / cmdline parameter that makes it possible to have the
hardlockup detector perform an all-CPU backtrace.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Jiri Kosina and committed by
Linus Torvalds
55537871 ee7fed54

+55 -5
+5
Documentation/kernel-parameters.txt
··· 1269 1269 Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0. 1270 1270 Default: 1024 1271 1271 1272 + hardlockup_all_cpu_backtrace= 1273 + [KNL] Should the hard-lockup detector generate 1274 + backtraces on all cpus. 1275 + Format: <integer> 1276 + 1272 1277 hashdist= [KNL,NUMA] Large hashes allocated during boot 1273 1278 are distributed across NUMA nodes. Defaults on 1274 1279 for 64-bit NUMA, off otherwise.
+12
Documentation/sysctl/kernel.txt
··· 33 33 - domainname 34 34 - hostname 35 35 - hotplug 36 + - hardlockup_all_cpu_backtrace 36 37 - hung_task_panic 37 38 - hung_task_check_count 38 39 - hung_task_timeout_secs ··· 293 292 domain names are in general different. For a detailed discussion 294 293 see the hostname(1) man page. 295 294 295 + ============================================================== 296 + hardlockup_all_cpu_backtrace: 297 + 298 + This value controls the hard lockup detector behavior when a hard 299 + lockup condition is detected as to whether or not to gather further 300 + debug information. If enabled, arch-specific all-CPU stack dumping 301 + will be initiated. 302 + 303 + 0: do nothing. This is the default behavior. 304 + 305 + 1: on detection capture more debug information. 296 306 ============================================================== 297 307 298 308 hotplug:
+1
include/linux/nmi.h
··· 73 73 extern int watchdog_thresh; 74 74 extern unsigned long *watchdog_cpumask_bits; 75 75 extern int sysctl_softlockup_all_cpu_backtrace; 76 + extern int sysctl_hardlockup_all_cpu_backtrace; 76 77 struct ctl_table; 77 78 extern int proc_watchdog(struct ctl_table *, int , 78 79 void __user *, size_t *, loff_t *);
+9
kernel/sysctl.c
··· 898 898 .extra1 = &zero, 899 899 .extra2 = &one, 900 900 }, 901 + { 902 + .procname = "hardlockup_all_cpu_backtrace", 903 + .data = &sysctl_hardlockup_all_cpu_backtrace, 904 + .maxlen = sizeof(int), 905 + .mode = 0644, 906 + .proc_handler = proc_dointvec_minmax, 907 + .extra1 = &zero, 908 + .extra2 = &one, 909 + }, 901 910 #endif /* CONFIG_SMP */ 902 911 #endif 903 912 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+28 -5
kernel/watchdog.c
··· 57 57 58 58 #ifdef CONFIG_SMP 59 59 int __read_mostly sysctl_softlockup_all_cpu_backtrace; 60 + int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 60 61 #else 61 62 #define sysctl_softlockup_all_cpu_backtrace 0 63 + #define sysctl_hardlockup_all_cpu_backtrace 0 62 64 #endif 63 65 static struct cpumask watchdog_cpumask __read_mostly; 64 66 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); ··· 114 112 #ifdef CONFIG_HARDLOCKUP_DETECTOR 115 113 static int hardlockup_panic = 116 114 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 115 + static unsigned long hardlockup_allcpu_dumped; 117 116 /* 118 117 * We may not want to enable hard lockup detection by default in all cases, 119 118 * for example when running the kernel as a guest on a hypervisor. In these ··· 176 173 return 1; 177 174 } 178 175 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 176 + static int __init hardlockup_all_cpu_backtrace_setup(char *str) 177 + { 178 + sysctl_hardlockup_all_cpu_backtrace = 179 + !!simple_strtol(str, NULL, 0); 180 + return 1; 181 + } 182 + __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); 179 183 #endif 180 184 181 185 /* ··· 328 318 */ 329 319 if (is_hardlockup()) { 330 320 int this_cpu = smp_processor_id(); 321 + struct pt_regs *regs = get_irq_regs(); 331 322 332 323 /* only print hardlockups once */ 333 324 if (__this_cpu_read(hard_watchdog_warn) == true) 334 325 return; 335 326 336 - if (hardlockup_panic) 337 - panic("Watchdog detected hard LOCKUP on cpu %d", 338 - this_cpu); 327 + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 328 + print_modules(); 329 + print_irqtrace_events(current); 330 + if (regs) 331 + show_regs(regs); 339 332 else 340 - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", 341 - this_cpu); 333 + dump_stack(); 334 + 335 + /* 336 + * Perform all-CPU dump only once to avoid multiple hardlockups 337 + * generating interleaving traces 338 + */ 339 + if (sysctl_hardlockup_all_cpu_backtrace && 340 + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) 341 + trigger_allbutself_cpu_backtrace(); 342 + 343 + if (hardlockup_panic) 344 + panic("Hard LOCKUP"); 342 345 343 346 __this_cpu_write(hard_watchdog_warn, true); 344 347 return;