Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup

In many cases of hardlockup reports, it's actually not possible to know
why it triggered, because the CPU that got stuck is usually waiting on a
resource (with IRQs disabled) that some other CPU is holding.

IOW, we are often looking at the stacktrace of the victim and not the
actual offender.

Introduce a sysctl / cmdline parameter that makes it possible to have the
hardlockup detector perform an all-CPU backtrace.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Jiri Kosina and committed by
Linus Torvalds
55537871 ee7fed54

+55 -5
+5
Documentation/kernel-parameters.txt
··· 1269 1269 Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0. 1270 1270 Default: 1024 1271 1271 1272 + hardlockup_all_cpu_backtrace= 1273 + [KNL] Should the hard-lockup detector generate 1274 + backtraces on all cpus. 1275 + Format: <integer> 1276 + 1272 1277 hashdist= [KNL,NUMA] Large hashes allocated during boot 1273 1278 are distributed across NUMA nodes. Defaults on 1274 1279 for 64-bit NUMA, off otherwise.
+12
Documentation/sysctl/kernel.txt
··· 33 33 - domainname 34 34 - hostname 35 35 - hotplug 36 + - hardlockup_all_cpu_backtrace 36 37 - hung_task_panic 37 38 - hung_task_check_count 38 39 - hung_task_timeout_secs ··· 293 292 domain names are in general different. For a detailed discussion 294 293 see the hostname(1) man page. 295 294 295 + ============================================================== 296 + hardlockup_all_cpu_backtrace: 297 + 298 + This value controls the hard lockup detector behavior when a hard 299 + lockup condition is detected as to whether or not to gather further 300 + debug information. If enabled, arch-specific all-CPU stack dumping 301 + will be initiated. 302 + 303 + 0: do nothing. This is the default behavior. 304 + 305 + 1: on detection capture more debug information. 296 306 ============================================================== 297 307 298 308 hotplug:
+1
include/linux/nmi.h
··· 73 73 extern int watchdog_thresh; 74 74 extern unsigned long *watchdog_cpumask_bits; 75 75 extern int sysctl_softlockup_all_cpu_backtrace; 76 + extern int sysctl_hardlockup_all_cpu_backtrace; 76 77 struct ctl_table; 77 78 extern int proc_watchdog(struct ctl_table *, int , 78 79 void __user *, size_t *, loff_t *);
+9
kernel/sysctl.c
··· 898 898 .extra1 = &zero, 899 899 .extra2 = &one, 900 900 }, 901 + { 902 + .procname = "hardlockup_all_cpu_backtrace", 903 + .data = &sysctl_hardlockup_all_cpu_backtrace, 904 + .maxlen = sizeof(int), 905 + .mode = 0644, 906 + .proc_handler = proc_dointvec_minmax, 907 + .extra1 = &zero, 908 + .extra2 = &one, 909 + }, 901 910 #endif /* CONFIG_SMP */ 902 911 #endif 903 912 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+28 -5
kernel/watchdog.c
··· 57 57 58 58 #ifdef CONFIG_SMP 59 59 int __read_mostly sysctl_softlockup_all_cpu_backtrace; 60 + int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 60 61 #else 61 62 #define sysctl_softlockup_all_cpu_backtrace 0 63 + #define sysctl_hardlockup_all_cpu_backtrace 0 62 64 #endif 63 65 static struct cpumask watchdog_cpumask __read_mostly; 64 66 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); ··· 114 112 #ifdef CONFIG_HARDLOCKUP_DETECTOR 115 113 static int hardlockup_panic = 116 114 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 115 + static unsigned long hardlockup_allcpu_dumped; 117 116 /* 118 117 * We may not want to enable hard lockup detection by default in all cases, 119 118 * for example when running the kernel as a guest on a hypervisor. In these ··· 176 173 return 1; 177 174 } 178 175 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 176 + static int __init hardlockup_all_cpu_backtrace_setup(char *str) 177 + { 178 + sysctl_hardlockup_all_cpu_backtrace = 179 + !!simple_strtol(str, NULL, 0); 180 + return 1; 181 + } 182 + __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); 179 183 #endif 180 184 181 185 /* ··· 328 318 */ 329 319 if (is_hardlockup()) { 330 320 int this_cpu = smp_processor_id(); 321 + struct pt_regs *regs = get_irq_regs(); 331 322 332 323 /* only print hardlockups once */ 333 324 if (__this_cpu_read(hard_watchdog_warn) == true) 334 325 return; 335 326 336 - if (hardlockup_panic) 337 - panic("Watchdog detected hard LOCKUP on cpu %d", 338 - this_cpu); 327 + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 328 + print_modules(); 329 + print_irqtrace_events(current); 330 + if (regs) 331 + show_regs(regs); 339 332 else 340 - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", 341 - this_cpu); 333 + dump_stack(); 334 + 335 + /* 336 + * Perform all-CPU dump only once to avoid multiple hardlockups 337 + * generating interleaving traces 338 + */ 339 + if (sysctl_hardlockup_all_cpu_backtrace && 340 + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) 341 + trigger_allbutself_cpu_backtrace(); 342 + 343 + if (hardlockup_panic) 344 + panic("Hard LOCKUP"); 342 345 343 346 __this_cpu_write(hard_watchdog_warn, true); 344 347 return;