Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

watchdog: Change the default timeout and configure nmi watchdog period based on watchdog_thresh

Before the conversion of the NMI watchdog to perf event, the
watchdog timeout was 5 seconds. Now it is 60 seconds. For my
particular application, netbooks, 5 seconds was a better
timeout. With a short timeout, we catch faults earlier and are
able to send back a panic. With a 60 second timeout, the user is
unlikely to wait and will instead hit the power button, causing
us to lose the panic info.

This change configures the NMI period to watchdog_thresh and
sets the softlockup_thresh to watchdog_thresh * 2. In addition,
the default watchdog_thresh is reduced from 60 to 10 seconds, as
suggested by Ingo Molnar.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/1306127423-3347-4-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20110517071642.GF22305@elte.hu>

authored by

Mandeep Singh Baines and committed by
Ingo Molnar
4eec42f3 586692a5

+18 -7
+2 -2
arch/x86/kernel/apic/hw_nmi.c
··· 19 19 #include <linux/delay.h> 20 20 21 21 #ifdef CONFIG_HARDLOCKUP_DETECTOR 22 - u64 hw_nmi_get_sample_period(void) 22 + u64 hw_nmi_get_sample_period(int watchdog_thresh) 23 23 { 24 - return (u64)(cpu_khz) * 1000 * 60; 24 + return (u64)(cpu_khz) * 1000 * watchdog_thresh; 25 25 } 26 26 #endif 27 27
+1 -1
include/linux/nmi.h
··· 45 45 46 46 #ifdef CONFIG_LOCKUP_DETECTOR 47 47 int hw_nmi_is_cpu_stuck(struct pt_regs *); 48 - u64 hw_nmi_get_sample_period(void); 48 + u64 hw_nmi_get_sample_period(int watchdog_thresh); 49 49 extern int watchdog_enabled; 50 50 extern int watchdog_thresh; 51 51 struct ctl_table;
+15 -4
kernel/watchdog.c
··· 28 28 #include <linux/perf_event.h> 29 29 30 30 int watchdog_enabled = 1; 31 - int __read_mostly watchdog_thresh = 60; 31 + int __read_mostly watchdog_thresh = 10; 32 32 33 33 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 34 34 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); ··· 91 91 __setup("nosoftlockup", nosoftlockup_setup); 92 92 /* */ 93 93 94 + /* 95 + * Hard-lockup warnings should be triggered after just a few seconds. Soft- 96 + * lockups can have false positives under extreme conditions. So we generally 97 + * want a higher threshold for soft lockups than for hard lockups. So we couple 98 + * the thresholds with a factor: we make the soft threshold twice the amount of 99 + * time the hard threshold is. 100 + */ 101 + static int get_softlockup_thresh() 102 + { 103 + return watchdog_thresh * 2; 104 + } 94 105 95 106 /* 96 107 * Returns seconds, approximately. We don't need nanosecond ··· 121 110 * increment before the hardlockup detector generates 122 111 * a warning 123 112 */ 124 - return watchdog_thresh * (NSEC_PER_SEC / 5); 113 + return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 125 114 } 126 115 127 116 /* Commands for resetting the watchdog */ ··· 193 182 unsigned long now = get_timestamp(smp_processor_id()); 194 183 195 184 /* Warn about unreasonable delays: */ 196 - if (time_after(now, touch_ts + watchdog_thresh)) 185 + if (time_after(now, touch_ts + get_softlockup_thresh())) 197 186 return now - touch_ts; 198 187 199 188 return 0; ··· 370 359 371 360 /* Try to register using hardware perf events */ 372 361 wd_attr = &wd_hw_attr; 373 - wd_attr->sample_period = hw_nmi_get_sample_period(); 362 + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 374 363 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 375 364 if (!IS_ERR(event)) { 376 365 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");