Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

watchdog: allow nmi watchdog to use raw perf event

The NMI watchdog permanently consumes one hardware counter per CPU on the
system. For systems that use many hardware counters, this causes more
aggressive time multiplexing of perf events.

OTOH, some CPUs (mostly Intel) support the "ref-cycles" event, which is rarely
used. Add a kernel cmdline arg nmi_watchdog=rNNN to configure the watchdog
to use a raw event. For example, on Intel CPUs, we can use "r300" to
configure the watchdog to use the ref-cycles event.

If the raw event does not work, fall back to using "cycles".

[akpm@linux-foundation.org: fix kerneldoc]
Link: https://lkml.kernel.org/r/20240430060236.1878002-2-song@kernel.org
Signed-off-by: Song Liu <song@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Song Liu and committed by
Andrew Morton
393fb313 602ba773

+53 -2
+3 -2
Documentation/admin-guide/kernel-parameters.txt
··· 3773 3773 Format: [state][,regs][,debounce][,die] 3774 3774 3775 3775 nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels 3776 - Format: [panic,][nopanic,][num] 3776 + Format: [panic,][nopanic,][rNNN,][num] 3777 3777 Valid num: 0 or 1 3778 3778 0 - turn hardlockup detector in nmi_watchdog off 3779 3779 1 - turn hardlockup detector in nmi_watchdog on 3780 + rNNN - configure the watchdog with raw perf event 0xNNN 3781 + 3780 3782 When panic is specified, panic when an NMI watchdog 3781 3783 timeout occurs (or 'nopanic' to not panic on an NMI 3782 3784 watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set) ··· 7466 7464 memory, and other data can't be written using 7467 7465 xmon commands. 7468 7466 off xmon is disabled. 7469 -
+2
include/linux/nmi.h
··· 105 105 extern void hardlockup_detector_perf_stop(void); 106 106 extern void hardlockup_detector_perf_restart(void); 107 107 extern void hardlockup_detector_perf_cleanup(void); 108 + extern void hardlockup_config_perf_event(const char *str); 108 109 #else 109 110 static inline void hardlockup_detector_perf_stop(void) { } 110 111 static inline void hardlockup_detector_perf_restart(void) { } 111 112 static inline void hardlockup_detector_perf_cleanup(void) { } 113 + static inline void hardlockup_config_perf_event(const char *str) { } 112 114 #endif 113 115 114 116 void watchdog_hardlockup_stop(void);
+2
kernel/watchdog.c
··· 80 80 watchdog_hardlockup_user_enabled = 0; 81 81 else if (!strncmp(str, "1", 1)) 82 82 watchdog_hardlockup_user_enabled = 1; 83 + else if (!strncmp(str, "r", 1)) 84 + hardlockup_config_perf_event(str + 1); 83 85 while (*(str++)) { 84 86 if (*str == ',') { 85 87 str++;
+46
kernel/watchdog_perf.c
··· 90 90 .disabled = 1, 91 91 }; 92 92 93 + static struct perf_event_attr fallback_wd_hw_attr = { 94 + .type = PERF_TYPE_HARDWARE, 95 + .config = PERF_COUNT_HW_CPU_CYCLES, 96 + .size = sizeof(struct perf_event_attr), 97 + .pinned = 1, 98 + .disabled = 1, 99 + }; 100 + 93 101 /* Callback function for perf event subsystem */ 94 102 static void watchdog_overflow_callback(struct perf_event *event, 95 103 struct perf_sample_data *data, ··· 130 122 /* Try to register using hardware perf events */ 131 123 evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, 132 124 watchdog_overflow_callback, NULL); 125 + if (IS_ERR(evt)) { 126 + wd_attr = &fallback_wd_hw_attr; 127 + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 128 + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, 129 + watchdog_overflow_callback, NULL); 130 + } 131 + 133 132 if (IS_ERR(evt)) { 134 133 pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, 135 134 PTR_ERR(evt)); ··· 273 258 this_cpu_write(watchdog_ev, NULL); 274 259 } 275 260 return ret; 261 + } 262 + 263 + /** 264 + * hardlockup_config_perf_event - Overwrite config of wd_hw_attr. 265 + * 266 + * @str: number which identifies the raw perf event to use 267 + */ 268 + void __init hardlockup_config_perf_event(const char *str) 269 + { 270 + u64 config; 271 + char buf[24]; 272 + char *comma = strchr(str, ','); 273 + 274 + if (!comma) { 275 + if (kstrtoull(str, 16, &config)) 276 + return; 277 + } else { 278 + unsigned int len = comma - str; 279 + 280 + if (len >= sizeof(buf)) 281 + return; 282 + 283 + if (strscpy(buf, str, sizeof(buf)) < 0) 284 + return; 285 + buf[len] = 0; 286 + if (kstrtoull(buf, 16, &config)) 287 + return; 288 + } 289 + 290 + wd_hw_attr.type = PERF_TYPE_RAW; 291 + wd_hw_attr.config = config; 276 292 }