Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

watchdog: add sys_info sysctls to dump sys info on system lockup

When a soft/hard lockup happens, developers may need different kinds of
system information (call stacks, memory info, locks, etc.) to help with
debugging.

Add 'softlockup_sys_info' and 'hardlockup_sys_info' sysctl knobs that take
a human-readable string like "tasks,mem,timers,locks,ftrace,...". When a
system lockup happens, all requested information will be printed out
(refer to kernel/sys_info.c for more details).

Link: https://lkml.kernel.org/r/20251113111039.22701-4-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Feng Tang and committed by
Andrew Morton
a9af76a7 8b2b9b4f

+46 -3
+5
Documentation/admin-guide/sysctl/kernel.rst
··· 582 582 When ``kptr_restrict`` is set to 2, kernel pointers printed using 583 583 %pK will be replaced with 0s regardless of privileges. 584 584 585 + softlockup_sys_info & hardlockup_sys_info 586 + ========================================= 587 + A comma separated list of extra system information to be dumped when 588 + soft/hard lockup is detected, for example, "tasks,mem,timers,locks,...". 589 + Refer 'panic_sys_info' section below for more details. 585 590 586 591 modprobe 587 592 ========
+41 -3
kernel/watchdog.c
··· 25 25 #include <linux/stop_machine.h> 26 26 #include <linux/sysctl.h> 27 27 #include <linux/tick.h> 28 + #include <linux/sys_info.h> 28 29 29 30 #include <linux/sched/clock.h> 30 31 #include <linux/sched/debug.h> ··· 65 64 */ 66 65 unsigned int __read_mostly hardlockup_panic = 67 66 IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); 67 + 68 + /* 69 + * bitmasks to control what kinds of system info to be printed when 70 + * hard lockup is detected, it could be task, memory, lock etc. 71 + * Refer include/linux/sys_info.h for detailed bit definition. 72 + */ 73 + static unsigned long hardlockup_si_mask; 68 74 69 75 #ifdef CONFIG_SYSFS 70 76 ··· 186 178 187 179 void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) 188 180 { 181 + int hardlockup_all_cpu_backtrace; 182 + 189 183 if (per_cpu(watchdog_hardlockup_touched, cpu)) { 190 184 per_cpu(watchdog_hardlockup_touched, cpu) = false; 191 185 return; 192 186 } 193 187 188 + hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ? 189 + 1 : sysctl_hardlockup_all_cpu_backtrace; 194 190 /* 195 191 * Check for a hardlockup by making sure the CPU's timer 196 192 * interrupt is incrementing. The timer interrupt should have ··· 217 205 * Prevent multiple hard-lockup reports if one cpu is already 218 206 * engaged in dumping all cpu back traces. 
219 207 */ 220 - if (sysctl_hardlockup_all_cpu_backtrace) { 208 + if (hardlockup_all_cpu_backtrace) { 221 209 if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) 222 210 return; 223 211 } ··· 246 234 trigger_single_cpu_backtrace(cpu); 247 235 } 248 236 249 - if (sysctl_hardlockup_all_cpu_backtrace) { 237 + if (hardlockup_all_cpu_backtrace) { 250 238 trigger_allbutcpu_cpu_backtrace(cpu); 251 239 if (!hardlockup_panic) 252 240 clear_bit_unlock(0, &hard_lockup_nmi_warn); 253 241 } 254 242 243 + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); 255 244 if (hardlockup_panic) 256 245 nmi_panic(regs, "Hard LOCKUP"); 257 246 ··· 342 329 #ifdef CONFIG_SMP 343 330 int __read_mostly sysctl_softlockup_all_cpu_backtrace; 344 331 #endif 332 + 333 + /* 334 + * bitmasks to control what kinds of system info to be printed when 335 + * soft lockup is detected, it could be task, memory, lock etc. 336 + * Refer include/linux/sys_info.h for detailed bit definition. 337 + */ 338 + static unsigned long softlockup_si_mask; 345 339 346 340 static struct cpumask watchdog_allowed_mask __read_mostly; 347 341 ··· 766 746 unsigned long touch_ts, period_ts, now; 767 747 struct pt_regs *regs = get_irq_regs(); 768 748 int duration; 769 - int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; 749 + int softlockup_all_cpu_backtrace; 770 750 unsigned long flags; 771 751 772 752 if (!watchdog_enabled) ··· 777 757 */ 778 758 if (panic_in_progress()) 779 759 return HRTIMER_NORESTART; 760 + 761 + softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ? 
762 + 1 : sysctl_softlockup_all_cpu_backtrace; 780 763 781 764 watchdog_hardlockup_kick(); 782 765 ··· 869 846 } 870 847 871 848 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 849 + sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); 872 850 if (softlockup_panic) 873 851 panic("softlockup: hung tasks"); 874 852 } ··· 1221 1197 .extra1 = SYSCTL_ZERO, 1222 1198 .extra2 = SYSCTL_ONE, 1223 1199 }, 1200 + { 1201 + .procname = "softlockup_sys_info", 1202 + .data = &softlockup_si_mask, 1203 + .maxlen = sizeof(softlockup_si_mask), 1204 + .mode = 0644, 1205 + .proc_handler = sysctl_sys_info_handler, 1206 + }, 1224 1207 #ifdef CONFIG_SMP 1225 1208 { 1226 1209 .procname = "softlockup_all_cpu_backtrace", ··· 1249 1218 .proc_handler = proc_dointvec_minmax, 1250 1219 .extra1 = SYSCTL_ZERO, 1251 1220 .extra2 = SYSCTL_ONE, 1221 + }, 1222 + { 1223 + .procname = "hardlockup_sys_info", 1224 + .data = &hardlockup_si_mask, 1225 + .maxlen = sizeof(hardlockup_si_mask), 1226 + .mode = 0644, 1227 + .proc_handler = sysctl_sys_info_handler, 1252 1228 }, 1253 1229 #ifdef CONFIG_SMP 1254 1230 {