at master 5.2 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * hangcheck-timer.c 4 * 5 * Driver for a little io fencing timer. 6 * 7 * Copyright (C) 2002, 2003 Oracle. All rights reserved. 8 * 9 * Author: Joel Becker <joel.becker@oracle.com> 10 */ 11 12/* 13 * The hangcheck-timer driver uses the TSC to catch delays that 14 * jiffies does not notice. A timer is set. When the timer fires, it 15 * checks whether it was delayed and if that delay exceeds a given 16 * margin of error. The hangcheck_tick module parameter takes the timer 17 * duration in seconds. The hangcheck_margin parameter defines the 18 * margin of error, in seconds. The defaults are 60 seconds for the 19 * timer and 180 seconds for the margin of error. IOW, a timer is set 20 * for 60 seconds. When the timer fires, the callback checks the 21 * actual duration that the timer waited. If the duration exceeds the 22 * allotted time and margin (here 60 + 180, or 240 seconds), the machine 23 * is restarted. A healthy machine will have the duration match the 24 * expected timeout very closely. 25 */ 26 27#include <linux/module.h> 28#include <linux/moduleparam.h> 29#include <linux/types.h> 30#include <linux/kernel.h> 31#include <linux/fs.h> 32#include <linux/mm.h> 33#include <linux/reboot.h> 34#include <linux/init.h> 35#include <linux/delay.h> 36#include <linux/uaccess.h> 37#include <linux/sysrq.h> 38#include <linux/timer.h> 39#include <linux/hrtimer.h> 40 41#define VERSION_STR "0.9.1" 42 43#define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 44#define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 45 46static int hangcheck_tick = DEFAULT_IOFENCE_TICK; 47static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; 48static int hangcheck_reboot; /* Defaults to not reboot */ 49static int hangcheck_dump_tasks; /* Defaults to not dumping SysRQ T */ 50 51/* options - modular */ 52module_param(hangcheck_tick, int, 0); 53MODULE_PARM_DESC(hangcheck_tick, "Timer delay."); 54module_param(hangcheck_margin, int, 0); 55MODULE_PARM_DESC(hangcheck_margin, "If the hangcheck timer has been delayed more than hangcheck_margin seconds, the driver will fire."); 56module_param(hangcheck_reboot, int, 0); 57MODULE_PARM_DESC(hangcheck_reboot, "If nonzero, the machine will reboot when the timer margin is exceeded."); 58module_param(hangcheck_dump_tasks, int, 0); 59MODULE_PARM_DESC(hangcheck_dump_tasks, "If nonzero, the machine will dump the system task state when the timer margin is exceeded."); 60 61MODULE_AUTHOR("Oracle"); 62MODULE_DESCRIPTION("Hangcheck-timer detects when the system has gone out to lunch past a certain margin."); 63MODULE_LICENSE("GPL"); 64MODULE_VERSION(VERSION_STR); 65 66/* options - nonmodular */ 67#ifndef MODULE 68 69static int __init hangcheck_parse_tick(char *str) 70{ 71 int par; 72 73 if (get_option(&str, &par)) 74 hangcheck_tick = par; 75 return 1; 76} 77 78static int __init hangcheck_parse_margin(char *str) 79{ 80 int par; 81 82 if (get_option(&str, &par)) 83 hangcheck_margin = par; 84 return 1; 85} 86 87static int __init hangcheck_parse_reboot(char *str) 88{ 89 int par; 90 91 if (get_option(&str, &par)) 92 hangcheck_reboot = par; 93 return 1; 94} 95 96static int __init hangcheck_parse_dump_tasks(char *str) 97{ 98 int par; 99 100 if (get_option(&str, &par)) 101 hangcheck_dump_tasks = par; 102 return 1; 103} 104 105__setup("hcheck_tick", hangcheck_parse_tick); 106__setup("hcheck_margin", hangcheck_parse_margin); 107__setup("hcheck_reboot", hangcheck_parse_reboot); 108__setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); 109#endif /* not MODULE */ 110 111#define TIMER_FREQ 1000000000ULL 112 113/* Last time scheduled */ 114static unsigned long long hangcheck_tsc, hangcheck_tsc_margin; 115 116static void hangcheck_fire(struct timer_list *); 117 118static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire); 119 120static void hangcheck_fire(struct timer_list *unused) 121{ 122 unsigned long long cur_tsc, tsc_diff; 123 124 cur_tsc = ktime_get_ns(); 125 126 if (cur_tsc > hangcheck_tsc) 127 tsc_diff = cur_tsc - hangcheck_tsc; 128 else 129 tsc_diff = (cur_tsc + (~0ULL - hangcheck_tsc)); /* or something */ 130 131 if (tsc_diff > hangcheck_tsc_margin) { 132 if (hangcheck_dump_tasks) { 133 pr_crit("Hangcheck: Task state:\n"); 134#ifdef CONFIG_MAGIC_SYSRQ 135 handle_sysrq('t'); 136#endif /* CONFIG_MAGIC_SYSRQ */ 137 } 138 if (hangcheck_reboot) { 139 pr_crit("Hangcheck: hangcheck is restarting the machine.\n"); 140 emergency_restart(); 141 } else { 142 pr_crit("Hangcheck: hangcheck value past margin!\n"); 143 } 144 } 145#if 0 146 /* 147 * Enable to investigate delays in detail 148 */ 149 pr_debug("Hangcheck: called %lld ns since last time (%lld ns overshoot)\n", 150 tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ); 151#endif 152 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 153 hangcheck_tsc = ktime_get_ns(); 154} 155 156 157static int __init hangcheck_init(void) 158{ 159 pr_debug("Hangcheck: starting hangcheck timer %s (tick is %d seconds, margin is %d seconds).\n", 160 VERSION_STR, hangcheck_tick, hangcheck_margin); 161 hangcheck_tsc_margin = 162 (unsigned long long)hangcheck_margin + hangcheck_tick; 163 hangcheck_tsc_margin *= TIMER_FREQ; 164 165 hangcheck_tsc = ktime_get_ns(); 166 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 167 168 return 0; 169} 170 171 172static void __exit hangcheck_exit(void) 173{ 174 timer_delete_sync(&hangcheck_ticktock); 175 pr_debug("Hangcheck: Stopped hangcheck timer.\n"); 176} 177 178module_init(hangcheck_init); 179module_exit(hangcheck_exit);