Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fanotify: add watchdog for permission events

This is to make it easier to debug issues with AV software, which time and
again deadlocks with no indication of where the issue comes from, and the
kernel gets blamed for the deadlock. Then we need to analyze dumps to
prove that the kernel is not in fact at fault.

The deadlock comes from recursion: handling the event triggers another
permission event, in some roundabout way, obviously, otherwise it would
have been found in testing.

With this patch a warning is printed when a permission event is received by
userspace but not answered for more than the timeout specified in
/proc/sys/fs/fanotify/watchdog_timeout. The watchdog can be turned off by
setting the timeout to zero (which is the default).

The timeout is very coarse (T <= t < 2T), since an event is only reported on
the second watchdog scan after userspace received it, but I guess it's good
enough for the purpose.

Overhead should be minimal.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Link: https://patch.msgid.link/20250909143053.112171-1-mszeredi@redhat.com
Signed-off-by: Jan Kara <jack@suse.cz>

authored by

Miklos Szeredi and committed by
Jan Kara
b8cf8fda 62e59ffe

+106
+2
fs/notify/fanotify/fanotify.h
··· 441 441 size_t count; 442 442 u32 response; /* userspace answer to the event */ 443 443 unsigned short state; /* state of the event */ 444 + unsigned short watchdog_cnt; /* already scanned by watchdog? */ 444 445 int fd; /* fd we passed to userspace for this event */ 446 + pid_t recv_pid; /* pid of task receiving the event */ 445 447 union { 446 448 struct fanotify_response_info_header hdr; 447 449 struct fanotify_response_info_audit_rule audit_rule;
+102
fs/notify/fanotify/fanotify_user.c
··· 50 50 51 51 /* configurable via /proc/sys/fs/fanotify/ */ 52 52 static int fanotify_max_queued_events __read_mostly; 53 + static int perm_group_timeout __read_mostly; 53 54 54 55 #ifdef CONFIG_SYSCTL 55 56 ··· 86 85 .proc_handler = proc_dointvec_minmax, 87 86 .extra1 = SYSCTL_ZERO 88 87 }, 88 + { 89 + .procname = "watchdog_timeout", 90 + .data = &perm_group_timeout, 91 + .maxlen = sizeof(int), 92 + .mode = 0644, 93 + .proc_handler = proc_dointvec_minmax, 94 + .extra1 = SYSCTL_ZERO, 95 + }, 89 96 }; 90 97 91 98 static void __init fanotify_sysctls_init(void) ··· 103 94 #else 104 95 #define fanotify_sysctls_init() do { } while (0) 105 96 #endif /* CONFIG_SYSCTL */ 97 + 98 + static LIST_HEAD(perm_group_list); 99 + static DEFINE_SPINLOCK(perm_group_lock); 100 + static void perm_group_watchdog(struct work_struct *work); 101 + static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog); 102 + 103 + static void perm_group_watchdog_schedule(void) 104 + { 105 + schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout)); 106 + } 107 + 108 + static void perm_group_watchdog(struct work_struct *work) 109 + { 110 + struct fsnotify_group *group; 111 + struct fanotify_perm_event *event; 112 + struct task_struct *task; 113 + pid_t failed_pid = 0; 114 + 115 + guard(spinlock)(&perm_group_lock); 116 + if (list_empty(&perm_group_list)) 117 + return; 118 + 119 + list_for_each_entry(group, &perm_group_list, 120 + fanotify_data.perm_grp_list) { 121 + /* 122 + * Ok to test without lock, racing with an addition is 123 + * fine, will deal with it next round 124 + */ 125 + if (list_empty(&group->fanotify_data.access_list)) 126 + continue; 127 + 128 + spin_lock(&group->notification_lock); 129 + list_for_each_entry(event, &group->fanotify_data.access_list, 130 + fae.fse.list) { 131 + if (likely(event->watchdog_cnt == 0)) { 132 + event->watchdog_cnt = 1; 133 + } else if (event->watchdog_cnt == 1) { 134 + /* Report on event only once */ 135 + event->watchdog_cnt = 2; 
136 + 137 + /* Do not report same pid repeatedly */ 138 + if (event->recv_pid == failed_pid) 139 + continue; 140 + 141 + failed_pid = event->recv_pid; 142 + rcu_read_lock(); 143 + task = find_task_by_pid_ns(event->recv_pid, 144 + &init_pid_ns); 145 + pr_warn_ratelimited( 146 + "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n", 147 + event->recv_pid, 148 + task ? task->comm : NULL, 149 + perm_group_timeout); 150 + rcu_read_unlock(); 151 + } 152 + } 153 + spin_unlock(&group->notification_lock); 154 + } 155 + perm_group_watchdog_schedule(); 156 + } 157 + 158 + static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group) 159 + { 160 + if (!list_empty(&group->fanotify_data.perm_grp_list)) { 161 + /* Perm event watchdog can no longer scan this group. */ 162 + spin_lock(&perm_group_lock); 163 + list_del_init(&group->fanotify_data.perm_grp_list); 164 + spin_unlock(&perm_group_lock); 165 + } 166 + } 167 + 168 + static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group) 169 + { 170 + if (!perm_group_timeout) 171 + return; 172 + 173 + spin_lock(&perm_group_lock); 174 + if (list_empty(&group->fanotify_data.perm_grp_list)) { 175 + /* Add to perm_group_list for monitoring by watchdog. */ 176 + if (list_empty(&perm_group_list)) 177 + perm_group_watchdog_schedule(); 178 + list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list); 179 + } 180 + spin_unlock(&perm_group_lock); 181 + } 106 182 107 183 /* 108 184 * All flags that may be specified in parameter event_f_flags of fanotify_init. ··· 1047 953 spin_lock(&group->notification_lock); 1048 954 list_add_tail(&event->fse.list, 1049 955 &group->fanotify_data.access_list); 956 + FANOTIFY_PERM(event)->recv_pid = current->pid; 1050 957 spin_unlock(&group->notification_lock); 1051 958 } 1052 959 } ··· 1106 1011 * leave access_list by now either. 
1107 1012 */ 1108 1013 fsnotify_group_stop_queueing(group); 1014 + 1015 + fanotify_perm_watchdog_group_remove(group); 1109 1016 1110 1017 /* 1111 1018 * Process all permission events on access_list and notification queue ··· 1562 1465 fsnotify_group_unlock(group); 1563 1466 1564 1467 fsnotify_put_mark(fsn_mark); 1468 + 1469 + if (!ret && (mask & FANOTIFY_PERM_EVENTS)) 1470 + fanotify_perm_watchdog_group_add(group); 1471 + 1565 1472 return ret; 1566 1473 } 1567 1474 ··· 1726 1625 group->fanotify_data.f_flags = event_f_flags; 1727 1626 init_waitqueue_head(&group->fanotify_data.access_waitq); 1728 1627 INIT_LIST_HEAD(&group->fanotify_data.access_list); 1628 + INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list); 1729 1629 switch (class) { 1730 1630 case FAN_CLASS_NOTIF: 1731 1631 group->priority = FSNOTIFY_PRIO_NORMAL;
+2
include/linux/fsnotify_backend.h
··· 273 273 int f_flags; /* event_f_flags from fanotify_init() */ 274 274 struct ucounts *ucounts; 275 275 mempool_t error_events_pool; 276 + /* chained on perm_group_list */ 277 + struct list_head perm_grp_list; 276 278 } fanotify_data; 277 279 #endif /* CONFIG_FANOTIFY */ 278 280 };