Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

inotify: Fix possible deadlock in fsnotify_destroy_mark

[Syzbot reported]
WARNING: possible circular locking dependency detected
6.11.0-rc4-syzkaller-00019-gb311c1b497e5 #0 Not tainted
------------------------------------------------------
kswapd0/78 is trying to acquire lock:
ffff88801b8d8930 (&group->mark_mutex){+.+.}-{3:3}, at: fsnotify_group_lock include/linux/fsnotify_backend.h:270 [inline]
ffff88801b8d8930 (&group->mark_mutex){+.+.}-{3:3}, at: fsnotify_destroy_mark+0x38/0x3c0 fs/notify/mark.c:578

but task is already holding lock:
ffffffff8ea2fd60 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6841 [inline]
ffffffff8ea2fd60 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xbb4/0x35a0 mm/vmscan.c:7223

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (fs_reclaim){+.+.}-{0:0}:
...
kmem_cache_alloc_noprof+0x3d/0x2a0 mm/slub.c:4044
inotify_new_watch fs/notify/inotify/inotify_user.c:599 [inline]
inotify_update_watch fs/notify/inotify/inotify_user.c:647 [inline]
__do_sys_inotify_add_watch fs/notify/inotify/inotify_user.c:786 [inline]
__se_sys_inotify_add_watch+0x72e/0x1070 fs/notify/inotify/inotify_user.c:729
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #0 (&group->mark_mutex){+.+.}-{3:3}:
...
__mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752
fsnotify_group_lock include/linux/fsnotify_backend.h:270 [inline]
fsnotify_destroy_mark+0x38/0x3c0 fs/notify/mark.c:578
fsnotify_destroy_marks+0x14a/0x660 fs/notify/mark.c:934
fsnotify_inoderemove include/linux/fsnotify.h:264 [inline]
dentry_unlink_inode+0x2e0/0x430 fs/dcache.c:403
__dentry_kill+0x20d/0x630 fs/dcache.c:610
shrink_kill+0xa9/0x2c0 fs/dcache.c:1055
shrink_dentry_list+0x2c0/0x5b0 fs/dcache.c:1082
prune_dcache_sb+0x10f/0x180 fs/dcache.c:1163
super_cache_scan+0x34f/0x4b0 fs/super.c:221
do_shrink_slab+0x701/0x1160 mm/shrinker.c:435
shrink_slab+0x1093/0x14d0 mm/shrinker.c:662
shrink_one+0x43b/0x850 mm/vmscan.c:4815
shrink_many mm/vmscan.c:4876 [inline]
lru_gen_shrink_node mm/vmscan.c:4954 [inline]
shrink_node+0x3799/0x3de0 mm/vmscan.c:5934
kswapd_shrink_node mm/vmscan.c:6762 [inline]
balance_pgdat mm/vmscan.c:6954 [inline]
kswapd+0x1bcd/0x35a0 mm/vmscan.c:7223

[Analysis]
The problem is that inotify_new_watch() is using GFP_KERNEL to allocate
new watches under group->mark_mutex, however if dentry reclaim races
with unlinking of an inode, it can end up dropping the last dentry reference
for an unlinked inode resulting in removal of fsnotify mark from reclaim
context which wants to acquire group->mark_mutex as well.

This scenario shows that all notification groups are in principle prone
to this kind of a deadlock (previously, we considered only fanotify and
dnotify to be problematic for other reasons) so make sure all
allocations under group->mark_mutex happen with GFP_NOFS.

Reported-and-tested-by: syzbot+c679f13773f295d2da53@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c679f13773f295d2da53
Signed-off-by: Lizhi Xu <lizhi.xu@windriver.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20240927143642.2369508-1-lizhi.xu@windriver.com

authored by

Lizhi Xu and committed by
Jan Kara
cad3f4a2 35ceae44

+6 -22
+1 -1
fs/nfsd/filecache.c
··· 792 792 } 793 793 794 794 nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops, 795 - FSNOTIFY_GROUP_NOFS); 795 + 0); 796 796 if (IS_ERR(nfsd_file_fsnotify_group)) { 797 797 pr_err("nfsd: unable to create fsnotify group: %ld\n", 798 798 PTR_ERR(nfsd_file_fsnotify_group));
+1 -2
fs/notify/dnotify/dnotify.c
··· 406 406 SLAB_PANIC|SLAB_ACCOUNT); 407 407 dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); 408 408 409 - dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, 410 - FSNOTIFY_GROUP_NOFS); 409 + dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, 0); 411 410 if (IS_ERR(dnotify_group)) 412 411 panic("unable to allocate fsnotify group for dnotify\n"); 413 412 dnotify_sysctl_init();
+1 -1
fs/notify/fanotify/fanotify_user.c
··· 1480 1480 1481 1481 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 1482 1482 group = fsnotify_alloc_group(&fanotify_fsnotify_ops, 1483 - FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS); 1483 + FSNOTIFY_GROUP_USER); 1484 1484 if (IS_ERR(group)) { 1485 1485 return PTR_ERR(group); 1486 1486 }
-11
fs/notify/group.c
··· 115 115 const struct fsnotify_ops *ops, 116 116 int flags, gfp_t gfp) 117 117 { 118 - static struct lock_class_key nofs_marks_lock; 119 118 struct fsnotify_group *group; 120 119 121 120 group = kzalloc(sizeof(struct fsnotify_group), gfp); ··· 135 136 136 137 group->ops = ops; 137 138 group->flags = flags; 138 - /* 139 - * For most backends, eviction of inode with a mark is not expected, 140 - * because marks hold a refcount on the inode against eviction. 141 - * 142 - * Use a different lockdep class for groups that support evictable 143 - * inode marks, because with evictable marks, mark_mutex is NOT 144 - * fs-reclaim safe - the mutex is taken when evicting inodes. 145 - */ 146 - if (flags & FSNOTIFY_GROUP_NOFS) 147 - lockdep_set_class(&group->mark_mutex, &nofs_marks_lock); 148 139 149 140 return group; 150 141 }
+3 -7
include/linux/fsnotify_backend.h
··· 217 217 218 218 #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ 219 219 #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ 220 - #define FSNOTIFY_GROUP_NOFS 0x04 /* group lock is not direct reclaim safe */ 221 220 int flags; 222 221 unsigned int owner_flags; /* stored flags of mark_mutex owner */ 223 222 ··· 267 268 static inline void fsnotify_group_lock(struct fsnotify_group *group) 268 269 { 269 270 mutex_lock(&group->mark_mutex); 270 - if (group->flags & FSNOTIFY_GROUP_NOFS) 271 - group->owner_flags = memalloc_nofs_save(); 271 + group->owner_flags = memalloc_nofs_save(); 272 272 } 273 273 274 274 static inline void fsnotify_group_unlock(struct fsnotify_group *group) 275 275 { 276 - if (group->flags & FSNOTIFY_GROUP_NOFS) 277 - memalloc_nofs_restore(group->owner_flags); 276 + memalloc_nofs_restore(group->owner_flags); 278 277 mutex_unlock(&group->mark_mutex); 279 278 } 280 279 281 280 static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) 282 281 { 283 282 WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); 284 - if (group->flags & FSNOTIFY_GROUP_NOFS) 285 - WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); 283 + WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); 286 284 } 287 285 288 286 /* When calling fsnotify tell it if the data is a path or inode */