Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fsnotify: avoid spurious EMFILE errors from inotify_init()

Inotify instance is destroyed when all references to it are dropped.
That not only means that the corresponding file descriptor needs to be
closed but also that all corresponding instance marks are freed (as each
mark holds a reference to the inotify instance). However marks are
freed only after SRCU period ends which can take some time and thus if
user rapidly creates and frees inotify instances, number of existing
inotify instances can exceed max_user_instances limit although from user
point of view there is always at most one existing instance. Thus
inotify_init() returns EMFILE error which is hard to justify from user
point of view. This problem is exposed by LTP inotify06 testcase on
some machines.

We fix the problem by making sure all group marks are properly freed
while destroying inotify instance. We wait for SRCU period to end in
that path anyway since we have to make sure there is no event being
added to the instance while we are tearing down the instance. So it
takes only some plumbing to allow for marks to be destroyed in that path
as well and not from a dedicated work item.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Tested-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Jan Kara and committed by
Linus Torvalds
35e48176 2600a46e

+81 -23
+7
fs/notify/fsnotify.h
··· 56 56 fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, 57 57 &mnt->mnt_root->d_lock); 58 58 } 59 + /* prepare for freeing all marks associated with given group */ 60 + extern void fsnotify_detach_group_marks(struct fsnotify_group *group); 61 + /* 62 + * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list 63 + */ 64 + extern void fsnotify_mark_destroy_list(void); 65 + 59 66 /* 60 67 * update the dentry->d_flags of all of inode's children to indicate if inode cares 61 68 * about events that happen to its children.
+13 -4
fs/notify/group.c
··· 47 47 */ 48 48 void fsnotify_destroy_group(struct fsnotify_group *group) 49 49 { 50 - /* clear all inode marks for this group */ 51 - fsnotify_clear_marks_by_group(group); 50 + /* clear all inode marks for this group, attach them to destroy_list */ 51 + fsnotify_detach_group_marks(group); 52 52 53 - synchronize_srcu(&fsnotify_mark_srcu); 53 + /* 54 + * Wait for fsnotify_mark_srcu period to end and free all marks in 55 + * destroy_list 56 + */ 57 + fsnotify_mark_destroy_list(); 54 58 55 - /* clear the notification queue of all events */ 59 + /* 60 + * Since we have waited for fsnotify_mark_srcu in 61 + * fsnotify_mark_destroy_list() there can be no outstanding event 62 + * notification against this group. So clearing the notification queue 63 + * of all events is reliable now. 64 + */ 56 65 fsnotify_flush_notify(group); 57 66 58 67 /*
+61 -17
fs/notify/mark.c
··· 97 97 static DEFINE_SPINLOCK(destroy_lock); 98 98 static LIST_HEAD(destroy_list); 99 99 100 - static void fsnotify_mark_destroy(struct work_struct *work); 101 - static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); 100 + static void fsnotify_mark_destroy_workfn(struct work_struct *work); 101 + static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); 102 102 103 103 void fsnotify_get_mark(struct fsnotify_mark *mark) 104 104 { ··· 173 173 } 174 174 175 175 /* 176 - * Free fsnotify mark. The freeing is actually happening from a kthread which 177 - * first waits for srcu period end. Caller must have a reference to the mark 178 - * or be protected by fsnotify_mark_srcu. 176 + * Prepare mark for freeing and add it to the list of marks prepared for 177 + * freeing. The actual freeing must happen after SRCU period ends and the 178 + * caller is responsible for this. 179 + * 180 + * The function returns true if the mark was added to the list of marks for 181 + * freeing. The function returns false if someone else has already called 182 + * __fsnotify_free_mark() for the mark. 179 183 */ 180 - void fsnotify_free_mark(struct fsnotify_mark *mark) 184 + static bool __fsnotify_free_mark(struct fsnotify_mark *mark) 181 185 { 182 186 struct fsnotify_group *group = mark->group; 183 187 ··· 189 185 /* something else already called this function on this mark */ 190 186 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { 191 187 spin_unlock(&mark->lock); 192 - return; 188 + return false; 193 189 } 194 190 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 195 191 spin_unlock(&mark->lock); 196 - 197 - spin_lock(&destroy_lock); 198 - list_add(&mark->g_list, &destroy_list); 199 - spin_unlock(&destroy_lock); 200 - queue_delayed_work(system_unbound_wq, &reaper_work, 201 - FSNOTIFY_REAPER_DELAY); 202 192 203 193 /* 204 194 * Some groups like to know that marks are being freed. This is a ··· 201 203 */ 202 204 if (group->ops->freeing_mark) 203 205 group->ops->freeing_mark(mark, group); 206 + 207 + spin_lock(&destroy_lock); 208 + list_add(&mark->g_list, &destroy_list); 209 + spin_unlock(&destroy_lock); 210 + 211 + return true; 212 + } 213 + 214 + /* 215 + * Free fsnotify mark. The freeing is actually happening from a workqueue which 216 + * first waits for srcu period end. Caller must have a reference to the mark 217 + * or be protected by fsnotify_mark_srcu. 218 + */ 219 + void fsnotify_free_mark(struct fsnotify_mark *mark) 220 + { 221 + if (__fsnotify_free_mark(mark)) { 222 + queue_delayed_work(system_unbound_wq, &reaper_work, 223 + FSNOTIFY_REAPER_DELAY); 224 + } 204 225 } 205 226 206 227 void fsnotify_destroy_mark(struct fsnotify_mark *mark, ··· 485 468 } 486 469 487 470 /* 488 - * Given a group, destroy all of the marks associated with that group. 471 + * Given a group, prepare for freeing all the marks associated with that group. 472 + * The marks are attached to the list of marks prepared for destruction, the 473 + * caller is responsible for freeing marks in that list after SRCU period has 474 + * ended. 489 475 */ 490 - void fsnotify_clear_marks_by_group(struct fsnotify_group *group) 476 + void fsnotify_detach_group_marks(struct fsnotify_group *group) 491 477 { 492 - fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); 478 + struct fsnotify_mark *mark; 479 + 480 + while (1) { 481 + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); 482 + if (list_empty(&group->marks_list)) { 483 + mutex_unlock(&group->mark_mutex); 484 + break; 485 + } 486 + mark = list_first_entry(&group->marks_list, 487 + struct fsnotify_mark, g_list); 488 + fsnotify_get_mark(mark); 489 + fsnotify_detach_mark(mark); 490 + mutex_unlock(&group->mark_mutex); 491 + __fsnotify_free_mark(mark); 492 + fsnotify_put_mark(mark); 493 + } 493 494 } 494 495 495 496 void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) ··· 534 499 mark->free_mark = free_mark; 535 500 } 536 501 537 - static void fsnotify_mark_destroy(struct work_struct *work) 502 + /* 503 + * Destroy all marks in destroy_list, waits for SRCU period to finish before 504 + * actually freeing marks. 505 + */ 506 + void fsnotify_mark_destroy_list(void) 538 507 { 539 508 struct fsnotify_mark *mark, *next; 540 509 struct list_head private_destroy_list; ··· 554 515 list_del_init(&mark->g_list); 555 516 fsnotify_put_mark(mark); 556 517 } 518 + } 519 + 520 + static void fsnotify_mark_destroy_workfn(struct work_struct *work) 521 + { 522 + fsnotify_mark_destroy_list(); 557 523 }
-2
include/linux/fsnotify_backend.h
··· 359 359 extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); 360 360 /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ 361 361 extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); 362 - /* run all the marks in a group, and flag them to be freed */ 363 - extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); 364 362 extern void fsnotify_get_mark(struct fsnotify_mark *mark); 365 363 extern void fsnotify_put_mark(struct fsnotify_mark *mark); 366 364 extern void fsnotify_unmount_inodes(struct super_block *sb);