Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] inotify: lock avoidance with parent watch status in dentry

Previous inotify work avoidance is good when inotify is completely unused,
but it breaks down if even a single watch is in place anywhere in the
system. Robin Holt notices that udev is one such culprit - it slows down a
512-thread application on a 512 CPU system from 6 seconds to 22 minutes.

Solve this by adding a flag in the dentry that tells inotify whether or not
its parent inode has a watch on it. Event queueing to parent will skip
taking locks if this flag is cleared. Setting and clearing of this flag on
all child dentries versus event delivery: this is no worse in terms of race
cases, and that was shown to be equivalent to always performing the check.

The essential behaviour is that activity occurring _after_ a watch has been
added and _before_ it has been removed, will generate events.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Nick Piggin and committed by
Linus Torvalds
c32ccd87 bf36b901

+117 -10
+8
fs/dcache.c
··· 802 802 if (inode) 803 803 list_add(&entry->d_alias, &inode->i_dentry); 804 804 entry->d_inode = inode; 805 + fsnotify_d_instantiate(entry, inode); 805 806 spin_unlock(&dcache_lock); 806 807 security_d_instantiate(entry, inode); 807 808 } ··· 854 853 list_add(&entry->d_alias, &inode->i_dentry); 855 854 do_negative: 856 855 entry->d_inode = inode; 856 + fsnotify_d_instantiate(entry, inode); 857 857 spin_unlock(&dcache_lock); 858 858 security_d_instantiate(entry, inode); 859 859 return NULL; ··· 985 983 new = __d_find_alias(inode, 1); 986 984 if (new) { 987 985 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 986 + fsnotify_d_instantiate(new, inode); 988 987 spin_unlock(&dcache_lock); 989 988 security_d_instantiate(new, inode); 990 989 d_rehash(dentry); ··· 995 992 /* d_instantiate takes dcache_lock, so we do it by hand */ 996 993 list_add(&dentry->d_alias, &inode->i_dentry); 997 994 dentry->d_inode = inode; 995 + fsnotify_d_instantiate(dentry, inode); 998 996 spin_unlock(&dcache_lock); 999 997 security_d_instantiate(dentry, inode); 1000 998 d_rehash(dentry); ··· 1180 1176 spin_lock(&dentry->d_lock); 1181 1177 isdir = S_ISDIR(dentry->d_inode->i_mode); 1182 1178 if (atomic_read(&dentry->d_count) == 1) { 1179 + /* remove this and other inotify debug checks after 2.6.18 */ 1180 + dentry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED; 1181 + 1183 1182 dentry_iput(dentry); 1184 1183 fsnotify_nameremove(dentry, isdir); 1185 1184 return; ··· 1349 1342 1350 1343 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 1351 1344 spin_unlock(&target->d_lock); 1345 + fsnotify_d_move(dentry); 1352 1346 spin_unlock(&dentry->d_lock); 1353 1347 write_sequnlock(&rename_lock); 1354 1348 spin_unlock(&dcache_lock);
+77 -10
fs/inotify.c
··· 38 38 #include <asm/ioctls.h> 39 39 40 40 static atomic_t inotify_cookie; 41 - static atomic_t inotify_watches; 42 41 43 42 static kmem_cache_t *watch_cachep; 44 43 static kmem_cache_t *event_cachep; ··· 380 381 } 381 382 382 383 /* 384 + * inotify_inode_watched - returns nonzero if there are watches on this inode 385 + * and zero otherwise. We call this lockless, we do not care if we race. 386 + */ 387 + static inline int inotify_inode_watched(struct inode *inode) 388 + { 389 + return !list_empty(&inode->inotify_watches); 390 + } 391 + 392 + /* 393 + * Get child dentry flag into synch with parent inode. 394 + * Flag should always be clear for negative dentrys. 395 + */ 396 + static void set_dentry_child_flags(struct inode *inode, int watched) 397 + { 398 + struct dentry *alias; 399 + 400 + spin_lock(&dcache_lock); 401 + list_for_each_entry(alias, &inode->i_dentry, d_alias) { 402 + struct dentry *child; 403 + 404 + list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { 405 + if (!child->d_inode) { 406 + WARN_ON(child->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); 407 + continue; 408 + } 409 + spin_lock(&child->d_lock); 410 + if (watched) { 411 + WARN_ON(child->d_flags & 412 + DCACHE_INOTIFY_PARENT_WATCHED); 413 + child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; 414 + } else { 415 + WARN_ON(!(child->d_flags & 416 + DCACHE_INOTIFY_PARENT_WATCHED)); 417 + child->d_flags&=~DCACHE_INOTIFY_PARENT_WATCHED; 418 + } 419 + spin_unlock(&child->d_lock); 420 + } 421 + } 422 + spin_unlock(&dcache_lock); 423 + } 424 + 425 + /* 383 426 * create_watch - creates a watch on the given device. 384 427 * 385 428 * Callers must hold dev->mutex. Calls inotify_dev_get_wd() so may sleep. 
··· 467 426 get_inotify_watch(watch); 468 427 469 428 atomic_inc(&dev->user->inotify_watches); 470 - atomic_inc(&inotify_watches); 471 429 472 430 return watch; 473 431 } ··· 498 458 list_del(&watch->i_list); 499 459 list_del(&watch->d_list); 500 460 461 + if (!inotify_inode_watched(watch->inode)) 462 + set_dentry_child_flags(watch->inode, 0); 463 + 501 464 atomic_dec(&dev->user->inotify_watches); 502 - atomic_dec(&inotify_watches); 503 465 idr_remove(&dev->idr, watch->wd); 504 466 put_inotify_watch(watch); 505 467 } ··· 523 481 remove_watch_no_event(watch, dev); 524 482 } 525 483 484 + /* Kernel API */ 485 + 526 486 /* 527 - * inotify_inode_watched - returns nonzero if there are watches on this inode 528 - * and zero otherwise. We call this lockless, we do not care if we race. 487 + * inotify_d_instantiate - instantiate dcache entry for inode 529 488 */ 530 - static inline int inotify_inode_watched(struct inode *inode) 489 + void inotify_d_instantiate(struct dentry *entry, struct inode *inode) 531 490 { 532 - return !list_empty(&inode->inotify_watches); 491 + struct dentry *parent; 492 + 493 + if (!inode) 494 + return; 495 + 496 + WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); 497 + spin_lock(&entry->d_lock); 498 + parent = entry->d_parent; 499 + if (inotify_inode_watched(parent->d_inode)) 500 + entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; 501 + spin_unlock(&entry->d_lock); 533 502 } 534 503 535 - /* Kernel API */ 504 + /* 505 + * inotify_d_move - dcache entry has been moved 506 + */ 507 + void inotify_d_move(struct dentry *entry) 508 + { 509 + struct dentry *parent; 510 + 511 + parent = entry->d_parent; 512 + if (inotify_inode_watched(parent->d_inode)) 513 + entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; 514 + else 515 + entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED; 516 + } 536 517 537 518 /** 538 519 * inotify_inode_queue_event - queue an event to all watches on this inode ··· 603 538 struct dentry *parent; 604 539 struct inode *inode; 605 
540 606 - if (!atomic_read (&inotify_watches)) 541 + if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED)) 607 542 return; 608 543 609 544 spin_lock(&dentry->d_lock); ··· 1058 993 goto out; 1059 994 } 1060 995 996 + if (!inotify_inode_watched(inode)) 997 + set_dentry_child_flags(inode, 1); 998 + 1061 999 /* Add the watch to the device's and the inode's list */ 1062 1000 list_add(&watch->d_list, &dev->watches); 1063 1001 list_add(&watch->i_list, &inode->inotify_watches); ··· 1133 1065 inotify_max_user_watches = 8192; 1134 1066 1135 1067 atomic_set(&inotify_cookie, 0); 1136 - atomic_set(&inotify_watches, 0); 1137 1068 1138 1069 watch_cachep = kmem_cache_create("inotify_watch_cache", 1139 1070 sizeof(struct inotify_watch),
+2
include/linux/dcache.h
··· 162 162 #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ 163 163 #define DCACHE_UNHASHED 0x0010 164 164 165 + #define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ 166 + 165 167 extern spinlock_t dcache_lock; 166 168 167 169 /**
+19
include/linux/fsnotify.h
··· 17 17 #include <linux/inotify.h> 18 18 19 19 /* 20 + * fsnotify_d_instantiate - instantiate a dentry for inode 21 + * Called with dcache_lock held. 22 + */ 23 + static inline void fsnotify_d_instantiate(struct dentry *entry, 24 + struct inode *inode) 25 + { 26 + inotify_d_instantiate(entry, inode); 27 + } 28 + 29 + /* 30 + * fsnotify_d_move - entry has been moved 31 + * Called with dcache_lock and entry->d_lock held. 32 + */ 33 + static inline void fsnotify_d_move(struct dentry *entry) 34 + { 35 + inotify_d_move(entry); 36 + } 37 + 38 + /* 20 39 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir 21 40 */ 22 41 static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
+11
include/linux/inotify.h
··· 71 71 72 72 #ifdef CONFIG_INOTIFY 73 73 74 + extern void inotify_d_instantiate(struct dentry *, struct inode *); 75 + extern void inotify_d_move(struct dentry *); 74 76 extern void inotify_inode_queue_event(struct inode *, __u32, __u32, 75 77 const char *); 76 78 extern void inotify_dentry_parent_queue_event(struct dentry *, __u32, __u32, ··· 82 80 extern u32 inotify_get_cookie(void); 83 81 84 82 #else 83 + 84 + static inline void inotify_d_instantiate(struct dentry *dentry, 85 + struct inode *inode) 86 + { 87 + } 88 + 89 + static inline void inotify_d_move(struct dentry *dentry) 90 + { 91 + } 85 92 86 93 static inline void inotify_inode_queue_event(struct inode *inode, 87 94 __u32 mask, __u32 cookie,