Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

inode: convert inode_sb_list_lock to per-sb

The process of reducing contention on per-superblock inode lists
starts with moving the locking to match the per-superblock inode
list. This takes the global lock out of the picture and reduces the
contention problems to within a single filesystem. This doesn't get
rid of contention as the locks still have global CPU scope, but it
does isolate operations on different superblocks from each other.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>

authored by

Dave Chinner and committed by
Josef Bacik
74278da9 cbedaac6

+57 -54
+6 -6
fs/block_dev.c
··· 1769 1769 { 1770 1770 struct inode *inode, *old_inode = NULL; 1771 1771 1772 - spin_lock(&inode_sb_list_lock); 1772 + spin_lock(&blockdev_superblock->s_inode_list_lock); 1773 1773 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { 1774 1774 struct address_space *mapping = inode->i_mapping; 1775 1775 ··· 1781 1781 } 1782 1782 __iget(inode); 1783 1783 spin_unlock(&inode->i_lock); 1784 - spin_unlock(&inode_sb_list_lock); 1784 + spin_unlock(&blockdev_superblock->s_inode_list_lock); 1785 1785 /* 1786 1786 * We hold a reference to 'inode' so it couldn't have been 1787 1787 * removed from s_inodes list while we dropped the 1788 - * inode_sb_list_lock. We cannot iput the inode now as we can 1788 + * s_inode_list_lock We cannot iput the inode now as we can 1789 1789 * be holding the last reference and we cannot iput it under 1790 - * inode_sb_list_lock. So we keep the reference and iput it 1790 + * s_inode_list_lock. So we keep the reference and iput it 1791 1791 * later. 1792 1792 */ 1793 1793 iput(old_inode); ··· 1795 1795 1796 1796 func(I_BDEV(inode), arg); 1797 1797 1798 - spin_lock(&inode_sb_list_lock); 1798 + spin_lock(&blockdev_superblock->s_inode_list_lock); 1799 1799 } 1800 - spin_unlock(&inode_sb_list_lock); 1800 + spin_unlock(&blockdev_superblock->s_inode_list_lock); 1801 1801 iput(old_inode); 1802 1802 }
+6 -4
fs/drop_caches.c
··· 17 17 { 18 18 struct inode *inode, *toput_inode = NULL; 19 19 20 - spin_lock(&inode_sb_list_lock); 20 + spin_lock(&sb->s_inode_list_lock); 21 21 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 22 22 spin_lock(&inode->i_lock); 23 23 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || ··· 27 27 } 28 28 __iget(inode); 29 29 spin_unlock(&inode->i_lock); 30 - spin_unlock(&inode_sb_list_lock); 30 + spin_unlock(&sb->s_inode_list_lock); 31 + 31 32 invalidate_mapping_pages(inode->i_mapping, 0, -1); 32 33 iput(toput_inode); 33 34 toput_inode = inode; 34 - spin_lock(&inode_sb_list_lock); 35 + 36 + spin_lock(&sb->s_inode_list_lock); 35 37 } 36 - spin_unlock(&inode_sb_list_lock); 38 + spin_unlock(&sb->s_inode_list_lock); 37 39 iput(toput_inode); 38 40 } 39 41
+6 -6
fs/fs-writeback.c
··· 2124 2124 */ 2125 2125 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 2126 2126 2127 - spin_lock(&inode_sb_list_lock); 2127 + spin_lock(&sb->s_inode_list_lock); 2128 2128 2129 2129 /* 2130 2130 * Data integrity sync. Must wait for all pages under writeback, ··· 2144 2144 } 2145 2145 __iget(inode); 2146 2146 spin_unlock(&inode->i_lock); 2147 - spin_unlock(&inode_sb_list_lock); 2147 + spin_unlock(&sb->s_inode_list_lock); 2148 2148 2149 2149 /* 2150 2150 * We hold a reference to 'inode' so it couldn't have been 2151 2151 * removed from s_inodes list while we dropped the 2152 - * inode_sb_list_lock. We cannot iput the inode now as we can 2152 + * s_inode_list_lock. We cannot iput the inode now as we can 2153 2153 * be holding the last reference and we cannot iput it under 2154 - * inode_sb_list_lock. So we keep the reference and iput it 2154 + * s_inode_list_lock. So we keep the reference and iput it 2155 2155 * later. 2156 2156 */ 2157 2157 iput(old_inode); ··· 2161 2161 2162 2162 cond_resched(); 2163 2163 2164 - spin_lock(&inode_sb_list_lock); 2164 + spin_lock(&sb->s_inode_list_lock); 2165 2165 } 2166 - spin_unlock(&inode_sb_list_lock); 2166 + spin_unlock(&sb->s_inode_list_lock); 2167 2167 iput(old_inode); 2168 2168 } 2169 2169
+13 -15
fs/inode.c
··· 28 28 * inode->i_state, inode->i_hash, __iget() 29 29 * Inode LRU list locks protect: 30 30 * inode->i_sb->s_inode_lru, inode->i_lru 31 - * inode_sb_list_lock protects: 32 - * sb->s_inodes, inode->i_sb_list 31 + * inode->i_sb->s_inode_list_lock protects: 32 + * inode->i_sb->s_inodes, inode->i_sb_list 33 33 * bdi->wb.list_lock protects: 34 34 * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list 35 35 * inode_hash_lock protects: ··· 37 37 * 38 38 * Lock ordering: 39 39 * 40 - * inode_sb_list_lock 40 + * inode->i_sb->s_inode_list_lock 41 41 * inode->i_lock 42 42 * Inode LRU list locks 43 43 * ··· 45 45 * inode->i_lock 46 46 * 47 47 * inode_hash_lock 48 - * inode_sb_list_lock 48 + * inode->i_sb->s_inode_list_lock 49 49 * inode->i_lock 50 50 * 51 51 * iunique_lock ··· 56 56 static unsigned int i_hash_shift __read_mostly; 57 57 static struct hlist_head *inode_hashtable __read_mostly; 58 58 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 59 - 60 - __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); 61 59 62 60 /* 63 61 * Empty aops. 
Can be used for the cases where the user does not ··· 424 426 */ 425 427 void inode_sb_list_add(struct inode *inode) 426 428 { 427 - spin_lock(&inode_sb_list_lock); 429 + spin_lock(&inode->i_sb->s_inode_list_lock); 428 430 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); 429 - spin_unlock(&inode_sb_list_lock); 431 + spin_unlock(&inode->i_sb->s_inode_list_lock); 430 432 } 431 433 EXPORT_SYMBOL_GPL(inode_sb_list_add); 432 434 433 435 static inline void inode_sb_list_del(struct inode *inode) 434 436 { 435 437 if (!list_empty(&inode->i_sb_list)) { 436 - spin_lock(&inode_sb_list_lock); 438 + spin_lock(&inode->i_sb->s_inode_list_lock); 437 439 list_del_init(&inode->i_sb_list); 438 - spin_unlock(&inode_sb_list_lock); 440 + spin_unlock(&inode->i_sb->s_inode_list_lock); 439 441 } 440 442 } 441 443 ··· 592 594 struct inode *inode, *next; 593 595 LIST_HEAD(dispose); 594 596 595 - spin_lock(&inode_sb_list_lock); 597 + spin_lock(&sb->s_inode_list_lock); 596 598 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 597 599 if (atomic_read(&inode->i_count)) 598 600 continue; ··· 608 610 spin_unlock(&inode->i_lock); 609 611 list_add(&inode->i_lru, &dispose); 610 612 } 611 - spin_unlock(&inode_sb_list_lock); 613 + spin_unlock(&sb->s_inode_list_lock); 612 614 613 615 dispose_list(&dispose); 614 616 } ··· 629 631 struct inode *inode, *next; 630 632 LIST_HEAD(dispose); 631 633 632 - spin_lock(&inode_sb_list_lock); 634 + spin_lock(&sb->s_inode_list_lock); 633 635 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 634 636 spin_lock(&inode->i_lock); 635 637 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { ··· 652 654 spin_unlock(&inode->i_lock); 653 655 list_add(&inode->i_lru, &dispose); 654 656 } 655 - spin_unlock(&inode_sb_list_lock); 657 + spin_unlock(&sb->s_inode_list_lock); 656 658 657 659 dispose_list(&dispose); 658 660 ··· 888 890 { 889 891 struct inode *inode; 890 892 891 - spin_lock_prefetch(&inode_sb_list_lock); 893 + 
spin_lock_prefetch(&sb->s_inode_list_lock); 892 894 893 895 inode = new_inode_pseudo(sb); 894 896 if (inode)
-1
fs/internal.h
··· 112 112 /* 113 113 * inode.c 114 114 */ 115 - extern spinlock_t inode_sb_list_lock; 116 115 extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); 117 116 extern void inode_add_lru(struct inode *inode); 118 117
+10 -10
fs/notify/inode_mark.c
··· 163 163 164 164 /** 165 165 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. 166 - * @list: list of inodes being unmounted (sb->s_inodes) 166 + * @sb: superblock being unmounted. 167 167 * 168 168 * Called during unmount with no locks held, so needs to be safe against 169 - * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. 169 + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. 170 170 */ 171 - void fsnotify_unmount_inodes(struct list_head *list) 171 + void fsnotify_unmount_inodes(struct super_block *sb) 172 172 { 173 173 struct inode *inode, *next_i, *need_iput = NULL; 174 174 175 - spin_lock(&inode_sb_list_lock); 176 - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 175 + spin_lock(&sb->s_inode_list_lock); 176 + list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { 177 177 struct inode *need_iput_tmp; 178 178 179 179 /* ··· 209 209 spin_unlock(&inode->i_lock); 210 210 211 211 /* In case the dropping of a reference would nuke next_i. */ 212 - while (&next_i->i_sb_list != list) { 212 + while (&next_i->i_sb_list != &sb->s_inodes) { 213 213 spin_lock(&next_i->i_lock); 214 214 if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && 215 215 atomic_read(&next_i->i_count)) { ··· 224 224 } 225 225 226 226 /* 227 - * We can safely drop inode_sb_list_lock here because either 227 + * We can safely drop s_inode_list_lock here because either 228 228 * we actually hold references on both inode and next_i or 229 229 * end of list. Also no new inodes will be added since the 230 230 * umount has begun. 
231 231 */ 232 - spin_unlock(&inode_sb_list_lock); 232 + spin_unlock(&sb->s_inode_list_lock); 233 233 234 234 if (need_iput_tmp) 235 235 iput(need_iput_tmp); ··· 241 241 242 242 iput(inode); 243 243 244 - spin_lock(&inode_sb_list_lock); 244 + spin_lock(&sb->s_inode_list_lock); 245 245 } 246 - spin_unlock(&inode_sb_list_lock); 246 + spin_unlock(&sb->s_inode_list_lock); 247 247 }
+8 -8
fs/quota/dquot.c
··· 923 923 int reserved = 0; 924 924 #endif 925 925 926 - spin_lock(&inode_sb_list_lock); 926 + spin_lock(&sb->s_inode_list_lock); 927 927 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 928 928 spin_lock(&inode->i_lock); 929 929 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || ··· 934 934 } 935 935 __iget(inode); 936 936 spin_unlock(&inode->i_lock); 937 - spin_unlock(&inode_sb_list_lock); 937 + spin_unlock(&sb->s_inode_list_lock); 938 938 939 939 #ifdef CONFIG_QUOTA_DEBUG 940 940 if (unlikely(inode_get_rsv_space(inode) > 0)) ··· 946 946 /* 947 947 * We hold a reference to 'inode' so it couldn't have been 948 948 * removed from s_inodes list while we dropped the 949 - * inode_sb_list_lock We cannot iput the inode now as we can be 949 + * s_inode_list_lock. We cannot iput the inode now as we can be 950 950 * holding the last reference and we cannot iput it under 951 - * inode_sb_list_lock. So we keep the reference and iput it 951 + * s_inode_list_lock. So we keep the reference and iput it 952 952 * later. 953 953 */ 954 954 old_inode = inode; 955 - spin_lock(&inode_sb_list_lock); 955 + spin_lock(&sb->s_inode_list_lock); 956 956 } 957 - spin_unlock(&inode_sb_list_lock); 957 + spin_unlock(&sb->s_inode_list_lock); 958 958 iput(old_inode); 959 959 960 960 #ifdef CONFIG_QUOTA_DEBUG ··· 1023 1023 struct inode *inode; 1024 1024 int reserved = 0; 1025 1025 1026 - spin_lock(&inode_sb_list_lock); 1026 + spin_lock(&sb->s_inode_list_lock); 1027 1027 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1028 1028 /* 1029 1029 * We have to scan also I_NEW inodes because they can already ··· 1039 1039 } 1040 1040 spin_unlock(&dq_data_lock); 1041 1041 } 1042 - spin_unlock(&inode_sb_list_lock); 1042 + spin_unlock(&sb->s_inode_list_lock); 1043 1043 #ifdef CONFIG_QUOTA_DEBUG 1044 1044 if (reserved) { 1045 1045 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
+2 -1
fs/super.c
··· 191 191 INIT_HLIST_NODE(&s->s_instances); 192 192 INIT_HLIST_BL_HEAD(&s->s_anon); 193 193 INIT_LIST_HEAD(&s->s_inodes); 194 + spin_lock_init(&s->s_inode_list_lock); 194 195 195 196 if (list_lru_init_memcg(&s->s_dentry_lru)) 196 197 goto fail; ··· 400 399 sync_filesystem(sb); 401 400 sb->s_flags &= ~MS_ACTIVE; 402 401 403 - fsnotify_unmount_inodes(&sb->s_inodes); 402 + fsnotify_unmount_inodes(sb); 404 403 405 404 evict_inodes(sb); 406 405
+4 -1
include/linux/fs.h
··· 1309 1309 #endif 1310 1310 const struct xattr_handler **s_xattr; 1311 1311 1312 - struct list_head s_inodes; /* all inodes */ 1313 1312 struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ 1314 1313 struct list_head s_mounts; /* list of mounts; _not_ for fs use */ 1315 1314 struct block_device *s_bdev; ··· 1379 1380 * Indicates how deep in a filesystem stack this SB is 1380 1381 */ 1381 1382 int s_stack_depth; 1383 + 1384 + /* s_inode_list_lock protects s_inodes */ 1385 + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; 1386 + struct list_head s_inodes; /* all inodes */ 1382 1387 }; 1383 1388 1384 1389 extern struct timespec current_fs_time(struct super_block *sb);
+2 -2
include/linux/fsnotify_backend.h
··· 357 357 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); 358 358 extern void fsnotify_get_mark(struct fsnotify_mark *mark); 359 359 extern void fsnotify_put_mark(struct fsnotify_mark *mark); 360 - extern void fsnotify_unmount_inodes(struct list_head *list); 360 + extern void fsnotify_unmount_inodes(struct super_block *sb); 361 361 362 362 /* put here because inotify does some weird stuff when destroying watches */ 363 363 extern void fsnotify_init_event(struct fsnotify_event *event, ··· 393 393 return 0; 394 394 } 395 395 396 - static inline void fsnotify_unmount_inodes(struct list_head *list) 396 + static inline void fsnotify_unmount_inodes(struct super_block *sb) 397 397 {} 398 398 399 399 #endif /* CONFIG_FSNOTIFY */