Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs inode / dentry updates from Christian Brauner:
"This contains smaller performance improvements to inodes and dentries:

inode:

- Add rcu based inode lookup variants.

They avoid one inode hash lock acquire in the common case thereby
significantly reducing contention. We already support RCU-based
operations but didn't take advantage of them during inode
insertion.

Callers of iget_locked() get the improvement without any code
changes. Callers that need a custom callback can switch to
iget5_locked_rcu() as e.g., did btrfs.

With 20 threads each walking a dedicated 1000 dirs * 1000 files
directory tree to stat(2) on a 32 core + 24GB ram vm:

before: 3.54s user 892.30s system 1966% cpu 45.549 total
after: 3.28s user 738.66s system 1955% cpu 37.932 total (-16.7%)

Long-term we should pick up the effort to introduce more
fine-grained locking and possibly improve on the currently used
hash implementation.

- Start zeroing i_state in inode_init_always() instead of doing it in
individual filesystems.

This allows us to remove an unneeded lock acquire in new_inode()
and not burden individual filesystems with this.

dcache:

- Move d_lockref out of the area used by RCU lookup to avoid
cacheline ping pong because the embedded name is sharing a
cacheline with d_lockref.

- Fix dentry size on 32bit with CONFIG_SMP=y so it does actually end
up with 128 bytes in total"

* tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs: fix dentry size
vfs: move d_lockref out of the area used by RCU lookup
bcachefs: remove now spurious i_state initialization
xfs: remove now spurious i_state initialization in xfs_inode_alloc
vfs: partially sanitize i_state zeroing on inode creation
xfs: preserve i_state around inode_init_always in xfs_reinit_inode
btrfs: use iget5_locked_rcu
vfs: add rcu-based find_inode variants for iget ops

+99 -33
-1
fs/bcachefs/fs.c
··· 244 244 inode->ei_flags = 0; 245 245 mutex_init(&inode->ei_quota_lock); 246 246 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 247 - inode->v.i_state = 0; 248 247 249 248 if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { 250 249 kmem_cache_free(bch2_inode_cache, inode);
+1 -1
fs/btrfs/inode.c
··· 5587 5587 args.ino = ino; 5588 5588 args.root = root; 5589 5589 5590 - inode = iget5_locked(s, hashval, btrfs_find_actor, 5590 + inode = iget5_locked_rcu(s, hashval, btrfs_find_actor, 5591 5591 btrfs_init_locked_inode, 5592 5592 (void *)&args); 5593 5593 return inode;
+82 -26
fs/inode.c
··· 162 162 inode->i_sb = sb; 163 163 inode->i_blkbits = sb->s_blocksize_bits; 164 164 inode->i_flags = 0; 165 + inode->i_state = 0; 165 166 atomic64_set(&inode->i_sequence, 0); 166 167 atomic_set(&inode->i_count, 1); 167 168 inode->i_op = &empty_iops; ··· 232 231 233 232 if (unlikely(security_inode_alloc(inode))) 234 233 return -ENOMEM; 234 + 235 235 this_cpu_inc(nr_inodes); 236 236 237 237 return 0; ··· 888 886 return freed; 889 887 } 890 888 891 - static void __wait_on_freeing_inode(struct inode *inode); 889 + static void __wait_on_freeing_inode(struct inode *inode, bool locked); 892 890 /* 893 891 * Called with the inode lock held. 894 892 */ 895 893 static struct inode *find_inode(struct super_block *sb, 896 894 struct hlist_head *head, 897 895 int (*test)(struct inode *, void *), 898 - void *data) 896 + void *data, bool locked) 899 897 { 900 898 struct inode *inode = NULL; 901 899 900 + if (locked) 901 + lockdep_assert_held(&inode_hash_lock); 902 + else 903 + lockdep_assert_not_held(&inode_hash_lock); 904 + 905 + rcu_read_lock(); 902 906 repeat: 903 - hlist_for_each_entry(inode, head, i_hash) { 907 + hlist_for_each_entry_rcu(inode, head, i_hash) { 904 908 if (inode->i_sb != sb) 905 909 continue; 906 910 if (!test(inode, data)) 907 911 continue; 908 912 spin_lock(&inode->i_lock); 909 913 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 910 - __wait_on_freeing_inode(inode); 914 + __wait_on_freeing_inode(inode, locked); 911 915 goto repeat; 912 916 } 913 917 if (unlikely(inode->i_state & I_CREATING)) { 914 918 spin_unlock(&inode->i_lock); 919 + rcu_read_unlock(); 915 920 return ERR_PTR(-ESTALE); 916 921 } 917 922 __iget(inode); 918 923 spin_unlock(&inode->i_lock); 924 + rcu_read_unlock(); 919 925 return inode; 920 926 } 927 + rcu_read_unlock(); 921 928 return NULL; 922 929 } 923 930 ··· 935 924 * iget_locked for details. 
936 925 */ 937 926 static struct inode *find_inode_fast(struct super_block *sb, 938 - struct hlist_head *head, unsigned long ino) 927 + struct hlist_head *head, unsigned long ino, 928 + bool locked) 939 929 { 940 930 struct inode *inode = NULL; 941 931 932 + if (locked) 933 + lockdep_assert_held(&inode_hash_lock); 934 + else 935 + lockdep_assert_not_held(&inode_hash_lock); 936 + 937 + rcu_read_lock(); 942 938 repeat: 943 - hlist_for_each_entry(inode, head, i_hash) { 939 + hlist_for_each_entry_rcu(inode, head, i_hash) { 944 940 if (inode->i_ino != ino) 945 941 continue; 946 942 if (inode->i_sb != sb) 947 943 continue; 948 944 spin_lock(&inode->i_lock); 949 945 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 950 - __wait_on_freeing_inode(inode); 946 + __wait_on_freeing_inode(inode, locked); 951 947 goto repeat; 952 948 } 953 949 if (unlikely(inode->i_state & I_CREATING)) { 954 950 spin_unlock(&inode->i_lock); 951 + rcu_read_unlock(); 955 952 return ERR_PTR(-ESTALE); 956 953 } 957 954 __iget(inode); 958 955 spin_unlock(&inode->i_lock); 956 + rcu_read_unlock(); 959 957 return inode; 960 958 } 959 + rcu_read_unlock(); 961 960 return NULL; 962 961 } 963 962 ··· 1025 1004 */ 1026 1005 struct inode *new_inode_pseudo(struct super_block *sb) 1027 1006 { 1028 - struct inode *inode = alloc_inode(sb); 1029 - 1030 - if (inode) { 1031 - spin_lock(&inode->i_lock); 1032 - inode->i_state = 0; 1033 - spin_unlock(&inode->i_lock); 1034 - } 1035 - return inode; 1007 + return alloc_inode(sb); 1036 1008 } 1037 1009 1038 1010 /** ··· 1175 1161 1176 1162 again: 1177 1163 spin_lock(&inode_hash_lock); 1178 - old = find_inode(inode->i_sb, head, test, data); 1164 + old = find_inode(inode->i_sb, head, test, data, true); 1179 1165 if (unlikely(old)) { 1180 1166 /* 1181 1167 * Uhhuh, somebody else created the same inode under us. 
··· 1249 1235 struct inode *new = alloc_inode(sb); 1250 1236 1251 1237 if (new) { 1252 - new->i_state = 0; 1253 1238 inode = inode_insert5(new, hashval, test, set, data); 1254 1239 if (unlikely(inode != new)) 1255 1240 destroy_inode(new); ··· 1257 1244 return inode; 1258 1245 } 1259 1246 EXPORT_SYMBOL(iget5_locked); 1247 + 1248 + /** 1249 + * iget5_locked_rcu - obtain an inode from a mounted file system 1250 + * @sb: super block of file system 1251 + * @hashval: hash value (usually inode number) to get 1252 + * @test: callback used for comparisons between inodes 1253 + * @set: callback used to initialize a new struct inode 1254 + * @data: opaque data pointer to pass to @test and @set 1255 + * 1256 + * This is equivalent to iget5_locked, except the @test callback must 1257 + * tolerate the inode not being stable, including being mid-teardown. 1258 + */ 1259 + struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, 1260 + int (*test)(struct inode *, void *), 1261 + int (*set)(struct inode *, void *), void *data) 1262 + { 1263 + struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1264 + struct inode *inode, *new; 1265 + 1266 + again: 1267 + inode = find_inode(sb, head, test, data, false); 1268 + if (inode) { 1269 + if (IS_ERR(inode)) 1270 + return NULL; 1271 + wait_on_inode(inode); 1272 + if (unlikely(inode_unhashed(inode))) { 1273 + iput(inode); 1274 + goto again; 1275 + } 1276 + return inode; 1277 + } 1278 + 1279 + new = alloc_inode(sb); 1280 + if (new) { 1281 + inode = inode_insert5(new, hashval, test, set, data); 1282 + if (unlikely(inode != new)) 1283 + destroy_inode(new); 1284 + } 1285 + return inode; 1286 + } 1287 + EXPORT_SYMBOL_GPL(iget5_locked_rcu); 1260 1288 1261 1289 /** 1262 1290 * iget_locked - obtain an inode from a mounted file system ··· 1317 1263 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1318 1264 struct inode *inode; 1319 1265 again: 1320 - spin_lock(&inode_hash_lock); 1321 - inode = 
find_inode_fast(sb, head, ino); 1322 - spin_unlock(&inode_hash_lock); 1266 + inode = find_inode_fast(sb, head, ino, false); 1323 1267 if (inode) { 1324 1268 if (IS_ERR(inode)) 1325 1269 return NULL; ··· 1335 1283 1336 1284 spin_lock(&inode_hash_lock); 1337 1285 /* We released the lock, so.. */ 1338 - old = find_inode_fast(sb, head, ino); 1286 + old = find_inode_fast(sb, head, ino, true); 1339 1287 if (!old) { 1340 1288 inode->i_ino = ino; 1341 1289 spin_lock(&inode->i_lock); ··· 1471 1419 struct inode *inode; 1472 1420 1473 1421 spin_lock(&inode_hash_lock); 1474 - inode = find_inode(sb, head, test, data); 1422 + inode = find_inode(sb, head, test, data, true); 1475 1423 spin_unlock(&inode_hash_lock); 1476 1424 1477 1425 return IS_ERR(inode) ? NULL : inode; ··· 1526 1474 struct inode *inode; 1527 1475 again: 1528 1476 spin_lock(&inode_hash_lock); 1529 - inode = find_inode_fast(sb, head, ino); 1477 + inode = find_inode_fast(sb, head, ino, true); 1530 1478 spin_unlock(&inode_hash_lock); 1531 1479 1532 1480 if (inode) { ··· 2287 2235 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list 2288 2236 * will DTRT. 2289 2237 */ 2290 - static void __wait_on_freeing_inode(struct inode *inode) 2238 + static void __wait_on_freeing_inode(struct inode *inode, bool locked) 2291 2239 { 2292 2240 wait_queue_head_t *wq; 2293 2241 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 2294 2242 wq = bit_waitqueue(&inode->i_state, __I_NEW); 2295 2243 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2296 2244 spin_unlock(&inode->i_lock); 2297 - spin_unlock(&inode_hash_lock); 2245 + rcu_read_unlock(); 2246 + if (locked) 2247 + spin_unlock(&inode_hash_lock); 2298 2248 schedule(); 2299 2249 finish_wait(wq, &wait.wq_entry); 2300 - spin_lock(&inode_hash_lock); 2250 + if (locked) 2251 + spin_lock(&inode_hash_lock); 2252 + rcu_read_lock(); 2301 2253 } 2302 2254 2303 2255 static __initdata unsigned long ihash_entries;
+3 -2
fs/xfs/xfs_icache.c
··· 86 86 return NULL; 87 87 } 88 88 89 - /* VFS doesn't initialise i_mode or i_state! */ 89 + /* VFS doesn't initialise i_mode! */ 90 90 VFS_I(ip)->i_mode = 0; 91 - VFS_I(ip)->i_state = 0; 92 91 mapping_set_large_folios(VFS_I(ip)->i_mapping); 93 92 94 93 XFS_STATS_INC(mp, vn_active); ··· 313 314 dev_t dev = inode->i_rdev; 314 315 kuid_t uid = inode->i_uid; 315 316 kgid_t gid = inode->i_gid; 317 + unsigned long state = inode->i_state; 316 318 317 319 error = inode_init_always(mp->m_super, inode); 318 320 ··· 324 324 inode->i_rdev = dev; 325 325 inode->i_uid = uid; 326 326 inode->i_gid = gid; 327 + inode->i_state = state; 327 328 mapping_set_large_folios(inode->i_mapping); 328 329 return error; 329 330 }
+7 -2
include/linux/dcache.h
··· 71 71 # define DNAME_INLINE_LEN 40 /* 192 bytes */ 72 72 #else 73 73 # ifdef CONFIG_SMP 74 - # define DNAME_INLINE_LEN 40 /* 128 bytes */ 74 + # define DNAME_INLINE_LEN 36 /* 128 bytes */ 75 75 # else 76 76 # define DNAME_INLINE_LEN 44 /* 128 bytes */ 77 77 # endif ··· 89 89 struct inode *d_inode; /* Where the name belongs to - NULL is 90 90 * negative */ 91 91 unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ 92 + /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ 92 93 93 94 /* Ref lookup also touches following */ 94 - struct lockref d_lockref; /* per-dentry lock and refcount */ 95 95 const struct dentry_operations *d_op; 96 96 struct super_block *d_sb; /* The root of the dentry tree */ 97 97 unsigned long d_time; /* used by d_revalidate */ 98 98 void *d_fsdata; /* fs-specific data */ 99 + /* --- cacheline 2 boundary (128 bytes) --- */ 100 + struct lockref d_lockref; /* per-dentry lock and refcount 101 + * keep separate from RCU lookup area if 102 + * possible! 103 + */ 99 104 100 105 union { 101 106 struct list_head d_lru; /* LRU list */
+6 -1
include/linux/fs.h
··· 3047 3047 int (*test)(struct inode *, void *), 3048 3048 int (*set)(struct inode *, void *), 3049 3049 void *data); 3050 - extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); 3050 + struct inode *iget5_locked(struct super_block *, unsigned long, 3051 + int (*test)(struct inode *, void *), 3052 + int (*set)(struct inode *, void *), void *); 3053 + struct inode *iget5_locked_rcu(struct super_block *, unsigned long, 3054 + int (*test)(struct inode *, void *), 3055 + int (*set)(struct inode *, void *), void *); 3051 3056 extern struct inode * iget_locked(struct super_block *, unsigned long); 3052 3057 extern struct inode *find_inode_nowait(struct super_block *, 3053 3058 unsigned long,