Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

new primitive: discard_new_inode()

We don't want open-by-handle picking up a half-set-up in-core
struct inode left behind by e.g. mkdir() having failed halfway through.
In other words, we don't want such inodes returned by iget_locked()
on their way to extinction. However, we can't just have them
unhashed - otherwise open-by-handle immediately *after* that would've
ended up creating a new in-core inode over the on-disk one that
is in the process of being freed right under us.

Solution: new flag (I_CREATING) set by insert_inode_locked() and
removed by unlock_new_inode() and a new primitive (discard_new_inode())
to be used by such halfway-through-setup failure exits instead of
unlock_new_inode() / iput() combinations. That primitive unlocks the new
inode, but leaves I_CREATING in place.

iget_locked() treats finding an I_CREATING inode as a failure
(-ESTALE, once we sort out the error propagation).
insert_inode_locked() treats the same as an instant -EBUSY.
ilookup() treats those as an icache miss.

[Fix by Dan Carpenter <dan.carpenter@oracle.com> folded in]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Al Viro c2b6d621 c971e6a0

+47 -6
+1 -1
fs/dcache.c
··· 1892 1892 spin_lock(&inode->i_lock); 1893 1893 __d_instantiate(entry, inode); 1894 1894 WARN_ON(!(inode->i_state & I_NEW)); 1895 - inode->i_state &= ~I_NEW; 1895 + inode->i_state &= ~I_NEW & ~I_CREATING; 1896 1896 smp_mb(); 1897 1897 wake_up_bit(&inode->i_state, __I_NEW); 1898 1898 spin_unlock(&inode->i_lock);
+41 -4
fs/inode.c
··· 804 804 __wait_on_freeing_inode(inode); 805 805 goto repeat; 806 806 } 807 + if (unlikely(inode->i_state & I_CREATING)) { 808 + spin_unlock(&inode->i_lock); 809 + return ERR_PTR(-ESTALE); 810 + } 807 811 __iget(inode); 808 812 spin_unlock(&inode->i_lock); 809 813 return inode; ··· 834 830 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 835 831 __wait_on_freeing_inode(inode); 836 832 goto repeat; 833 + } 834 + if (unlikely(inode->i_state & I_CREATING)) { 835 + spin_unlock(&inode->i_lock); 836 + return ERR_PTR(-ESTALE); 837 837 } 838 838 __iget(inode); 839 839 spin_unlock(&inode->i_lock); ··· 969 961 lockdep_annotate_inode_mutex_key(inode); 970 962 spin_lock(&inode->i_lock); 971 963 WARN_ON(!(inode->i_state & I_NEW)); 972 - inode->i_state &= ~I_NEW; 964 + inode->i_state &= ~I_NEW & ~I_CREATING; 973 965 smp_mb(); 974 966 wake_up_bit(&inode->i_state, __I_NEW); 975 967 spin_unlock(&inode->i_lock); 976 968 } 977 969 EXPORT_SYMBOL(unlock_new_inode); 970 + 971 + void discard_new_inode(struct inode *inode) 972 + { 973 + lockdep_annotate_inode_mutex_key(inode); 974 + spin_lock(&inode->i_lock); 975 + WARN_ON(!(inode->i_state & I_NEW)); 976 + inode->i_state &= ~I_NEW; 977 + smp_mb(); 978 + wake_up_bit(&inode->i_state, __I_NEW); 979 + spin_unlock(&inode->i_lock); 980 + iput(inode); 981 + } 982 + EXPORT_SYMBOL(discard_new_inode); 978 983 979 984 /** 980 985 * lock_two_nondirectories - take two i_mutexes on non-directory objects ··· 1060 1039 * Use the old inode instead of the preallocated one. 
1061 1040 */ 1062 1041 spin_unlock(&inode_hash_lock); 1042 + if (IS_ERR(old)) 1043 + return NULL; 1063 1044 wait_on_inode(old); 1064 1045 if (unlikely(inode_unhashed(old))) { 1065 1046 iput(old); ··· 1151 1128 inode = find_inode_fast(sb, head, ino); 1152 1129 spin_unlock(&inode_hash_lock); 1153 1130 if (inode) { 1131 + if (IS_ERR(inode)) 1132 + return NULL; 1154 1133 wait_on_inode(inode); 1155 1134 if (unlikely(inode_unhashed(inode))) { 1156 1135 iput(inode); ··· 1190 1165 */ 1191 1166 spin_unlock(&inode_hash_lock); 1192 1167 destroy_inode(inode); 1168 + if (IS_ERR(old)) 1169 + return NULL; 1193 1170 inode = old; 1194 1171 wait_on_inode(inode); 1195 1172 if (unlikely(inode_unhashed(inode))) { ··· 1309 1282 inode = find_inode(sb, head, test, data); 1310 1283 spin_unlock(&inode_hash_lock); 1311 1284 1312 - return inode; 1285 + return IS_ERR(inode) ? NULL : inode; 1313 1286 } 1314 1287 EXPORT_SYMBOL(ilookup5_nowait); 1315 1288 ··· 1365 1338 spin_unlock(&inode_hash_lock); 1366 1339 1367 1340 if (inode) { 1341 + if (IS_ERR(inode)) 1342 + return NULL; 1368 1343 wait_on_inode(inode); 1369 1344 if (unlikely(inode_unhashed(inode))) { 1370 1345 iput(inode); ··· 1450 1421 } 1451 1422 if (likely(!old)) { 1452 1423 spin_lock(&inode->i_lock); 1453 - inode->i_state |= I_NEW; 1424 + inode->i_state |= I_NEW | I_CREATING; 1454 1425 hlist_add_head(&inode->i_hash, head); 1455 1426 spin_unlock(&inode->i_lock); 1456 1427 spin_unlock(&inode_hash_lock); 1457 1428 return 0; 1429 + } 1430 + if (unlikely(old->i_state & I_CREATING)) { 1431 + spin_unlock(&old->i_lock); 1432 + spin_unlock(&inode_hash_lock); 1433 + return -EBUSY; 1458 1434 } 1459 1435 __iget(old); 1460 1436 spin_unlock(&old->i_lock); ··· 1477 1443 int insert_inode_locked4(struct inode *inode, unsigned long hashval, 1478 1444 int (*test)(struct inode *, void *), void *data) 1479 1445 { 1480 - struct inode *old = inode_insert5(inode, hashval, test, NULL, data); 1446 + struct inode *old; 1447 + 1448 + inode->i_state |= I_CREATING; 
1449 + old = inode_insert5(inode, hashval, test, NULL, data); 1481 1450 1482 1451 if (old != inode) { 1483 1452 iput(old);
+5 -1
include/linux/fs.h
··· 2016 2016 * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper 2017 2017 * and work dirs among overlayfs mounts. 2018 2018 * 2019 + * I_CREATING New object's inode in the middle of setting up. 2020 + * 2019 2021 * Q: What is the difference between I_WILL_FREE and I_FREEING? 2020 2022 */ 2021 2023 #define I_DIRTY_SYNC (1 << 0) ··· 2038 2036 #define __I_DIRTY_TIME_EXPIRED 12 2039 2037 #define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) 2040 2038 #define I_WB_SWITCH (1 << 13) 2041 - #define I_OVL_INUSE (1 << 14) 2039 + #define I_OVL_INUSE (1 << 14) 2040 + #define I_CREATING (1 << 15) 2042 2041 2043 2042 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) 2044 2043 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) ··· 2922 2919 static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; 2923 2920 #endif 2924 2921 extern void unlock_new_inode(struct inode *); 2922 + extern void discard_new_inode(struct inode *); 2925 2923 extern unsigned int get_next_ino(void); 2926 2924 extern void evict_inodes(struct super_block *sb); 2927 2925