Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vfs: kill FS_REVAL_DOT by adding a d_weak_revalidate dentry op

The following set of operations on an NFS client and server will cause an ESTALE error:

server# mkdir a
client# cd a
server# mv a a.bak
client# sleep 30 # (or whatever the dir attrcache timeout is)
client# stat .
stat: cannot stat `.': Stale NFS file handle

Obviously, we should not be getting an ESTALE error back there since the
inode still exists on the server. The problem is that the lookup code
will call d_revalidate on the dentry that "." refers to, because NFS has
FS_REVAL_DOT set.

nfs_lookup_revalidate will see that the parent directory has changed and
will try to reverify the dentry by redoing a LOOKUP. That of course
fails, so the lookup code returns ESTALE.

The problem here is that d_revalidate is really a bad fit for this case.
What we really want to know at this point is whether the inode is still
good or not, but we don't really care what name it goes by or whether
the dcache is still valid.

Add a new d_op->d_weak_revalidate operation and have complete_walk call
that instead of d_revalidate. The intent there is to allow for a
"weaker" d_revalidate that just checks to see whether the inode is still
good. This also gives us an opportunity to kill off the FS_REVAL_DOT
special casing.

[AV: changed method name, added note in porting, fixed confusion re
having it possibly called from RCU mode (it won't be)]

Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

Jeff Layton and committed by
Al Viro
ecf3d1f1 4f4a4fad

+84 -16
+2
Documentation/filesystems/Locking
··· 10 10 --------------------------- dentry_operations -------------------------- 11 11 prototypes: 12 12 int (*d_revalidate)(struct dentry *, unsigned int); 13 + int (*d_weak_revalidate)(struct dentry *, unsigned int); 13 14 int (*d_hash)(const struct dentry *, const struct inode *, 14 15 struct qstr *); 15 16 int (*d_compare)(const struct dentry *, const struct inode *, ··· 26 25 locking rules: 27 26 rename_lock ->d_lock may block rcu-walk 28 27 d_revalidate: no no yes (ref-walk) maybe 28 + d_weak_revalidate:no no yes no 29 29 d_hash no no no maybe 30 30 d_compare: yes no no maybe 31 31 d_delete: no yes no no
+4
Documentation/filesystems/porting
··· 441 441 two, it gets "is it an O_EXCL or equivalent?" boolean argument. Note that 442 442 local filesystems can ignore tha argument - they are guaranteed that the 443 443 object doesn't exist. It's remote/distributed ones that might care... 444 + -- 445 + [mandatory] 446 + FS_REVAL_DOT is gone; if you used to have it, add ->d_weak_revalidate() 447 + in your dentry operations instead.
+22 -2
Documentation/filesystems/vfs.txt
··· 900 900 901 901 struct dentry_operations { 902 902 int (*d_revalidate)(struct dentry *, unsigned int); 903 + int (*d_weak_revalidate)(struct dentry *, unsigned int); 903 904 int (*d_hash)(const struct dentry *, const struct inode *, 904 905 struct qstr *); 905 906 int (*d_compare)(const struct dentry *, const struct inode *, ··· 916 915 917 916 d_revalidate: called when the VFS needs to revalidate a dentry. This 918 917 is called whenever a name look-up finds a dentry in the 919 - dcache. Most filesystems leave this as NULL, because all their 920 - dentries in the dcache are valid 918 + dcache. Most local filesystems leave this as NULL, because all their 919 + dentries in the dcache are valid. Network filesystems are different 920 + since things can change on the server without the client necessarily 921 + being aware of it. 922 + 923 + This function should return a positive value if the dentry is still 924 + valid, and zero or a negative error code if it isn't. 921 925 922 926 d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU). 923 927 If in rcu-walk mode, the filesystem must revalidate the dentry without ··· 932 926 933 927 If a situation is encountered that rcu-walk cannot handle, return 934 928 -ECHILD and it will be called again in ref-walk mode. 929 + 930 + d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry. 931 + This is called when a path-walk ends at dentry that was not acquired by 932 + doing a lookup in the parent directory. This includes "/", "." and "..", 933 + as well as procfs-style symlinks and mountpoint traversal. 934 + 935 + In this case, we are less concerned with whether the dentry is still 936 + fully correct, but rather that the inode is still valid. As with 937 + d_revalidate, most local filesystems will set this to NULL since their 938 + dcache entries are always valid. 939 + 940 + This function has the same return code semantics as d_revalidate. 
941 + 942 + d_weak_revalidate is only called after leaving rcu-walk mode. 935 943 936 944 d_hash: called when the VFS adds a dentry to the hash table. The first 937 945 dentry passed to d_hash is the parent directory that the name is
+1
fs/9p/vfs_dentry.c
··· 137 137 138 138 const struct dentry_operations v9fs_cached_dentry_operations = { 139 139 .d_revalidate = v9fs_lookup_revalidate, 140 + .d_weak_revalidate = v9fs_lookup_revalidate, 140 141 .d_delete = v9fs_cached_dentry_delete, 141 142 .d_release = v9fs_dentry_release, 142 143 };
+1 -1
fs/9p/vfs_super.c
··· 363 363 .mount = v9fs_mount, 364 364 .kill_sb = v9fs_kill_super, 365 365 .owner = THIS_MODULE, 366 - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT, 366 + .fs_flags = FS_RENAME_DOES_D_MOVE, 367 367 };
+3
fs/dcache.c
··· 1358 1358 WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | 1359 1359 DCACHE_OP_COMPARE | 1360 1360 DCACHE_OP_REVALIDATE | 1361 + DCACHE_OP_WEAK_REVALIDATE | 1361 1362 DCACHE_OP_DELETE )); 1362 1363 dentry->d_op = op; 1363 1364 if (!op) ··· 1369 1368 dentry->d_flags |= DCACHE_OP_COMPARE; 1370 1369 if (op->d_revalidate) 1371 1370 dentry->d_flags |= DCACHE_OP_REVALIDATE; 1371 + if (op->d_weak_revalidate) 1372 + dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; 1372 1373 if (op->d_delete) 1373 1374 dentry->d_flags |= DCACHE_OP_DELETE; 1374 1375 if (op->d_prune)
+2 -6
fs/namei.c
··· 600 600 if (likely(!(nd->flags & LOOKUP_JUMPED))) 601 601 return 0; 602 602 603 - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) 603 + if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) 604 604 return 0; 605 605 606 - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) 607 - return 0; 608 - 609 - /* Note: we do not d_invalidate() */ 610 - status = d_revalidate(dentry, nd->flags); 606 + status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); 611 607 if (status > 0) 612 608 return 0; 613 609
+40
fs/nfs/dir.c
··· 1136 1136 } 1137 1137 1138 1138 /* 1139 + * A weaker form of d_revalidate for revalidating just the dentry->d_inode 1140 + * when we don't really care about the dentry name. This is called when a 1141 + * pathwalk ends on a dentry that was not found via a normal lookup in the 1142 + * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). 1143 + * 1144 + * In this situation, we just want to verify that the inode itself is OK 1145 + * since the dentry might have changed on the server. 1146 + */ 1147 + static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) 1148 + { 1149 + int error; 1150 + struct inode *inode = dentry->d_inode; 1151 + 1152 + /* 1153 + * I believe we can only get a negative dentry here in the case of a 1154 + * procfs-style symlink. Just assume it's correct for now, but we may 1155 + * eventually need to do something more here. 1156 + */ 1157 + if (!inode) { 1158 + dfprintk(LOOKUPCACHE, "%s: %s/%s has negative inode\n", 1159 + __func__, dentry->d_parent->d_name.name, 1160 + dentry->d_name.name); 1161 + return 1; 1162 + } 1163 + 1164 + if (is_bad_inode(inode)) { 1165 + dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", 1166 + __func__, dentry->d_parent->d_name.name, 1167 + dentry->d_name.name); 1168 + return 0; 1169 + } 1170 + 1171 + error = nfs_revalidate_inode(NFS_SERVER(inode), inode); 1172 + dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", 1173 + __func__, inode->i_ino, error ? "invalid" : "valid"); 1174 + return !error; 1175 + } 1176 + 1177 + /* 1139 1178 * This is called from dput() when d_count is going to 0. 1140 1179 */ 1141 1180 static int nfs_dentry_delete(const struct dentry *dentry) ··· 1241 1202 1242 1203 const struct dentry_operations nfs_dentry_operations = { 1243 1204 .d_revalidate = nfs_lookup_revalidate, 1205 + .d_weak_revalidate = nfs_weak_revalidate, 1244 1206 .d_delete = nfs_dentry_delete, 1245 1207 .d_iput = nfs_dentry_iput, 1246 1208 .d_automount = nfs_d_automount,
+3 -3
fs/nfs/nfs4super.c
··· 28 28 .name = "nfs4", 29 29 .mount = nfs4_remote_mount, 30 30 .kill_sb = nfs_kill_super, 31 - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 31 + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, 32 32 }; 33 33 34 34 static struct file_system_type nfs4_remote_referral_fs_type = { ··· 36 36 .name = "nfs4", 37 37 .mount = nfs4_remote_referral_mount, 38 38 .kill_sb = nfs_kill_super, 39 - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 39 + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, 40 40 }; 41 41 42 42 struct file_system_type nfs4_referral_fs_type = { ··· 44 44 .name = "nfs4", 45 45 .mount = nfs4_referral_mount, 46 46 .kill_sb = nfs_kill_super, 47 - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 47 + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, 48 48 }; 49 49 50 50 static const struct super_operations nfs4_sops = {
+3 -3
fs/nfs/super.c
··· 292 292 .name = "nfs", 293 293 .mount = nfs_fs_mount, 294 294 .kill_sb = nfs_kill_super, 295 - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 295 + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, 296 296 }; 297 297 EXPORT_SYMBOL_GPL(nfs_fs_type); 298 298 ··· 301 301 .name = "nfs", 302 302 .mount = nfs_xdev_mount, 303 303 .kill_sb = nfs_kill_super, 304 - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 304 + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, 305 305 }; 306 306 307 307 const struct super_operations nfs_sops = { ··· 331 331 .name = "nfs4", 332 332 .mount = nfs_fs_mount, 333 333 .kill_sb = nfs_kill_super, 334 - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 334 + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, 335 335 }; 336 336 EXPORT_SYMBOL_GPL(nfs4_fs_type); 337 337
+3
include/linux/dcache.h
··· 145 145 146 146 struct dentry_operations { 147 147 int (*d_revalidate)(struct dentry *, unsigned int); 148 + int (*d_weak_revalidate)(struct dentry *, unsigned int); 148 149 int (*d_hash)(const struct dentry *, const struct inode *, 149 150 struct qstr *); 150 151 int (*d_compare)(const struct dentry *, const struct inode *, ··· 192 191 #define DCACHE_CANT_MOUNT 0x0100 193 192 #define DCACHE_GENOCIDE 0x0200 194 193 #define DCACHE_SHRINK_LIST 0x0400 194 + 195 + #define DCACHE_OP_WEAK_REVALIDATE 0x0800 195 196 196 197 #define DCACHE_NFSFS_RENAMED 0x1000 197 198 /* this dentry has been "silly renamed" and has to be deleted on the last
-1
include/linux/fs.h
··· 1807 1807 #define FS_HAS_SUBTYPE 4 1808 1808 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ 1809 1809 #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ 1810 - #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ 1811 1810 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ 1812 1811 struct dentry *(*mount) (struct file_system_type *, int, 1813 1812 const char *, void *);