Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nfsd: don't hold i_mutex over userspace upcalls

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir. If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about. We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd. But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex. So we could just drop the i_mutex elsewhere and do
something like

mutex_lock()
lookup_one_len()
mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

authored by

NeilBrown and committed by
Al Viro
bbddca8e db39c167

+86 -19
+71
fs/namei.c
··· 2272 2272 * 2273 2273 * Note that this routine is purely a helper for filesystem usage and should 2274 2274 * not be called by generic code. 2275 + * 2276 + * The caller must hold base->i_mutex. 2275 2277 */ 2276 2278 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 2277 2279 { ··· 2316 2314 return __lookup_hash(&this, base, 0); 2317 2315 } 2318 2316 EXPORT_SYMBOL(lookup_one_len); 2317 + 2318 + /** 2319 + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component 2320 + * @name: pathname component to lookup 2321 + * @base: base directory to lookup from 2322 + * @len: maximum length @len should be interpreted to 2323 + * 2324 + * Note that this routine is purely a helper for filesystem usage and should 2325 + * not be called by generic code. 2326 + * 2327 + * Unlike lookup_one_len, it should be called without the parent 2328 + * i_mutex held, and will take the i_mutex itself if necessary. 2329 + */ 2330 + struct dentry *lookup_one_len_unlocked(const char *name, 2331 + struct dentry *base, int len) 2332 + { 2333 + struct qstr this; 2334 + unsigned int c; 2335 + int err; 2336 + struct dentry *ret; 2337 + 2338 + this.name = name; 2339 + this.len = len; 2340 + this.hash = full_name_hash(name, len); 2341 + if (!len) 2342 + return ERR_PTR(-EACCES); 2343 + 2344 + if (unlikely(name[0] == '.')) { 2345 + if (len < 2 || (len == 2 && name[1] == '.')) 2346 + return ERR_PTR(-EACCES); 2347 + } 2348 + 2349 + while (len--) { 2350 + c = *(const unsigned char *)name++; 2351 + if (c == '/' || c == '\0') 2352 + return ERR_PTR(-EACCES); 2353 + } 2354 + /* 2355 + * See if the low-level filesystem might want 2356 + * to use its own hash.. 2357 + */ 2358 + if (base->d_flags & DCACHE_OP_HASH) { 2359 + int err = base->d_op->d_hash(base, &this); 2360 + if (err < 0) 2361 + return ERR_PTR(err); 2362 + } 2363 + 2364 + err = inode_permission(base->d_inode, MAY_EXEC); 2365 + if (err) 2366 + return ERR_PTR(err); 2367 + 2368 + /* 2369 + * __d_lookup() is used to try to get a quick answer and avoid the 2370 + * mutex. A false-negative does no harm. 2371 + */ 2372 + ret = __d_lookup(base, &this); 2373 + if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) { 2374 + dput(ret); 2375 + ret = NULL; 2376 + } 2377 + if (ret) 2378 + return ret; 2379 + 2380 + mutex_lock(&base->d_inode->i_mutex); 2381 + ret = __lookup_hash(&this, base, 0); 2382 + mutex_unlock(&base->d_inode->i_mutex); 2383 + return ret; 2384 + } 2385 + EXPORT_SYMBOL(lookup_one_len_unlocked); 2319 2386 2320 2387 int user_path_at_empty(int dfd, const char __user *name, unsigned flags, 2321 2388 struct path *path, int *empty)
+1 -1
fs/nfsd/nfs3xdr.c
··· 823 823 } else 824 824 dchild = dget(dparent); 825 825 } else 826 - dchild = lookup_one_len(name, dparent, namlen); 826 + dchild = lookup_one_len_unlocked(name, dparent, namlen); 827 827 if (IS_ERR(dchild)) 828 828 return rv; 829 829 if (d_mountpoint(dchild))
+4 -4
fs/nfsd/nfs4xdr.c
··· 2838 2838 __be32 nfserr; 2839 2839 int ignore_crossmnt = 0; 2840 2840 2841 - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 2841 + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen); 2842 2842 if (IS_ERR(dentry)) 2843 2843 return nfserrno(PTR_ERR(dentry)); 2844 2844 if (d_really_is_negative(dentry)) { 2845 2845 /* 2846 - * nfsd_buffered_readdir drops the i_mutex between 2847 - * readdir and calling this callback, leaving a window 2848 - * where this directory entry could have gone away. 2846 + * we're not holding the i_mutex here, so there's 2847 + * a window where this directory entry could have gone 2848 + * away. 2849 2849 */ 2850 2850 dput(dentry); 2851 2851 return nfserr_noent;
+9 -14
fs/nfsd/vfs.c
··· 217 217 host_err = PTR_ERR(dentry); 218 218 if (IS_ERR(dentry)) 219 219 goto out_nfserr; 220 - /* 221 - * check if we have crossed a mount point ... 222 - */ 223 220 if (nfsd_mountpoint(dentry, exp)) { 221 + /* 222 + * We don't need the i_mutex after all. It's 223 + * still possible we could open this (regular 224 + * files can be mountpoints too), but the 225 + * i_mutex is just there to prevent renames of 226 + * something that we might be about to delegate, 227 + * and a mountpoint won't be renamed: 228 + */ 229 + fh_unlock(fhp); 224 230 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { 225 231 dput(dentry); 226 232 goto out_nfserr; ··· 1815 1809 offset = *offsetp; 1816 1810 1817 1811 while (1) { 1818 - struct inode *dir_inode = file_inode(file); 1819 1812 unsigned int reclen; 1820 1813 1821 1814 cdp->err = nfserr_eof; /* will be cleared on successful read */ ··· 1833 1828 if (!size) 1834 1829 break; 1835 1830 1836 - /* 1837 - * Various filldir functions may end up calling back into 1838 - * lookup_one_len() and the file system's ->lookup() method. 1839 - * These expect i_mutex to be held, as it would within readdir. 1840 - */ 1841 - host_err = mutex_lock_killable(&dir_inode->i_mutex); 1842 - if (host_err) 1843 - break; 1844 - 1845 1831 de = (struct buffered_dirent *)buf.dirent; 1846 1832 while (size > 0) { 1847 1833 offset = de->offset; ··· 1849 1853 size -= reclen; 1850 1854 de = (struct buffered_dirent *)((char *)de + reclen); 1851 1855 } 1852 - mutex_unlock(&dir_inode->i_mutex); 1853 1856 if (size > 0) /* We bailed out early */ 1854 1857 break; 1855 1858
+1
include/linux/namei.h
··· 77 77 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); 78 78 79 79 extern struct dentry *lookup_one_len(const char *, struct dentry *, int); 80 + extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); 80 81 81 82 extern int follow_down_one(struct path *); 82 83 extern int follow_down(struct path *);