Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

NFS: fix the handling of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping

There is a possible race in how the nfs_invalidate_mapping function is
handled. Currently, we go and invalidate the pages in the file and then
clear NFS_INO_INVALID_DATA.

The problem is that it's possible for a stale page to creep into the
mapping after the page was invalidated (i.e., via readahead). If another
writer comes along and sets the flag after that happens but before
invalidate_inode_pages2 returns then we could clear the flag
without the cache having been properly invalidated.

So, we must clear the flag first and then invalidate the pages. Doing
this however, opens another race:

It's possible to have two concurrent read() calls that end up in
nfs_revalidate_mapping at the same time. The first one clears the
NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.

Just before calling that though, the other task races in, checks the
flag and finds it cleared. At that point, it trusts that the mapping is
good and gets the lock on the page, allowing the read() to be satisfied
from the cache even though the data is no longer valid.

These effects are easily manifested by running diotest3 from the LTP
test suite on NFS. That program does a series of DIO writes and buffered
reads. The operations are serialized and page-aligned but the existing
code fails the test since it occasionally allows a read to come out of
the cache incorrectly. While mixing direct and buffered I/O isn't
recommended, I believe it's possible to hit this in other ways that just
use buffered I/O, though that situation is much harder to reproduce.

The problem is that the checking/clearing of that flag and the
invalidation of the mapping really need to be atomic. Fix this by
serializing concurrent invalidations with a bitlock.

At the same time, we also need to allow other places that check
NFS_INO_INVALID_DATA to check whether we might be in the middle of
invalidating the file, so fix up a couple of places that do that
to look for the new NFS_INO_INVALIDATING flag.

Doing this requires us to be careful not to set the bitlock
unnecessarily, so this code only does that if it believes it will
be doing an invalidation.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>

authored by

Jeff Layton and committed by
Trond Myklebust
d529ef83 7dd7d959

+47 -6
+2 -1
fs/nfs/dir.c
··· 288 288 289 289 new_pos = desc->current_index + i; 290 290 if (ctx->attr_gencount != nfsi->attr_gencount 291 - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 291 + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) 292 + || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) { 292 293 ctx->duped = 0; 293 294 ctx->attr_gencount = nfsi->attr_gencount; 294 295 } else if (new_pos < desc->ctx->pos) {
+38 -4
fs/nfs/inode.c
··· 977 977 if (ret < 0) 978 978 return ret; 979 979 } 980 - spin_lock(&inode->i_lock); 981 - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 982 - if (S_ISDIR(inode->i_mode)) 980 + if (S_ISDIR(inode->i_mode)) { 981 + spin_lock(&inode->i_lock); 983 982 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 984 - spin_unlock(&inode->i_lock); 983 + spin_unlock(&inode->i_lock); 984 + } 985 985 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 986 986 nfs_fscache_wait_on_invalidate(inode); 987 987 ··· 1008 1008 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 1009 1009 { 1010 1010 struct nfs_inode *nfsi = NFS_I(inode); 1011 + unsigned long *bitlock = &nfsi->flags; 1011 1012 int ret = 0; 1012 1013 1013 1014 /* swapfiles are not supposed to be shared. */ ··· 1020 1019 if (ret < 0) 1021 1020 goto out; 1022 1021 } 1022 + 1023 + /* 1024 + * We must clear NFS_INO_INVALID_DATA first to ensure that 1025 + * invalidations that come in while we're shooting down the mappings 1026 + * are respected. But, that leaves a race window where one revalidator 1027 + * can clear the flag, and then another checks it before the mapping 1028 + * gets invalidated. Fix that by serializing access to this part of 1029 + * the function. 1030 + * 1031 + * At the same time, we need to allow other tasks to see whether we 1032 + * might be in the middle of invalidating the pages, so we only set 1033 + * the bit lock here if it looks like we're going to be doing that. 1034 + */ 1035 + for (;;) { 1036 + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, 1037 + nfs_wait_bit_killable, TASK_KILLABLE); 1038 + if (ret) 1039 + goto out; 1040 + if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA)) 1041 + goto out; 1042 + if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock)) 1043 + break; 1044 + } 1045 + 1046 + spin_lock(&inode->i_lock); 1023 1047 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { 1048 + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 1049 + spin_unlock(&inode->i_lock); 1024 1050 trace_nfs_invalidate_mapping_enter(inode); 1025 1051 ret = nfs_invalidate_mapping(inode, mapping); 1026 1052 trace_nfs_invalidate_mapping_exit(inode, ret); 1053 + } else { 1054 + /* something raced in and cleared the flag */ 1055 + spin_unlock(&inode->i_lock); 1027 1056 } 1028 1057 1058 + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); 1059 + smp_mb__after_clear_bit(); 1060 + wake_up_bit(bitlock, NFS_INO_INVALIDATING); 1029 1061 out: 1030 1062 return ret; 1031 1063 }
+1
fs/nfs/nfstrace.h
··· 36 36 __print_flags(v, "|", \ 37 37 { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ 38 38 { 1 << NFS_INO_STALE, "STALE" }, \ 39 + { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ 39 40 { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ 40 41 { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ 41 42 { 1 << NFS_INO_COMMIT, "COMMIT" }, \
+5 -1
fs/nfs/write.c
··· 909 909 */ 910 910 static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) 911 911 { 912 + struct nfs_inode *nfsi = NFS_I(inode); 913 + 912 914 if (nfs_have_delegated_attributes(inode)) 913 915 goto out; 914 - if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) 916 + if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) 917 + return false; 918 + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) 915 919 return false; 916 920 out: 917 921 return PageUptodate(page) != 0;
+1
include/linux/nfs_fs.h
··· 215 215 #define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ 216 216 #define NFS_INO_STALE (1) /* possible stale inode */ 217 217 #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ 218 + #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ 218 219 #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ 219 220 #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ 220 221 #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */