Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'iversion-v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux

Pull i_version updates from Jeff Layton:
"This overhauls how we handle i_version queries from nfsd.

Instead of having special routines and grabbing the i_version field
directly out of the inode in some cases, we've moved most of the
handling into the various filesystems' getattr operations. As a bonus,
this makes ceph's change attribute usable by knfsd as well.

This should pave the way for future work to make this value queryable
by userland, and to make it more resilient against rolling back on a
crash"

* tag 'iversion-v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux:
nfsd: remove fetch_iversion export operation
nfsd: use the getattr operation to fetch i_version
nfsd: move nfsd4_change_attribute to nfsfh.c
ceph: report the inode version in getattr if requested
nfs: report the inode version in getattr if requested
vfs: plumb i_version handling into struct kstat
fs: clarify when the i_version counter must be updated
fs: uninline inode_query_iversion

+157 -87
+11 -5
fs/ceph/inode.c
··· 2417 2417 { 2418 2418 int mask = 0; 2419 2419 2420 - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) 2420 + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) 2421 2421 mask |= CEPH_CAP_AUTH_SHARED; 2422 2422 2423 - if (want & (STATX_NLINK|STATX_CTIME)) { 2423 + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { 2424 2424 /* 2425 2425 * The link count for directories depends on inode->i_subdirs, 2426 2426 * and that is only updated when Fs caps are held. ··· 2431 2431 mask |= CEPH_CAP_LINK_SHARED; 2432 2432 } 2433 2433 2434 - if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| 2435 - STATX_BLOCKS)) 2434 + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) 2436 2435 mask |= CEPH_CAP_FILE_SHARED; 2437 2436 2438 - if (want & (STATX_CTIME)) 2437 + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) 2439 2438 mask |= CEPH_CAP_XATTR_SHARED; 2440 2439 2441 2440 return mask; ··· 2475 2476 if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) { 2476 2477 stat->btime = ci->i_btime; 2477 2478 valid_mask |= STATX_BTIME; 2479 + } 2480 + 2481 + if (request_mask & STATX_CHANGE_COOKIE) { 2482 + stat->change_cookie = inode_peek_iversion_raw(inode); 2483 + valid_mask |= STATX_CHANGE_COOKIE; 2478 2484 } 2479 2485 2480 2486 if (ceph_snap(inode) == CEPH_NOSNAP) ··· 2523 2519 stat->nlink = 1 + 1 + ci->i_subdirs; 2524 2520 } 2525 2521 2522 + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; 2523 + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; 2526 2524 stat->result_mask = request_mask & valid_mask; 2527 2525 return err; 2528 2526 }
+36
fs/libfs.c
··· 1582 1582 return true; 1583 1583 } 1584 1584 EXPORT_SYMBOL(inode_maybe_inc_iversion); 1585 + 1586 + /** 1587 + * inode_query_iversion - read i_version for later use 1588 + * @inode: inode from which i_version should be read 1589 + * 1590 + * Read the inode i_version counter. This should be used by callers that wish 1591 + * to store the returned i_version for later comparison. This will guarantee 1592 + * that a later query of the i_version will result in a different value if 1593 + * anything has changed. 1594 + * 1595 + * In this implementation, we fetch the current value, set the QUERIED flag and 1596 + * then try to swap it into place with a cmpxchg, if it wasn't already set. If 1597 + * that fails, we try again with the newly fetched value from the cmpxchg. 1598 + */ 1599 + u64 inode_query_iversion(struct inode *inode) 1600 + { 1601 + u64 cur, new; 1602 + 1603 + cur = inode_peek_iversion_raw(inode); 1604 + do { 1605 + /* If flag is already set, then no need to swap */ 1606 + if (cur & I_VERSION_QUERIED) { 1607 + /* 1608 + * This barrier (and the implicit barrier in the 1609 + * cmpxchg below) pairs with the barrier in 1610 + * inode_maybe_inc_iversion(). 1611 + */ 1612 + smp_mb(); 1613 + break; 1614 + } 1615 + 1616 + new = cur | I_VERSION_QUERIED; 1617 + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); 1618 + return cur >> I_VERSION_QUERIED_SHIFT; 1619 + } 1620 + EXPORT_SYMBOL(inode_query_iversion);
-7
fs/nfs/export.c
··· 145 145 return parent; 146 146 } 147 147 148 - static u64 nfs_fetch_iversion(struct inode *inode) 149 - { 150 - nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); 151 - return inode_peek_iversion_raw(inode); 152 - } 153 - 154 148 const struct export_operations nfs_export_ops = { 155 149 .encode_fh = nfs_encode_fh, 156 150 .fh_to_dentry = nfs_fh_to_dentry, 157 151 .get_parent = nfs_get_parent, 158 - .fetch_iversion = nfs_fetch_iversion, 159 152 .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| 160 153 EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| 161 154 EXPORT_OP_NOATOMIC_ATTR,
+12 -4
fs/nfs/inode.c
··· 825 825 reply_mask |= STATX_UID | STATX_GID; 826 826 if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) 827 827 reply_mask |= STATX_BLOCKS; 828 + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) 829 + reply_mask |= STATX_CHANGE_COOKIE; 828 830 return reply_mask; 829 831 } 830 832 ··· 845 843 846 844 request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | 847 845 STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | 848 - STATX_INO | STATX_SIZE | STATX_BLOCKS; 846 + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | 847 + STATX_CHANGE_COOKIE; 849 848 850 849 if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { 851 850 if (readdirplus_enabled) ··· 854 851 goto out_no_revalidate; 855 852 } 856 853 857 - /* Flush out writes to the server in order to update c/mtime. */ 858 - if ((request_mask & (STATX_CTIME | STATX_MTIME)) && 854 + /* Flush out writes to the server in order to update c/mtime/version. */ 855 + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && 859 856 S_ISREG(inode->i_mode)) 860 857 filemap_write_and_wait(inode->i_mapping); 861 858 ··· 875 872 /* Is the user requesting attributes that might need revalidation? */ 876 873 if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| 877 874 STATX_MTIME|STATX_UID|STATX_GID| 878 - STATX_SIZE|STATX_BLOCKS))) 875 + STATX_SIZE|STATX_BLOCKS| 876 + STATX_CHANGE_COOKIE))) 879 877 goto out_no_revalidate; 880 878 881 879 /* Check whether the cached attributes are stale */ ··· 914 910 915 911 generic_fillattr(&init_user_ns, inode, stat); 916 912 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 913 + stat->change_cookie = inode_peek_iversion_raw(inode); 914 + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; 915 + if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) 916 + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; 917 917 if (S_ISDIR(inode->i_mode)) 918 918 stat->blksize = NFS_SERVER(inode)->dtsize; 919 919 out:
+3 -1
fs/nfsd/nfs4xdr.c
··· 2965 2965 goto out; 2966 2966 } 2967 2967 2968 - err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); 2968 + err = vfs_getattr(&path, &stat, 2969 + STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, 2970 + AT_STATX_SYNC_AS_STAT); 2969 2971 if (err) 2970 2972 goto out_nfserr; 2971 2973 if (!(stat.result_mask & STATX_BTIME))
+42
fs/nfsd/nfsfh.c
··· 628 628 stat.mtime = inode->i_mtime; 629 629 stat.ctime = inode->i_ctime; 630 630 stat.size = inode->i_size; 631 + if (v4 && IS_I_VERSION(inode)) { 632 + stat.change_cookie = inode_query_iversion(inode); 633 + stat.result_mask |= STATX_CHANGE_COOKIE; 634 + } 631 635 } 632 636 if (v4) 633 637 fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); ··· 663 659 if (err) { 664 660 fhp->fh_post_saved = false; 665 661 fhp->fh_post_attr.ctime = inode->i_ctime; 662 + if (v4 && IS_I_VERSION(inode)) { 663 + fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); 664 + fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; 665 + } 666 666 } else 667 667 fhp->fh_post_saved = true; 668 668 if (v4) ··· 755 747 if (fhp->fh_export->ex_uuid) 756 748 return FSIDSOURCE_UUID; 757 749 return FSIDSOURCE_DEV; 750 + } 751 + 752 + /* 753 + * We could use i_version alone as the change attribute. However, i_version 754 + * can go backwards on a regular file after an unclean shutdown. On its own 755 + * that doesn't necessarily cause a problem, but if i_version goes backwards 756 + * and then is incremented again it could reuse a value that was previously 757 + * used before boot, and a client who queried the two values might incorrectly 758 + * assume nothing changed. 759 + * 760 + * By using both ctime and the i_version counter we guarantee that as long as 761 + * time doesn't go backwards we never reuse an old value. If the filesystem 762 + * advertises STATX_ATTR_CHANGE_MONOTONIC, then this mitigation is not 763 + * needed. 764 + * 765 + * We only need to do this for regular files as well. For directories, we 766 + * assume that the new change attr is always logged to stable storage in some 767 + * fashion before the results can be seen. 768 + */ 769 + u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) 770 + { 771 + u64 chattr; 772 + 773 + if (stat->result_mask & STATX_CHANGE_COOKIE) { 774 + chattr = stat->change_cookie; 775 + if (S_ISREG(inode->i_mode) && 776 + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { 777 + chattr += (u64)stat->ctime.tv_sec << 30; 778 + chattr += stat->ctime.tv_nsec; 779 + } 780 + } else { 781 + chattr = time_to_chattr(&stat->ctime); 782 + } 783 + return chattr; 758 784 }
+1 -28
fs/nfsd/nfsfh.h
··· 293 293 fhp->fh_pre_saved = false; 294 294 } 295 295 296 - /* 297 - * We could use i_version alone as the change attribute. However, 298 - * i_version can go backwards after a reboot. On its own that doesn't 299 - * necessarily cause a problem, but if i_version goes backwards and then 300 - * is incremented again it could reuse a value that was previously used 301 - * before boot, and a client who queried the two values might 302 - * incorrectly assume nothing changed. 303 - * 304 - * By using both ctime and the i_version counter we guarantee that as 305 - * long as time doesn't go backwards we never reuse an old value. 306 - */ 307 - static inline u64 nfsd4_change_attribute(struct kstat *stat, 308 - struct inode *inode) 309 - { 310 - if (inode->i_sb->s_export_op->fetch_iversion) 311 - return inode->i_sb->s_export_op->fetch_iversion(inode); 312 - else if (IS_I_VERSION(inode)) { 313 - u64 chattr; 314 - 315 - chattr = stat->ctime.tv_sec; 316 - chattr <<= 30; 317 - chattr += stat->ctime.tv_nsec; 318 - chattr += inode_query_iversion(inode); 319 - return chattr; 320 - } else 321 - return time_to_chattr(&stat->ctime); 322 - } 323 - 296 + u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); 324 297 extern void fh_fill_pre_attrs(struct svc_fh *fhp); 325 298 extern void fh_fill_post_attrs(struct svc_fh *fhp); 326 299 extern void fh_fill_both_attrs(struct svc_fh *fhp);
+6 -1
fs/nfsd/vfs.h
··· 170 170 171 171 static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) 172 172 { 173 + u32 request_mask = STATX_BASIC_STATS; 173 174 struct path p = {.mnt = fh->fh_export->ex_path.mnt, 174 175 .dentry = fh->fh_dentry}; 175 - return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, 176 + 177 + if (fh->fh_maxsize == NFS4_FHSIZE) 178 + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); 179 + 180 + return nfserrno(vfs_getattr(&p, stat, request_mask, 176 181 AT_STATX_SYNC_AS_STAT)); 177 182 } 178 183
+15 -2
fs/stat.c
··· 18 18 #include <linux/syscalls.h> 19 19 #include <linux/pagemap.h> 20 20 #include <linux/compat.h> 21 + #include <linux/iversion.h> 21 22 22 23 #include <linux/uaccess.h> 23 24 #include <asm/unistd.h> ··· 122 121 123 122 stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | 124 123 STATX_ATTR_DAX); 124 + 125 + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { 126 + stat->result_mask |= STATX_CHANGE_COOKIE; 127 + stat->change_cookie = inode_query_iversion(inode); 128 + } 125 129 126 130 mnt_userns = mnt_user_ns(path->mnt); 127 131 if (inode->i_op->getattr) ··· 608 602 609 603 memset(&tmp, 0, sizeof(tmp)); 610 604 611 - tmp.stx_mask = stat->result_mask; 605 + /* STATX_CHANGE_COOKIE is kernel-only for now */ 606 + tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; 612 607 tmp.stx_blksize = stat->blksize; 613 - tmp.stx_attributes = stat->attributes; 608 + /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ 609 + tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; 614 610 tmp.stx_nlink = stat->nlink; 615 611 tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid); 616 612 tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); ··· 650 642 return -EINVAL; 651 643 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) 652 644 return -EINVAL; 645 + 646 + /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests 647 + * from userland. 648 + */ 649 + mask &= ~STATX_CHANGE_COOKIE; 653 650 654 651 error = vfs_statx(dfd, filename, flags, &stat, mask); 655 652 if (error)
-1
include/linux/exportfs.h
··· 213 213 bool write, u32 *device_generation); 214 214 int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, 215 215 int nr_iomaps, struct iattr *iattr); 216 - u64 (*fetch_iversion)(struct inode *); 217 216 #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ 218 217 #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ 219 218 #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */
+22 -38
include/linux/iversion.h
··· 9 9 * --------------------------- 10 10 * The change attribute (i_version) is mandated by NFSv4 and is mostly for 11 11 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must 12 - * appear different to observers if there was a change to the inode's data or 13 - * metadata since it was last queried. 12 + * appear larger to observers if there was an explicit change to the inode's 13 + * data or metadata since it was last queried. 14 + * 15 + * An explicit change is one that would ordinarily result in a change to the 16 + * inode status change time (aka ctime). i_version must appear to change, even 17 + * if the ctime does not (since the whole point is to avoid missing updates due 18 + * to timestamp granularity). If POSIX or other relevant spec mandates that the 19 + * ctime must change due to an operation, then the i_version counter must be 20 + * incremented as well. 21 + * 22 + * Making the i_version update completely atomic with the operation itself would 23 + * be prohibitively expensive. Traditionally the kernel has updated the times on 24 + * directories after an operation that changes its contents. For regular files, 25 + * the ctime is usually updated before the data is copied into the cache for a 26 + * write. This means that there is a window of time when an observer can 27 + * associate a new timestamp with old file contents. Since the purpose of the 28 + * i_version is to allow for better cache coherency, the i_version must always 29 + * be updated after the results of the operation are visible. Updating it before 30 + * and after a change is also permitted. (Note that no filesystems currently do 31 + * this. Fixing that is a work-in-progress). 14 32 * 15 33 * Observers see the i_version as a 64-bit number that never decreases. If it 16 34 * remains the same since it was last checked, then nothing has changed in the ··· 252 234 return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; 253 235 } 254 236 255 - /** 256 - * inode_query_iversion - read i_version for later use 257 - * @inode: inode from which i_version should be read 258 - * 259 - * Read the inode i_version counter. This should be used by callers that wish 260 - * to store the returned i_version for later comparison. This will guarantee 261 - * that a later query of the i_version will result in a different value if 262 - * anything has changed. 263 - * 264 - * In this implementation, we fetch the current value, set the QUERIED flag and 265 - * then try to swap it into place with a cmpxchg, if it wasn't already set. If 266 - * that fails, we try again with the newly fetched value from the cmpxchg. 267 - */ 268 - static inline u64 269 - inode_query_iversion(struct inode *inode) 270 - { 271 - u64 cur, new; 272 - 273 - cur = inode_peek_iversion_raw(inode); 274 - do { 275 - /* If flag is already set, then no need to swap */ 276 - if (cur & I_VERSION_QUERIED) { 277 - /* 278 - * This barrier (and the implicit barrier in the 279 - * cmpxchg below) pairs with the barrier in 280 - * inode_maybe_inc_iversion(). 281 - */ 282 - smp_mb(); 283 - break; 284 - } 285 - 286 - new = cur | I_VERSION_QUERIED; 287 - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); 288 - return cur >> I_VERSION_QUERIED_SHIFT; 289 - } 290 - 291 237 /* 292 238 * For filesystems without any sort of change attribute, the best we can 293 239 * do is fake one up from the ctime: ··· 264 282 chattr += t->tv_nsec; 265 283 return chattr; 266 284 } 285 + 286 + u64 inode_query_iversion(struct inode *inode); 267 287 268 288 /** 269 289 * inode_eq_iversion_raw - check whether the raw i_version counter has changed
+9
include/linux/stat.h
··· 52 52 u64 mnt_id; 53 53 u32 dio_mem_align; 54 54 u32 dio_offset_align; 55 + u64 change_cookie; 55 56 }; 57 + 58 + /* These definitions are internal to the kernel for now. Mainly used by nfsd. */ 59 + 60 + /* mask values */ 61 + #define STATX_CHANGE_COOKIE 0x40000000U /* Want/got stx_change_attr */ 62 + 63 + /* file attribute values */ 64 + #define STATX_ATTR_CHANGE_MONOTONIC 0x8000000000000000ULL /* version monotonically increases */ 56 65 57 66 #endif