Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'v6.6-rc3.vfs.ctime.revert' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull finegrained timestamp reverts from Christian Brauner:
"Earlier this week we sent a few minor fixes for the multi-grained
timestamp work in [1]. While we were polishing those up, after Linus
realized that there might be a nicer way to fix them, we received a
regression report in [2] that fine-grained timestamps break gnulib
tests and thus possibly other tools.

The kernel will elide fine-grain timestamp updates when no one is
actively querying for them to avoid performance impacts. So a sequence
like write(f1) stat(f2) write(f2) stat(f2) write(f1) stat(f1) may
result in the timestamp of f1 being older than the final f2 timestamp,
even though f1 was written to last, because the second write to f1
didn't update the timestamp.

Such plotholes can lead to subtle bugs when programs compare
timestamps. For example, the nap() function in [2] will estimate that
it needs to wait one ns on a fine-grain timestamp enabled filesystem
between subsequent calls to observe a timestamp change. But in general
we don't update timestamps with a granularity finer than one jiffy if we think that
no one is actively querying for fine-grain timestamps to avoid
performance impacts.

While discussing various fixes the decision was to go back to the
drawing board and ultimately to explore a solution that involves only
exposing such fine-grained timestamps to nfs internally and never to
userspace.

As there are multiple solutions under discussion, the honest thing to do here
is not to fix this up or disable it but to cleanly revert. The general
infrastructure will probably come back but there is no reason to keep
this code in mainline.

The general changes to timestamp handling are valid and a good cleanup
that will stay. The revert is fully bisectable"

Link: https://lore.kernel.org/all/20230918-hirte-neuzugang-4c2324e7bae3@brauner [1]
Link: https://lore.kernel.org/all/bf0524debb976627693e12ad23690094e4514303.camel@linuxfromscratch.org [2]

* tag 'v6.6-rc3.vfs.ctime.revert' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
Revert "fs: add infrastructure for multigrain timestamps"
Revert "btrfs: convert to multigrain timestamps"
Revert "ext4: switch to multigrain timestamps"
Revert "xfs: switch to multigrain timestamps"
Revert "tmpfs: add support for multigrain timestamps"

+38 -178
+20 -4
fs/btrfs/file.c
··· 1106 1106 btrfs_drew_write_unlock(&inode->root->snapshot_lock); 1107 1107 } 1108 1108 1109 + static void update_time_for_write(struct inode *inode) 1110 + { 1111 + struct timespec64 now, ctime; 1112 + 1113 + if (IS_NOCMTIME(inode)) 1114 + return; 1115 + 1116 + now = current_time(inode); 1117 + if (!timespec64_equal(&inode->i_mtime, &now)) 1118 + inode->i_mtime = now; 1119 + 1120 + ctime = inode_get_ctime(inode); 1121 + if (!timespec64_equal(&ctime, &now)) 1122 + inode_set_ctime_to_ts(inode, now); 1123 + 1124 + if (IS_I_VERSION(inode)) 1125 + inode_inc_iversion(inode); 1126 + } 1127 + 1109 1128 static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, 1110 1129 size_t count) 1111 1130 { ··· 1156 1137 * need to start yet another transaction to update the inode as we will 1157 1138 * update the inode when we finish writing whatever data we write. 1158 1139 */ 1159 - if (!IS_NOCMTIME(inode)) { 1160 - inode->i_mtime = inode_set_ctime_current(inode); 1161 - inode_inc_iversion(inode); 1162 - } 1140 + update_time_for_write(inode); 1163 1141 1164 1142 start_pos = round_down(pos, fs_info->sectorsize); 1165 1143 oldsize = i_size_read(inode);
+2 -3
fs/btrfs/super.c
··· 2150 2150 .name = "btrfs", 2151 2151 .mount = btrfs_mount, 2152 2152 .kill_sb = btrfs_kill_super, 2153 - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME, 2153 + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, 2154 2154 }; 2155 2155 2156 2156 static struct file_system_type btrfs_root_fs_type = { ··· 2158 2158 .name = "btrfs", 2159 2159 .mount = btrfs_mount_root, 2160 2160 .kill_sb = btrfs_kill_super, 2161 - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | 2162 - FS_ALLOW_IDMAP | FS_MGTIME, 2161 + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, 2163 2162 }; 2164 2163 2165 2164 MODULE_ALIAS_FS("btrfs");
+1 -1
fs/ext4/super.c
··· 7314 7314 .init_fs_context = ext4_init_fs_context, 7315 7315 .parameters = ext4_param_specs, 7316 7316 .kill_sb = ext4_kill_sb, 7317 - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, 7317 + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, 7318 7318 }; 7319 7319 MODULE_ALIAS_FS("ext4"); 7320 7320
+3 -79
fs/inode.c
··· 2102 2102 } 2103 2103 EXPORT_SYMBOL(file_remove_privs); 2104 2104 2105 - /** 2106 - * current_mgtime - Return FS time (possibly fine-grained) 2107 - * @inode: inode. 2108 - * 2109 - * Return the current time truncated to the time granularity supported by 2110 - * the fs, as suitable for a ctime/mtime change. If the ctime is flagged 2111 - * as having been QUERIED, get a fine-grained timestamp. 2112 - */ 2113 - struct timespec64 current_mgtime(struct inode *inode) 2114 - { 2115 - struct timespec64 now, ctime; 2116 - atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; 2117 - long nsec = atomic_long_read(pnsec); 2118 - 2119 - if (nsec & I_CTIME_QUERIED) { 2120 - ktime_get_real_ts64(&now); 2121 - return timestamp_truncate(now, inode); 2122 - } 2123 - 2124 - ktime_get_coarse_real_ts64(&now); 2125 - now = timestamp_truncate(now, inode); 2126 - 2127 - /* 2128 - * If we've recently fetched a fine-grained timestamp 2129 - * then the coarse-grained one may still be earlier than the 2130 - * existing ctime. Just keep the existing value if so. 
2131 - */ 2132 - ctime = inode_get_ctime(inode); 2133 - if (timespec64_compare(&ctime, &now) > 0) 2134 - now = ctime; 2135 - 2136 - return now; 2137 - } 2138 - EXPORT_SYMBOL(current_mgtime); 2139 - 2140 - static struct timespec64 current_ctime(struct inode *inode) 2141 - { 2142 - if (is_mgtime(inode)) 2143 - return current_mgtime(inode); 2144 - return current_time(inode); 2145 - } 2146 - 2147 2105 static int inode_needs_update_time(struct inode *inode) 2148 2106 { 2149 2107 int sync_it = 0; 2150 - struct timespec64 now = current_ctime(inode); 2108 + struct timespec64 now = current_time(inode); 2151 2109 struct timespec64 ctime; 2152 2110 2153 2111 /* First try to exhaust all avenues to not sync */ ··· 2536 2578 */ 2537 2579 struct timespec64 inode_set_ctime_current(struct inode *inode) 2538 2580 { 2539 - struct timespec64 now; 2540 - struct timespec64 ctime; 2581 + struct timespec64 now = current_time(inode); 2541 2582 2542 - ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec); 2543 - if (!(ctime.tv_nsec & I_CTIME_QUERIED)) { 2544 - now = current_time(inode); 2545 - 2546 - /* Just copy it into place if it's not multigrain */ 2547 - if (!is_mgtime(inode)) { 2548 - inode_set_ctime_to_ts(inode, now); 2549 - return now; 2550 - } 2551 - 2552 - /* 2553 - * If we've recently updated with a fine-grained timestamp, 2554 - * then the coarse-grained one may still be earlier than the 2555 - * existing ctime. Just keep the existing value if so. 2556 - */ 2557 - ctime.tv_sec = inode->__i_ctime.tv_sec; 2558 - if (timespec64_compare(&ctime, &now) > 0) 2559 - return ctime; 2560 - 2561 - /* 2562 - * Ctime updates are usually protected by the inode_lock, but 2563 - * we can still race with someone setting the QUERIED flag. 2564 - * Try to swap the new nsec value into place. If it's changed 2565 - * in the interim, then just go with a fine-grained timestamp. 
2566 - */ 2567 - if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec, 2568 - now.tv_nsec) != ctime.tv_nsec) 2569 - goto fine_grained; 2570 - inode->__i_ctime.tv_sec = now.tv_sec; 2571 - return now; 2572 - } 2573 - fine_grained: 2574 - ktime_get_real_ts64(&now); 2575 - inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode)); 2583 + inode_set_ctime(inode, now.tv_sec, now.tv_nsec); 2576 2584 return now; 2577 2585 } 2578 2586 EXPORT_SYMBOL(inode_set_ctime_current);
+2 -39
fs/stat.c
··· 27 27 #include "mount.h" 28 28 29 29 /** 30 - * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED 31 - * @stat: where to store the resulting values 32 - * @request_mask: STATX_* values requested 33 - * @inode: inode from which to grab the c/mtime 34 - * 35 - * Given @inode, grab the ctime and mtime out if it and store the result 36 - * in @stat. When fetching the value, flag it as queried so the next write 37 - * will use a fine-grained timestamp. 38 - */ 39 - void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) 40 - { 41 - atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; 42 - 43 - /* If neither time was requested, then don't report them */ 44 - if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { 45 - stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); 46 - return; 47 - } 48 - 49 - stat->mtime = inode->i_mtime; 50 - stat->ctime.tv_sec = inode->__i_ctime.tv_sec; 51 - /* 52 - * Atomically set the QUERIED flag and fetch the new value with 53 - * the flag masked off. 54 - */ 55 - stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) & 56 - ~I_CTIME_QUERIED; 57 - } 58 - EXPORT_SYMBOL(fill_mg_cmtime); 59 - 60 - /** 61 30 * generic_fillattr - Fill in the basic attributes from the inode struct 62 31 * @idmap: idmap of the mount the inode was found from 63 32 * @request_mask: statx request_mask ··· 58 89 stat->rdev = inode->i_rdev; 59 90 stat->size = i_size_read(inode); 60 91 stat->atime = inode->i_atime; 61 - 62 - if (is_mgtime(inode)) { 63 - fill_mg_cmtime(stat, request_mask, inode); 64 - } else { 65 - stat->mtime = inode->i_mtime; 66 - stat->ctime = inode_get_ctime(inode); 67 - } 68 - 92 + stat->mtime = inode->i_mtime; 93 + stat->ctime = inode_get_ctime(inode); 69 94 stat->blksize = i_blocksize(inode); 70 95 stat->blocks = inode->i_blocks; 71 96
+3 -3
fs/xfs/libxfs/xfs_trans_inode.c
··· 62 62 ASSERT(tp); 63 63 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 64 64 65 - /* If the mtime changes, then ctime must also change */ 66 - ASSERT(flags & XFS_ICHGTIME_CHG); 65 + tv = current_time(inode); 67 66 68 - tv = inode_set_ctime_current(inode); 69 67 if (flags & XFS_ICHGTIME_MOD) 70 68 inode->i_mtime = tv; 69 + if (flags & XFS_ICHGTIME_CHG) 70 + inode_set_ctime_to_ts(inode, tv); 71 71 if (flags & XFS_ICHGTIME_CREATE) 72 72 ip->i_crtime = tv; 73 73 }
+3 -3
fs/xfs/xfs_iops.c
··· 573 573 stat->gid = vfsgid_into_kgid(vfsgid); 574 574 stat->ino = ip->i_ino; 575 575 stat->atime = inode->i_atime; 576 + stat->mtime = inode->i_mtime; 577 + stat->ctime = inode_get_ctime(inode); 576 578 stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); 577 - 578 - fill_mg_cmtime(stat, request_mask, inode); 579 579 580 580 if (xfs_has_v3inodes(mp)) { 581 581 if (request_mask & STATX_BTIME) { ··· 917 917 if (newsize != oldsize && 918 918 !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { 919 919 iattr->ia_ctime = iattr->ia_mtime = 920 - current_mgtime(inode); 920 + current_time(inode); 921 921 iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; 922 922 } 923 923
+1 -1
fs/xfs/xfs_super.c
··· 2065 2065 .init_fs_context = xfs_init_fs_context, 2066 2066 .parameters = xfs_fs_parameters, 2067 2067 .kill_sb = xfs_kill_sb, 2068 - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, 2068 + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, 2069 2069 }; 2070 2070 MODULE_ALIAS_FS("xfs"); 2071 2071
+2 -44
include/linux/fs.h
··· 1508 1508 kgid_has_mapping(fs_userns, kgid); 1509 1509 } 1510 1510 1511 - struct timespec64 current_mgtime(struct inode *inode); 1512 1511 struct timespec64 current_time(struct inode *inode); 1513 1512 struct timespec64 inode_set_ctime_current(struct inode *inode); 1514 - 1515 - /* 1516 - * Multigrain timestamps 1517 - * 1518 - * Conditionally use fine-grained ctime and mtime timestamps when there 1519 - * are users actively observing them via getattr. The primary use-case 1520 - * for this is NFS clients that use the ctime to distinguish between 1521 - * different states of the file, and that are often fooled by multiple 1522 - * operations that occur in the same coarse-grained timer tick. 1523 - * 1524 - * The kernel always keeps normalized struct timespec64 values in the ctime, 1525 - * which means that only the first 30 bits of the value are used. Use the 1526 - * 31st bit of the ctime's tv_nsec field as a flag to indicate that the value 1527 - * has been queried since it was last updated. 1528 - */ 1529 - #define I_CTIME_QUERIED (1L<<30) 1530 1513 1531 1514 /** 1532 1515 * inode_get_ctime - fetch the current ctime from the inode 1533 1516 * @inode: inode from which to fetch ctime 1534 1517 * 1535 - * Grab the current ctime tv_nsec field from the inode, mask off the 1536 - * I_CTIME_QUERIED flag and return it. This is mostly intended for use by 1537 - * internal consumers of the ctime that aren't concerned with ensuring a 1538 - * fine-grained update on the next change (e.g. when preparing to store 1539 - * the value in the backing store for later retrieval). 1540 - * 1541 - * This is safe to call regardless of whether the underlying filesystem 1542 - * is using multigrain timestamps. 1518 + * Grab the current ctime from the inode and return it. 
1543 1519 */ 1544 1520 static inline struct timespec64 inode_get_ctime(const struct inode *inode) 1545 1521 { 1546 - struct timespec64 ctime; 1547 - 1548 - ctime.tv_sec = inode->__i_ctime.tv_sec; 1549 - ctime.tv_nsec = inode->__i_ctime.tv_nsec & ~I_CTIME_QUERIED; 1550 - 1551 - return ctime; 1522 + return inode->__i_ctime; 1552 1523 } 1553 1524 1554 1525 /** ··· 2305 2334 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ 2306 2335 #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ 2307 2336 #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ 2308 - #define FS_MGTIME 64 /* FS uses multigrain timestamps */ 2309 2337 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ 2310 2338 int (*init_fs_context)(struct fs_context *); 2311 2339 const struct fs_parameter_spec *parameters; ··· 2327 2357 }; 2328 2358 2329 2359 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) 2330 - 2331 - /** 2332 - * is_mgtime: is this inode using multigrain timestamps 2333 - * @inode: inode to test for multigrain timestamps 2334 - * 2335 - * Return true if the inode uses multigrain timestamps, false otherwise. 
2336 - */ 2337 - static inline bool is_mgtime(const struct inode *inode) 2338 - { 2339 - return inode->i_sb->s_type->fs_flags & FS_MGTIME; 2340 - } 2341 2360 2342 2361 extern struct dentry *mount_bdev(struct file_system_type *fs_type, 2343 2362 int flags, const char *dev_name, void *data, ··· 3013 3054 extern int page_symlink(struct inode *inode, const char *symname, int len); 3014 3055 extern const struct inode_operations page_symlink_inode_operations; 3015 3056 extern void kfree_link(void *); 3016 - void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); 3017 3057 void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); 3018 3058 void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); 3019 3059 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
+1 -1
mm/shmem.c
··· 4586 4586 #endif 4587 4587 .kill_sb = kill_litter_super, 4588 4588 #ifdef CONFIG_SHMEM 4589 - .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, 4589 + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, 4590 4590 #else 4591 4591 .fs_flags = FS_USERNS_MOUNT, 4592 4592 #endif