Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-6.11.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull misc vfs updates from Christian Brauner:
"Features:

- Support passing NULL along AT_EMPTY_PATH for statx().

NULL paths with any flag value other than AT_EMPTY_PATH go the
usual route and end up with -EFAULT to retain compatibility (Rust
is abusing calls of the sort to detect availability of statx)

This avoids path lookup code, lockref management, memory allocation
and in case of NULL path userspace memory access (which can be
quite expensive with SMAP on x86_64)

- Don't block i_writecount during exec. Remove the
deny_write_access() mechanism for executables

- Relax open_by_handle_at() permissions in specific cases where we
can prove that the caller had sufficient privileges to open a file

- Switch timespec64 fields in struct inode to discrete integers
freeing up 4 bytes

Fixes:

- Fix false positive circular locking warning in hfsplus

- Initialize hfs_inode_info after hfs_alloc_inode() in hfs

- Avoid accidental overflows in vfs_fallocate()

- Don't interrupt fallocate with EINTR in tmpfs to avoid constantly
restarting shmem_fallocate()

- Add missing quote in comment in fs/readdir

Cleanups:

- Don't assign and test in an if statement in mqueue. Move the
assignment out of the if statement

- Reflow the logic in may_create_in_sticky()

- Remove the usage of the deprecated ida_simple_xx() API from procfs

- Reject FSCONFIG_CMD_CREATE_EXCL requests that depend on the new
mount api early

- Rename variables in copy_tree() to make it easier to understand

- Replace WARN(down_read_trylock, ...) abuse with proper asserts in
various places in the VFS

- Get rid of user_path_at_empty() and drop the empty argument from
getname_flags()

- Check for error while copying and no path in one branch in
getname_flags()

- Avoid redundant smp_mb() for THP handling in do_dentry_open()

- Rename parent_ino to d_parent_ino and make it use RCU

- Remove unused header include in fs/readdir

- Export in_group_or_capable() helper and switch f2fs and fuse over to
it instead of open-coding the logic in both places"

* tag 'vfs-6.11.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (27 commits)
ipc: mqueue: remove assignment from IS_ERR argument
vfs: rename parent_ino to d_parent_ino and make it use RCU
vfs: support statx(..., NULL, AT_EMPTY_PATH, ...)
stat: use vfs_empty_path() helper
fs: new helper vfs_empty_path()
fs: reflow may_create_in_sticky()
vfs: remove redundant smp_mb for thp handling in do_dentry_open
fuse: Use in_group_or_capable() helper
f2fs: Use in_group_or_capable() helper
fs: Export in_group_or_capable()
vfs: reorder checks in may_create_in_sticky
hfs: fix to initialize fields of hfs_inode_info after hfs_alloc_inode()
proc: Remove usage of the deprecated ida_simple_xx() API
hfsplus: fix to avoid false alarm of circular locking
Improve readability of copy_tree
vfs: shave a branch in getname_flags
vfs: retire user_path_at_empty and drop empty arg from getname_flags
vfs: stop using user_path_at_empty in do_readlinkat
tmpfs: don't interrupt fallocate with EINTR
fs: don't block i_writecount during exec
...

+545 -335
-2
fs/attr.c
··· 17 17 #include <linux/filelock.h> 18 18 #include <linux/security.h> 19 19 20 - #include "internal.h" 21 - 22 20 /** 23 21 * setattr_should_drop_sgid - determine whether the setgid bit needs to be 24 22 * removed
-2
fs/binfmt_elf.c
··· 1216 1216 } 1217 1217 reloc_func_desc = interp_load_addr; 1218 1218 1219 - allow_write_access(interpreter); 1220 1219 fput(interpreter); 1221 1220 1222 1221 kfree(interp_elf_ex); ··· 1307 1308 kfree(interp_elf_ex); 1308 1309 kfree(interp_elf_phdata); 1309 1310 out_free_file: 1310 - allow_write_access(interpreter); 1311 1311 if (interpreter) 1312 1312 fput(interpreter); 1313 1313 out_free_ph:
+1 -4
fs/binfmt_elf_fdpic.c
··· 394 394 goto error; 395 395 } 396 396 397 - allow_write_access(interpreter); 398 397 fput(interpreter); 399 398 interpreter = NULL; 400 399 } ··· 465 466 retval = 0; 466 467 467 468 error: 468 - if (interpreter) { 469 - allow_write_access(interpreter); 469 + if (interpreter) 470 470 fput(interpreter); 471 - } 472 471 kfree(interpreter_name); 473 472 kfree(exec_params.phdrs); 474 473 kfree(exec_params.loadmap);
+2 -5
fs/binfmt_misc.c
··· 247 247 if (retval < 0) 248 248 goto ret; 249 249 250 - if (fmt->flags & MISC_FMT_OPEN_FILE) { 250 + if (fmt->flags & MISC_FMT_OPEN_FILE) 251 251 interp_file = file_clone_open(fmt->interp_file); 252 - if (!IS_ERR(interp_file)) 253 - deny_write_access(interp_file); 254 - } else { 252 + else 255 253 interp_file = open_exec(fmt->interpreter); 256 - } 257 254 retval = PTR_ERR(interp_file); 258 255 if (IS_ERR(interp_file)) 259 256 goto ret;
+29 -1
fs/dcache.c
··· 1555 1555 { 1556 1556 struct dentry *dentry; 1557 1557 1558 - WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); 1558 + rwsem_assert_held_write(&sb->s_umount); 1559 1559 1560 1560 dentry = sb->s_root; 1561 1561 sb->s_root = NULL; ··· 3105 3105 d_instantiate(dentry, inode); 3106 3106 } 3107 3107 EXPORT_SYMBOL(d_tmpfile); 3108 + 3109 + /* 3110 + * Obtain inode number of the parent dentry. 3111 + */ 3112 + ino_t d_parent_ino(struct dentry *dentry) 3113 + { 3114 + struct dentry *parent; 3115 + struct inode *iparent; 3116 + unsigned seq; 3117 + ino_t ret; 3118 + 3119 + scoped_guard(rcu) { 3120 + seq = raw_seqcount_begin(&dentry->d_seq); 3121 + parent = READ_ONCE(dentry->d_parent); 3122 + iparent = d_inode_rcu(parent); 3123 + if (likely(iparent)) { 3124 + ret = iparent->i_ino; 3125 + if (!read_seqcount_retry(&dentry->d_seq, seq)) 3126 + return ret; 3127 + } 3128 + } 3129 + 3130 + spin_lock(&dentry->d_lock); 3131 + ret = dentry->d_parent->d_inode->i_ino; 3132 + spin_unlock(&dentry->d_lock); 3133 + return ret; 3134 + } 3135 + EXPORT_SYMBOL(d_parent_ino); 3108 3136 3109 3137 static __initdata unsigned long dhash_entries; 3110 3138 static int __init set_dhash_entries(char *str)
+3 -11
fs/exec.c
··· 952 952 path_noexec(&file->f_path))) 953 953 goto exit; 954 954 955 - err = deny_write_access(file); 956 - if (err) 957 - goto exit; 958 - 959 955 out: 960 956 return file; 961 957 ··· 967 971 * 968 972 * Returns ERR_PTR on failure or allocated struct file on success. 969 973 * 970 - * As this is a wrapper for the internal do_open_execat(), callers 971 - * must call allow_write_access() before fput() on release. Also see 974 + * As this is a wrapper for the internal do_open_execat(). Also see 972 975 * do_close_execat(). 973 976 */ 974 977 struct file *open_exec(const char *name) ··· 1519 1524 /* Matches do_open_execat() */ 1520 1525 static void do_close_execat(struct file *file) 1521 1526 { 1522 - if (!file) 1523 - return; 1524 - allow_write_access(file); 1525 - fput(file); 1527 + if (file) 1528 + fput(file); 1526 1529 } 1527 1530 1528 1531 static void free_bprm(struct linux_binprm *bprm) ··· 1839 1846 bprm->file = bprm->interpreter; 1840 1847 bprm->interpreter = NULL; 1841 1848 1842 - allow_write_access(exec); 1843 1849 if (unlikely(bprm->have_execfd)) { 1844 1850 if (bprm->executable) { 1845 1851 fput(exec);
+7 -2
fs/exportfs/expfs.c
··· 427 427 428 428 struct dentry * 429 429 exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, 430 - int fileid_type, 430 + int fileid_type, unsigned int flags, 431 431 int (*acceptable)(void *, struct dentry *), 432 432 void *context) 433 433 { ··· 444 444 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); 445 445 if (IS_ERR_OR_NULL(result)) 446 446 return result; 447 + 448 + if ((flags & EXPORT_FH_DIR_ONLY) && !d_is_dir(result)) { 449 + err = -ENOTDIR; 450 + goto err_result; 451 + } 447 452 448 453 /* 449 454 * If no acceptance criteria was specified by caller, a disconnected ··· 586 581 { 587 582 struct dentry *ret; 588 583 589 - ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 584 + ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 0, 590 585 acceptable, context); 591 586 if (IS_ERR_OR_NULL(ret)) { 592 587 if (ret == ERR_PTR(-ENOMEM))
+1 -2
fs/f2fs/acl.c
··· 219 219 return error; 220 220 if (error == 0) 221 221 *acl = NULL; 222 - if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && 223 - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) 222 + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) 224 223 mode &= ~S_ISGID; 225 224 *mode_p = mode; 226 225 return 0;
+2 -4
fs/f2fs/file.c
··· 185 185 if (!dentry) 186 186 return 0; 187 187 188 - *pino = parent_ino(dentry); 188 + *pino = d_parent_ino(dentry); 189 189 dput(dentry); 190 190 return 1; 191 191 } ··· 923 923 inode_set_ctime_to_ts(inode, attr->ia_ctime); 924 924 if (ia_valid & ATTR_MODE) { 925 925 umode_t mode = attr->ia_mode; 926 - vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); 927 926 928 - if (!vfsgid_in_group_p(vfsgid) && 929 - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) 927 + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) 930 928 mode &= ~S_ISGID; 931 929 set_acl_inode(inode, mode); 932 930 }
+147 -45
fs/fhandle.c
··· 115 115 return err; 116 116 } 117 117 118 - static struct vfsmount *get_vfsmount_from_fd(int fd) 118 + static int get_path_from_fd(int fd, struct path *root) 119 119 { 120 - struct vfsmount *mnt; 121 - 122 120 if (fd == AT_FDCWD) { 123 121 struct fs_struct *fs = current->fs; 124 122 spin_lock(&fs->lock); 125 - mnt = mntget(fs->pwd.mnt); 123 + *root = fs->pwd; 124 + path_get(root); 126 125 spin_unlock(&fs->lock); 127 126 } else { 128 127 struct fd f = fdget(fd); 129 128 if (!f.file) 130 - return ERR_PTR(-EBADF); 131 - mnt = mntget(f.file->f_path.mnt); 129 + return -EBADF; 130 + *root = f.file->f_path; 131 + path_get(root); 132 132 fdput(f); 133 133 } 134 - return mnt; 134 + 135 + return 0; 135 136 } 137 + 138 + enum handle_to_path_flags { 139 + HANDLE_CHECK_PERMS = (1 << 0), 140 + HANDLE_CHECK_SUBTREE = (1 << 1), 141 + }; 142 + 143 + struct handle_to_path_ctx { 144 + struct path root; 145 + enum handle_to_path_flags flags; 146 + unsigned int fh_flags; 147 + }; 136 148 137 149 static int vfs_dentry_acceptable(void *context, struct dentry *dentry) 138 150 { 139 - return 1; 140 - } 141 - 142 - static int do_handle_to_path(int mountdirfd, struct file_handle *handle, 143 - struct path *path) 144 - { 151 + struct handle_to_path_ctx *ctx = context; 152 + struct user_namespace *user_ns = current_user_ns(); 153 + struct dentry *d, *root = ctx->root.dentry; 154 + struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt); 145 155 int retval = 0; 146 - int handle_dwords; 147 156 148 - path->mnt = get_vfsmount_from_fd(mountdirfd); 149 - if (IS_ERR(path->mnt)) { 150 - retval = PTR_ERR(path->mnt); 151 - goto out_err; 157 + if (!root) 158 + return 1; 159 + 160 + /* Old permission model with global CAP_DAC_READ_SEARCH. */ 161 + if (!ctx->flags) 162 + return 1; 163 + 164 + /* 165 + * It's racy as we're not taking rename_lock but we're able to ignore 166 + * permissions and we just need an approximation whether we were able 167 + * to follow a path to the file. 
168 + * 169 + * It's also potentially expensive on some filesystems especially if 170 + * there is a deep path. 171 + */ 172 + d = dget(dentry); 173 + while (d != root && !IS_ROOT(d)) { 174 + struct dentry *parent = dget_parent(d); 175 + 176 + /* 177 + * We know that we have the ability to override DAC permissions 178 + * as we've verified this earlier via CAP_DAC_READ_SEARCH. But 179 + * we also need to make sure that there aren't any unmapped 180 + * inodes in the path that would prevent us from reaching the 181 + * file. 182 + */ 183 + if (!privileged_wrt_inode_uidgid(user_ns, idmap, 184 + d_inode(parent))) { 185 + dput(d); 186 + dput(parent); 187 + return retval; 188 + } 189 + 190 + dput(d); 191 + d = parent; 152 192 } 153 - /* change the handle size to multiple of sizeof(u32) */ 154 - handle_dwords = handle->handle_bytes >> 2; 155 - path->dentry = exportfs_decode_fh(path->mnt, 156 - (struct fid *)handle->f_handle, 157 - handle_dwords, handle->handle_type, 158 - vfs_dentry_acceptable, NULL); 159 - if (IS_ERR(path->dentry)) { 160 - retval = PTR_ERR(path->dentry); 161 - goto out_mnt; 162 - } 163 - return 0; 164 - out_mnt: 165 - mntput(path->mnt); 166 - out_err: 193 + 194 + if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root) 195 + retval = 1; 196 + WARN_ON_ONCE(d != root && d != root->d_sb->s_root); 197 + dput(d); 167 198 return retval; 168 199 } 169 200 201 + static int do_handle_to_path(struct file_handle *handle, struct path *path, 202 + struct handle_to_path_ctx *ctx) 203 + { 204 + int handle_dwords; 205 + struct vfsmount *mnt = ctx->root.mnt; 206 + 207 + /* change the handle size to multiple of sizeof(u32) */ 208 + handle_dwords = handle->handle_bytes >> 2; 209 + path->dentry = exportfs_decode_fh_raw(mnt, 210 + (struct fid *)handle->f_handle, 211 + handle_dwords, handle->handle_type, 212 + ctx->fh_flags, 213 + vfs_dentry_acceptable, ctx); 214 + if (IS_ERR_OR_NULL(path->dentry)) { 215 + if (path->dentry == ERR_PTR(-ENOMEM)) 216 + return -ENOMEM; 217 + 
return -ESTALE; 218 + } 219 + path->mnt = mntget(mnt); 220 + return 0; 221 + } 222 + 223 + /* 224 + * Allow relaxed permissions of file handles if the caller has the 225 + * ability to mount the filesystem or create a bind-mount of the 226 + * provided @mountdirfd. 227 + * 228 + * In both cases the caller may be able to get an unobstructed way to 229 + * the encoded file handle. If the caller is only able to create a 230 + * bind-mount we need to verify that there are no locked mounts on top 231 + * of it that could prevent us from getting to the encoded file. 232 + * 233 + * In principle, locked mounts can prevent the caller from mounting the 234 + * filesystem but that only applies to procfs and sysfs neither of which 235 + * support decoding file handles. 236 + */ 237 + static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, 238 + unsigned int o_flags) 239 + { 240 + struct path *root = &ctx->root; 241 + 242 + /* 243 + * Restrict to O_DIRECTORY to provide a deterministic API that avoids a 244 + * confusing api in the face of disconnected non-dir dentries. 245 + * 246 + * There's only one dentry for each directory inode (VFS rule)... 247 + */ 248 + if (!(o_flags & O_DIRECTORY)) 249 + return false; 250 + 251 + if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) 252 + ctx->flags = HANDLE_CHECK_PERMS; 253 + else if (is_mounted(root->mnt) && 254 + ns_capable(real_mount(root->mnt)->mnt_ns->user_ns, 255 + CAP_SYS_ADMIN) && 256 + !has_locked_children(real_mount(root->mnt), root->dentry)) 257 + ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; 258 + else 259 + return false; 260 + 261 + /* Are we able to override DAC permissions? 
*/ 262 + if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) 263 + return false; 264 + 265 + ctx->fh_flags = EXPORT_FH_DIR_ONLY; 266 + return true; 267 + } 268 + 170 269 static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, 171 - struct path *path) 270 + struct path *path, unsigned int o_flags) 172 271 { 173 272 int retval = 0; 174 273 struct file_handle f_handle; 175 274 struct file_handle *handle = NULL; 275 + struct handle_to_path_ctx ctx = {}; 176 276 177 - /* 178 - * With handle we don't look at the execute bit on the 179 - * directory. Ideally we would like CAP_DAC_SEARCH. 180 - * But we don't have that 181 - */ 182 - if (!capable(CAP_DAC_READ_SEARCH)) { 183 - retval = -EPERM; 277 + retval = get_path_from_fd(mountdirfd, &ctx.root); 278 + if (retval) 184 279 goto out_err; 280 + 281 + if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { 282 + retval = -EPERM; 283 + goto out_path; 185 284 } 285 + 186 286 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { 187 287 retval = -EFAULT; 188 - goto out_err; 288 + goto out_path; 189 289 } 190 290 if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || 191 291 (f_handle.handle_bytes == 0)) { 192 292 retval = -EINVAL; 193 - goto out_err; 293 + goto out_path; 194 294 } 195 295 handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), 196 296 GFP_KERNEL); 197 297 if (!handle) { 198 298 retval = -ENOMEM; 199 - goto out_err; 299 + goto out_path; 200 300 } 201 301 /* copy the full handle */ 202 302 *handle = f_handle; ··· 307 207 goto out_handle; 308 208 } 309 209 310 - retval = do_handle_to_path(mountdirfd, handle, path); 210 + retval = do_handle_to_path(handle, path, &ctx); 311 211 312 212 out_handle: 313 213 kfree(handle); 214 + out_path: 215 + path_put(&ctx.root); 314 216 out_err: 315 217 return retval; 316 218 } ··· 325 223 struct file *file; 326 224 int fd; 327 225 328 - retval = handle_to_path(mountdirfd, ufh, &path); 226 + retval = handle_to_path(mountdirfd, 
ufh, &path, open_flag); 329 227 if (retval) 330 228 return retval; 331 229
+2 -5
fs/fsopen.c
··· 220 220 if (!mount_capable(fc)) 221 221 return -EPERM; 222 222 223 - /* require the new mount api */ 224 - if (exclusive && fc->ops == &legacy_fs_context_ops) 225 - return -EOPNOTSUPP; 226 - 227 223 fc->phase = FS_CONTEXT_CREATING; 228 224 fc->exclusive = exclusive; 229 225 ··· 407 411 case FSCONFIG_SET_PATH: 408 412 case FSCONFIG_SET_PATH_EMPTY: 409 413 case FSCONFIG_SET_FD: 414 + case FSCONFIG_CMD_CREATE_EXCL: 410 415 ret = -EOPNOTSUPP; 411 416 goto out_f; 412 417 } ··· 448 451 fallthrough; 449 452 case FSCONFIG_SET_PATH: 450 453 param.type = fs_value_is_filename; 451 - param.name = getname_flags(_value, lookup_flags, NULL); 454 + param.name = getname_flags(_value, lookup_flags); 452 455 if (IS_ERR(param.name)) { 453 456 ret = PTR_ERR(param.name); 454 457 goto out_key;
+2 -2
fs/fuse/acl.c
··· 146 146 * be stripped. 147 147 */ 148 148 if (fc->posix_acl && 149 - !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && 150 - !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) 149 + !in_group_or_capable(&nop_mnt_idmap, inode, 150 + i_gid_into_vfsgid(&nop_mnt_idmap, inode))) 151 151 extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; 152 152 153 153 ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
+3
fs/hfs/inode.c
··· 200 200 HFS_I(inode)->flags = 0; 201 201 HFS_I(inode)->rsrc_inode = NULL; 202 202 HFS_I(inode)->fs_blocks = 0; 203 + HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; 203 204 if (S_ISDIR(mode)) { 204 205 inode->i_size = 2; 205 206 HFS_SB(sb)->folder_count++; ··· 276 275 for (count = 0, i = 0; i < 3; i++) 277 276 count += be16_to_cpu(ext[i].count); 278 277 HFS_I(inode)->first_blocks = count; 278 + HFS_I(inode)->cached_start = 0; 279 + HFS_I(inode)->cached_blocks = 0; 279 280 280 281 inode->i_size = HFS_I(inode)->phys_size = log_size; 281 282 HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+2 -13
fs/hfsplus/bfind.c
··· 25 25 fd->key = ptr + tree->max_key_len + 2; 26 26 hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", 27 27 tree->cnid, __builtin_return_address(0)); 28 - switch (tree->cnid) { 29 - case HFSPLUS_CAT_CNID: 30 - mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); 31 - break; 32 - case HFSPLUS_EXT_CNID: 33 - mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); 34 - break; 35 - case HFSPLUS_ATTR_CNID: 36 - mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); 37 - break; 38 - default: 39 - BUG(); 40 - } 28 + mutex_lock_nested(&tree->tree_lock, 29 + hfsplus_btree_lock_class(tree)); 41 30 return 0; 42 31 } 43 32
+6 -3
fs/hfsplus/extents.c
··· 430 430 hfsplus_free_extents(sb, ext_entry, total_blocks - start, 431 431 total_blocks); 432 432 total_blocks = start; 433 - mutex_lock(&fd.tree->tree_lock); 433 + mutex_lock_nested(&fd.tree->tree_lock, 434 + hfsplus_btree_lock_class(fd.tree)); 434 435 } while (total_blocks > blocks); 435 436 hfs_find_exit(&fd); 436 437 ··· 593 592 alloc_cnt, alloc_cnt - blk_cnt); 594 593 hfsplus_dump_extent(hip->first_extents); 595 594 hip->first_blocks = blk_cnt; 596 - mutex_lock(&fd.tree->tree_lock); 595 + mutex_lock_nested(&fd.tree->tree_lock, 596 + hfsplus_btree_lock_class(fd.tree)); 597 597 break; 598 598 } 599 599 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); ··· 608 606 hfsplus_free_extents(sb, hip->cached_extents, 609 607 alloc_cnt - start, alloc_cnt - blk_cnt); 610 608 hfsplus_dump_extent(hip->cached_extents); 611 - mutex_lock(&fd.tree->tree_lock); 609 + mutex_lock_nested(&fd.tree->tree_lock, 610 + hfsplus_btree_lock_class(fd.tree)); 612 611 if (blk_cnt > start) { 613 612 hip->extent_state |= HFSPLUS_EXT_DIRTY; 614 613 break;
+21
fs/hfsplus/hfsplus_fs.h
··· 553 553 return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET); 554 554 } 555 555 556 + static inline enum hfsplus_btree_mutex_classes 557 + hfsplus_btree_lock_class(struct hfs_btree *tree) 558 + { 559 + enum hfsplus_btree_mutex_classes class; 560 + 561 + switch (tree->cnid) { 562 + case HFSPLUS_CAT_CNID: 563 + class = CATALOG_BTREE_MUTEX; 564 + break; 565 + case HFSPLUS_EXT_CNID: 566 + class = EXTENTS_BTREE_MUTEX; 567 + break; 568 + case HFSPLUS_ATTR_CNID: 569 + class = ATTR_BTREE_MUTEX; 570 + break; 571 + default: 572 + BUG(); 573 + } 574 + return class; 575 + } 576 + 556 577 /* compatibility */ 557 578 #define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } 558 579 #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
+2 -2
fs/hfsplus/ioctl.c
··· 40 40 41 41 /* Directory containing the bootable system */ 42 42 vh->finder_info[0] = bvh->finder_info[0] = 43 - cpu_to_be32(parent_ino(dentry)); 43 + cpu_to_be32(d_parent_ino(dentry)); 44 44 45 45 /* 46 46 * Bootloader. Just using the inode here breaks in the case of ··· 51 51 52 52 /* Per spec, the OS X system folder - same as finder_info[0] here */ 53 53 vh->finder_info[5] = bvh->finder_info[5] = 54 - cpu_to_be32(parent_ino(dentry)); 54 + cpu_to_be32(d_parent_ino(dentry)); 55 55 56 56 mutex_unlock(&sbi->vh_mutex); 57 57 return 0;
+1
fs/inode.c
··· 2538 2538 return true; 2539 2539 return false; 2540 2540 } 2541 + EXPORT_SYMBOL(in_group_or_capable); 2541 2542 2542 2543 /** 2543 2544 * mode_strip_sgid - handle the sgid bit for non-directories
+14
fs/internal.h
··· 247 247 int getname_statx_lookup_flags(int flags); 248 248 int do_statx(int dfd, struct filename *filename, unsigned int flags, 249 249 unsigned int mask, struct statx __user *buffer); 250 + int do_statx_fd(int fd, unsigned int flags, unsigned int mask, 251 + struct statx __user *buffer); 250 252 251 253 /* 252 254 * fs/splice.c: ··· 323 321 int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, 324 322 struct path *path); 325 323 void stashed_dentry_prune(struct dentry *dentry); 324 + /** 325 + * path_mounted - check whether path is mounted 326 + * @path: path to check 327 + * 328 + * Determine whether @path refers to the root of a mount. 329 + * 330 + * Return: true if @path is the root of a mount, false if not. 331 + */ 332 + static inline bool path_mounted(const struct path *path) 333 + { 334 + return path->mnt->mnt_root == path->dentry; 335 + }
+1
fs/mount.h
··· 152 152 } 153 153 154 154 extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); 155 + bool has_locked_children(struct mount *mnt, struct dentry *dentry);
+62 -36
fs/namei.c
··· 126 126 #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) 127 127 128 128 struct filename * 129 - getname_flags(const char __user *filename, int flags, int *empty) 129 + getname_flags(const char __user *filename, int flags) 130 130 { 131 131 struct filename *result; 132 132 char *kname; ··· 148 148 result->name = kname; 149 149 150 150 len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); 151 - if (unlikely(len < 0)) { 152 - __putname(result); 153 - return ERR_PTR(len); 151 + /* 152 + * Handle both empty path and copy failure in one go. 153 + */ 154 + if (unlikely(len <= 0)) { 155 + if (unlikely(len < 0)) { 156 + __putname(result); 157 + return ERR_PTR(len); 158 + } 159 + 160 + /* The empty path is special. */ 161 + if (!(flags & LOOKUP_EMPTY)) { 162 + __putname(result); 163 + return ERR_PTR(-ENOENT); 164 + } 154 165 } 155 166 156 167 /* ··· 191 180 kfree(result); 192 181 return ERR_PTR(len); 193 182 } 183 + /* The empty path is special. */ 184 + if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { 185 + __putname(kname); 186 + kfree(result); 187 + return ERR_PTR(-ENOENT); 188 + } 194 189 if (unlikely(len == PATH_MAX)) { 195 190 __putname(kname); 196 191 kfree(result); ··· 205 188 } 206 189 207 190 atomic_set(&result->refcnt, 1); 208 - /* The empty path is special. */ 209 - if (unlikely(!len)) { 210 - if (empty) 211 - *empty = 1; 212 - if (!(flags & LOOKUP_EMPTY)) { 213 - putname(result); 214 - return ERR_PTR(-ENOENT); 215 - } 216 - } 217 - 218 191 result->uptr = filename; 219 192 result->aname = NULL; 220 193 audit_getname(result); ··· 216 209 { 217 210 int flags = (uflags & AT_EMPTY_PATH) ? 
LOOKUP_EMPTY : 0; 218 211 219 - return getname_flags(filename, flags, NULL); 212 + return getname_flags(filename, flags); 220 213 } 221 214 222 215 struct filename * 223 216 getname(const char __user * filename) 224 217 { 225 - return getname_flags(filename, 0, NULL); 218 + return getname_flags(filename, 0); 226 219 } 227 220 228 221 struct filename * ··· 1240 1233 * 1241 1234 * Returns 0 if the open is allowed, -ve on error. 1242 1235 */ 1243 - static int may_create_in_sticky(struct mnt_idmap *idmap, 1244 - struct nameidata *nd, struct inode *const inode) 1236 + static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, 1237 + struct inode *const inode) 1245 1238 { 1246 1239 umode_t dir_mode = nd->dir_mode; 1247 - vfsuid_t dir_vfsuid = nd->dir_vfsuid; 1240 + vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; 1248 1241 1249 - if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || 1250 - (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || 1251 - likely(!(dir_mode & S_ISVTX)) || 1252 - vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || 1253 - vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) 1242 + if (likely(!(dir_mode & S_ISVTX))) 1254 1243 return 0; 1255 1244 1256 - if (likely(dir_mode & 0002) || 1257 - (dir_mode & 0020 && 1258 - ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || 1259 - (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { 1260 - const char *operation = S_ISFIFO(inode->i_mode) ? 
1261 - "sticky_create_fifo" : 1262 - "sticky_create_regular"; 1263 - audit_log_path_denied(AUDIT_ANOM_CREAT, operation); 1245 + if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) 1246 + return 0; 1247 + 1248 + if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) 1249 + return 0; 1250 + 1251 + i_vfsuid = i_uid_into_vfsuid(idmap, inode); 1252 + 1253 + if (vfsuid_eq(i_vfsuid, dir_vfsuid)) 1254 + return 0; 1255 + 1256 + if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) 1257 + return 0; 1258 + 1259 + if (likely(dir_mode & 0002)) { 1260 + audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); 1264 1261 return -EACCES; 1265 1262 } 1263 + 1264 + if (dir_mode & 0020) { 1265 + if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { 1266 + audit_log_path_denied(AUDIT_ANOM_CREAT, 1267 + "sticky_create_fifo"); 1268 + return -EACCES; 1269 + } 1270 + 1271 + if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { 1272 + audit_log_path_denied(AUDIT_ANOM_CREAT, 1273 + "sticky_create_regular"); 1274 + return -EACCES; 1275 + } 1276 + } 1277 + 1266 1278 return 0; 1267 1279 } 1268 1280 ··· 2995 2969 } 2996 2970 #endif 2997 2971 2998 - int user_path_at_empty(int dfd, const char __user *name, unsigned flags, 2999 - struct path *path, int *empty) 2972 + int user_path_at(int dfd, const char __user *name, unsigned flags, 2973 + struct path *path) 3000 2974 { 3001 - struct filename *filename = getname_flags(name, flags, empty); 2975 + struct filename *filename = getname_flags(name, flags); 3002 2976 int ret = filename_lookup(dfd, filename, flags, path, NULL); 3003 2977 3004 2978 putname(filename); 3005 2979 return ret; 3006 2980 } 3007 - EXPORT_SYMBOL(user_path_at_empty); 2981 + EXPORT_SYMBOL(user_path_at); 3008 2982 3009 2983 int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, 3010 2984 struct inode *inode)
+32 -42
fs/namespace.c
··· 1846 1846 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); 1847 1847 } 1848 1848 1849 - /** 1850 - * path_mounted - check whether path is mounted 1851 - * @path: path to check 1852 - * 1853 - * Determine whether @path refers to the root of a mount. 1854 - * 1855 - * Return: true if @path is the root of a mount, false if not. 1856 - */ 1857 - static inline bool path_mounted(const struct path *path) 1858 - { 1859 - return path->mnt->mnt_root == path->dentry; 1860 - } 1861 - 1862 1849 static void warn_mandlock(void) 1863 1850 { 1864 1851 pr_warn_once("=======================================================\n" ··· 1953 1966 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1954 1967 } 1955 1968 1956 - struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1969 + struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, 1957 1970 int flag) 1958 1971 { 1959 - struct mount *res, *p, *q, *r, *parent; 1972 + struct mount *res, *src_parent, *src_root_child, *src_mnt, 1973 + *dst_parent, *dst_mnt; 1960 1974 1961 - if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) 1975 + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) 1962 1976 return ERR_PTR(-EINVAL); 1963 1977 1964 1978 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) 1965 1979 return ERR_PTR(-EINVAL); 1966 1980 1967 - res = q = clone_mnt(mnt, dentry, flag); 1968 - if (IS_ERR(q)) 1969 - return q; 1981 + res = dst_mnt = clone_mnt(src_root, dentry, flag); 1982 + if (IS_ERR(dst_mnt)) 1983 + return dst_mnt; 1970 1984 1971 - q->mnt_mountpoint = mnt->mnt_mountpoint; 1985 + src_parent = src_root; 1986 + dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint; 1972 1987 1973 - p = mnt; 1974 - list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 1975 - struct mount *s; 1976 - if (!is_subdir(r->mnt_mountpoint, dentry)) 1988 + list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { 1989 + if (!is_subdir(src_root_child->mnt_mountpoint, 
dentry)) 1977 1990 continue; 1978 1991 1979 - for (s = r; s; s = next_mnt(s, r)) { 1992 + for (src_mnt = src_root_child; src_mnt; 1993 + src_mnt = next_mnt(src_mnt, src_root_child)) { 1980 1994 if (!(flag & CL_COPY_UNBINDABLE) && 1981 - IS_MNT_UNBINDABLE(s)) { 1982 - if (s->mnt.mnt_flags & MNT_LOCKED) { 1995 + IS_MNT_UNBINDABLE(src_mnt)) { 1996 + if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { 1983 1997 /* Both unbindable and locked. */ 1984 - q = ERR_PTR(-EPERM); 1998 + dst_mnt = ERR_PTR(-EPERM); 1985 1999 goto out; 1986 2000 } else { 1987 - s = skip_mnt_tree(s); 2001 + src_mnt = skip_mnt_tree(src_mnt); 1988 2002 continue; 1989 2003 } 1990 2004 } 1991 2005 if (!(flag & CL_COPY_MNT_NS_FILE) && 1992 - is_mnt_ns_file(s->mnt.mnt_root)) { 1993 - s = skip_mnt_tree(s); 2006 + is_mnt_ns_file(src_mnt->mnt.mnt_root)) { 2007 + src_mnt = skip_mnt_tree(src_mnt); 1994 2008 continue; 1995 2009 } 1996 - while (p != s->mnt_parent) { 1997 - p = p->mnt_parent; 1998 - q = q->mnt_parent; 2010 + while (src_parent != src_mnt->mnt_parent) { 2011 + src_parent = src_parent->mnt_parent; 2012 + dst_mnt = dst_mnt->mnt_parent; 1999 2013 } 2000 - p = s; 2001 - parent = q; 2002 - q = clone_mnt(p, p->mnt.mnt_root, flag); 2003 - if (IS_ERR(q)) 2014 + 2015 + src_parent = src_mnt; 2016 + dst_parent = dst_mnt; 2017 + dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); 2018 + if (IS_ERR(dst_mnt)) 2004 2019 goto out; 2005 2020 lock_mount_hash(); 2006 - list_add_tail(&q->mnt_list, &res->mnt_list); 2007 - attach_mnt(q, parent, p->mnt_mp, false); 2021 + list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); 2022 + attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); 2008 2023 unlock_mount_hash(); 2009 2024 } 2010 2025 } 2011 2026 return res; 2027 + 2012 2028 out: 2013 2029 if (res) { 2014 2030 lock_mount_hash(); 2015 2031 umount_tree(res, UMOUNT_SYNC); 2016 2032 unlock_mount_hash(); 2017 2033 } 2018 - return q; 2034 + return dst_mnt; 2019 2035 } 2020 2036 2021 2037 /* Caller should check returned 
pointer for errors */ ··· 2068 2078 namespace_unlock(); 2069 2079 } 2070 2080 2071 - static bool has_locked_children(struct mount *mnt, struct dentry *dentry) 2081 + bool has_locked_children(struct mount *mnt, struct dentry *dentry) 2072 2082 { 2073 2083 struct mount *child; 2074 2084
+1 -1
fs/nfsd/nfsfh.c
··· 247 247 dentry = dget(exp->ex_path.dentry); 248 248 else { 249 249 dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, 250 - data_left, fileid_type, 250 + data_left, fileid_type, 0, 251 251 nfsd_acceptable, exp); 252 252 if (IS_ERR_OR_NULL(dentry)) { 253 253 trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
+10 -7
fs/open.c
··· 247 247 { 248 248 struct inode *inode = file_inode(file); 249 249 long ret; 250 + loff_t sum; 250 251 251 252 if (offset < 0 || len <= 0) 252 253 return -EINVAL; ··· 320 319 if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) 321 320 return -ENODEV; 322 321 323 - /* Check for wrap through zero too */ 324 - if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 322 + /* Check for wraparound */ 323 + if (check_add_overflow(offset, len, &sum)) 324 + return -EFBIG; 325 + 326 + if (sum > inode->i_sb->s_maxbytes) 325 327 return -EFBIG; 326 328 327 329 if (!file->f_op->fallocate) ··· 986 982 */ 987 983 if (f->f_mode & FMODE_WRITE) { 988 984 /* 989 - * Paired with smp_mb() in collapse_file() to ensure nr_thps 990 - * is up to date and the update to i_writecount by 991 - * get_write_access() is visible. Ensures subsequent insertion 992 - * of THPs into the page cache will fail. 985 + * Depends on full fence from get_write_access() to synchronize 986 + * against collapse_file() regarding i_writecount and nr_thps 987 + * updates. Ensures subsequent insertion of THPs into the page 988 + * cache will fail. 993 989 */ 994 - smp_mb(); 995 990 if (filemap_nr_thps(inode->i_mapping)) { 996 991 struct address_space *mapping = inode->i_mapping; 997 992
+3 -3
fs/proc/generic.c
··· 202 202 { 203 203 int i; 204 204 205 - i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, 206 - GFP_KERNEL); 205 + i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, 206 + GFP_KERNEL); 207 207 if (i < 0) 208 208 return i; 209 209 ··· 213 213 214 214 void proc_free_inum(unsigned int inum) 215 215 { 216 - ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); 216 + ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); 217 217 } 218 218 219 219 static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
+2 -6
fs/quota/dquot.c
··· 2246 2246 int cnt; 2247 2247 struct quota_info *dqopt = sb_dqopt(sb); 2248 2248 2249 - /* s_umount should be held in exclusive mode */ 2250 - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) 2251 - up_read(&sb->s_umount); 2249 + rwsem_assert_held_write(&sb->s_umount); 2252 2250 2253 2251 /* Cannot turn off usage accounting without turning off limits, or 2254 2252 * suspend quotas and simultaneously turn quotas off. */ ··· 2508 2510 int ret = 0, cnt; 2509 2511 unsigned int flags; 2510 2512 2511 - /* s_umount should be held in exclusive mode */ 2512 - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) 2513 - up_read(&sb->s_umount); 2513 + rwsem_assert_held_write(&sb->s_umount); 2514 2514 2515 2515 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 2516 2516 if (type != -1 && cnt != type)
+1 -3
fs/readdir.c
··· 22 22 #include <linux/compat.h> 23 23 #include <linux/uaccess.h> 24 24 25 - #include <asm/unaligned.h> 26 - 27 25 /* 28 26 * Some filesystems were never converted to '->iterate_shared()' 29 27 * and their directory iterators want the inode lock held for ··· 70 72 EXPORT_SYMBOL(wrap_directory_iterator); 71 73 72 74 /* 73 - * Note the "unsafe_put_user() semantics: we goto a 75 + * Note the "unsafe_put_user()" semantics: we goto a 74 76 * label for errors. 75 77 */ 76 78 #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \
+111 -59
fs/stat.c
··· 214 214 return lookup_flags; 215 215 } 216 216 217 + static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, 218 + u32 request_mask) 219 + { 220 + int error = vfs_getattr(path, stat, request_mask, flags); 221 + 222 + if (request_mask & STATX_MNT_ID_UNIQUE) { 223 + stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; 224 + stat->result_mask |= STATX_MNT_ID_UNIQUE; 225 + } else { 226 + stat->mnt_id = real_mount(path->mnt)->mnt_id; 227 + stat->result_mask |= STATX_MNT_ID; 228 + } 229 + 230 + if (path_mounted(path)) 231 + stat->attributes |= STATX_ATTR_MOUNT_ROOT; 232 + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; 233 + 234 + /* Handle STATX_DIOALIGN for block devices. */ 235 + if (request_mask & STATX_DIOALIGN) { 236 + struct inode *inode = d_backing_inode(path->dentry); 237 + 238 + if (S_ISBLK(inode->i_mode)) 239 + bdev_statx_dioalign(inode, stat); 240 + } 241 + 242 + return error; 243 + } 244 + 245 + static int vfs_statx_fd(int fd, int flags, struct kstat *stat, 246 + u32 request_mask) 247 + { 248 + CLASS(fd_raw, f)(fd); 249 + if (!f.file) 250 + return -EBADF; 251 + return vfs_statx_path(&f.file->f_path, flags, stat, request_mask); 252 + } 253 + 217 254 /** 218 255 * vfs_statx - Get basic and extra attributes by filename 219 256 * @dfd: A file descriptor representing the base dir for a relative filename ··· 280 243 retry: 281 244 error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); 282 245 if (error) 283 - goto out; 284 - 285 - error = vfs_getattr(&path, stat, request_mask, flags); 286 - 287 - if (request_mask & STATX_MNT_ID_UNIQUE) { 288 - stat->mnt_id = real_mount(path.mnt)->mnt_id_unique; 289 - stat->result_mask |= STATX_MNT_ID_UNIQUE; 290 - } else { 291 - stat->mnt_id = real_mount(path.mnt)->mnt_id; 292 - stat->result_mask |= STATX_MNT_ID; 293 - } 294 - 295 - if (path.mnt->mnt_root == path.dentry) 296 - stat->attributes |= STATX_ATTR_MOUNT_ROOT; 297 - stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; 298 - 299 - /* Handle 
STATX_DIOALIGN for block devices. */ 300 - if (request_mask & STATX_DIOALIGN) { 301 - struct inode *inode = d_backing_inode(path.dentry); 302 - 303 - if (S_ISBLK(inode->i_mode)) 304 - bdev_statx_dioalign(inode, stat); 305 - } 306 - 246 + return error; 247 + error = vfs_statx_path(&path, flags, stat, request_mask); 307 248 path_put(&path); 308 249 if (retry_estale(error, lookup_flags)) { 309 250 lookup_flags |= LOOKUP_REVAL; 310 251 goto retry; 311 252 } 312 - out: 313 253 return error; 314 254 } 315 255 ··· 303 289 * If AT_EMPTY_PATH is set, we expect the common case to be that 304 290 * empty path, and avoid doing all the extra pathname work. 305 291 */ 306 - if (dfd >= 0 && flags == AT_EMPTY_PATH) { 307 - char c; 292 + if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) 293 + return vfs_fstat(dfd, stat); 308 294 309 - ret = get_user(c, filename); 310 - if (unlikely(ret)) 311 - return ret; 312 - 313 - if (likely(!c)) 314 - return vfs_fstat(dfd, stat); 315 - } 316 - 317 - name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); 295 + name = getname_flags(filename, getname_statx_lookup_flags(statx_flags)); 318 296 ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); 319 297 putname(name); 320 298 ··· 494 488 char __user *buf, int bufsiz) 495 489 { 496 490 struct path path; 491 + struct filename *name; 497 492 int error; 498 - int empty = 0; 499 493 unsigned int lookup_flags = LOOKUP_EMPTY; 500 494 501 495 if (bufsiz <= 0) 502 496 return -EINVAL; 503 497 504 498 retry: 505 - error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); 506 - if (!error) { 507 - struct inode *inode = d_backing_inode(path.dentry); 499 + name = getname_flags(pathname, lookup_flags); 500 + error = filename_lookup(dfd, name, lookup_flags, &path, NULL); 501 + if (unlikely(error)) { 502 + putname(name); 503 + return error; 504 + } 508 505 509 - error = empty ? 
-ENOENT : -EINVAL; 510 - /* 511 - * AFS mountpoints allow readlink(2) but are not symlinks 512 - */ 513 - if (d_is_symlink(path.dentry) || inode->i_op->readlink) { 514 - error = security_inode_readlink(path.dentry); 515 - if (!error) { 516 - touch_atime(&path); 517 - error = vfs_readlink(path.dentry, buf, bufsiz); 518 - } 506 + /* 507 + * AFS mountpoints allow readlink(2) but are not symlinks 508 + */ 509 + if (d_is_symlink(path.dentry) || 510 + d_backing_inode(path.dentry)->i_op->readlink) { 511 + error = security_inode_readlink(path.dentry); 512 + if (!error) { 513 + touch_atime(&path); 514 + error = vfs_readlink(path.dentry, buf, bufsiz); 519 515 } 520 - path_put(&path); 521 - if (retry_estale(error, lookup_flags)) { 522 - lookup_flags |= LOOKUP_REVAL; 523 - goto retry; 524 - } 516 + } else { 517 + error = (name->name[0] == '\0') ? -ENOENT : -EINVAL; 518 + } 519 + path_put(&path); 520 + putname(name); 521 + if (retry_estale(error, lookup_flags)) { 522 + lookup_flags |= LOOKUP_REVAL; 523 + goto retry; 525 524 } 526 525 return error; 527 526 } ··· 685 674 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) 686 675 return -EINVAL; 687 676 688 - /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests 677 + /* 678 + * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests 689 679 * from userland. 690 680 */ 691 681 mask &= ~STATX_CHANGE_COOKIE; ··· 698 686 return cp_statx(&stat, buffer); 699 687 } 700 688 689 + int do_statx_fd(int fd, unsigned int flags, unsigned int mask, 690 + struct statx __user *buffer) 691 + { 692 + struct kstat stat; 693 + int error; 694 + 695 + if (mask & STATX__RESERVED) 696 + return -EINVAL; 697 + if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) 698 + return -EINVAL; 699 + 700 + /* 701 + * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests 702 + * from userland. 
703 + */ 704 + mask &= ~STATX_CHANGE_COOKIE; 705 + 706 + error = vfs_statx_fd(fd, flags, &stat, mask); 707 + if (error) 708 + return error; 709 + 710 + return cp_statx(&stat, buffer); 711 + } 712 + 701 713 /** 702 714 * sys_statx - System call to get enhanced stats 703 715 * @dfd: Base directory to pathwalk from *or* fd to stat. 704 - * @filename: File to stat or "" with AT_EMPTY_PATH 716 + * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH 705 717 * @flags: AT_* flags to control pathwalk. 706 718 * @mask: Parts of statx struct actually required. 707 719 * @buffer: Result buffer. 708 720 * 709 721 * Note that fstat() can be emulated by setting dfd to the fd of interest, 710 - * supplying "" as the filename and setting AT_EMPTY_PATH in the flags. 722 + * supplying "" (or preferably NULL) as the filename and setting AT_EMPTY_PATH 723 + * in the flags. 711 724 */ 712 725 SYSCALL_DEFINE5(statx, 713 726 int, dfd, const char __user *, filename, unsigned, flags, ··· 740 703 struct statx __user *, buffer) 741 704 { 742 705 int ret; 706 + unsigned lflags; 743 707 struct filename *name; 744 708 745 - name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL); 709 + /* 710 + * Short-circuit handling of NULL and "" paths. 711 + * 712 + * For a NULL path we require and accept only the AT_EMPTY_PATH flag 713 + * (possibly |'d with AT_STATX flags). 714 + * 715 + * However, glibc on 32-bit architectures implements fstatat as statx 716 + * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags. 717 + * Supporting this results in the uglification below. 718 + */ 719 + lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE); 720 + if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) 721 + return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer); 722 + 723 + name = getname_flags(filename, getname_statx_lookup_flags(flags)); 746 724 ret = do_statx(dfd, name, flags, mask, buffer); 747 725 putname(name); 748 726
+2
include/linux/dcache.h
··· 278 278 return dentry->d_lockref.count; 279 279 } 280 280 281 + ino_t d_parent_ino(struct dentry *dentry); 282 + 281 283 /* 282 284 * helper function for dentry_operations.d_dname() members 283 285 */
+2
include/linux/exportfs.h
··· 158 158 159 159 #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ 160 160 #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ 161 + #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ 161 162 162 163 /** 163 164 * struct export_operations - for nfsd to communicate with file systems ··· 306 305 extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, 307 306 struct fid *fid, int fh_len, 308 307 int fileid_type, 308 + unsigned int flags, 309 309 int (*acceptable)(void *, struct dentry *), 310 310 void *context); 311 311 extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
+53 -32
include/linux/fs.h
··· 660 660 }; 661 661 dev_t i_rdev; 662 662 loff_t i_size; 663 - struct timespec64 __i_atime; 664 - struct timespec64 __i_mtime; 665 - struct timespec64 __i_ctime; /* use inode_*_ctime accessors! */ 663 + time64_t i_atime_sec; 664 + time64_t i_mtime_sec; 665 + time64_t i_ctime_sec; 666 + u32 i_atime_nsec; 667 + u32 i_mtime_nsec; 668 + u32 i_ctime_nsec; 669 + u32 i_generation; 666 670 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 667 671 unsigned short i_bytes; 668 672 u8 i_blkbits; ··· 723 719 unsigned i_dir_seq; 724 720 }; 725 721 726 - __u32 i_generation; 727 722 728 723 #ifdef CONFIG_FSNOTIFY 729 724 __u32 i_fsnotify_mask; /* all events this inode cares about */ 725 + /* 32-bit hole reserved for expanding i_fsnotify_mask */ 730 726 struct fsnotify_mark_connector __rcu *i_fsnotify_marks; 731 727 #endif 732 728 ··· 1542 1538 1543 1539 static inline time64_t inode_get_atime_sec(const struct inode *inode) 1544 1540 { 1545 - return inode->__i_atime.tv_sec; 1541 + return inode->i_atime_sec; 1546 1542 } 1547 1543 1548 1544 static inline long inode_get_atime_nsec(const struct inode *inode) 1549 1545 { 1550 - return inode->__i_atime.tv_nsec; 1546 + return inode->i_atime_nsec; 1551 1547 } 1552 1548 1553 1549 static inline struct timespec64 inode_get_atime(const struct inode *inode) 1554 1550 { 1555 - return inode->__i_atime; 1551 + struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), 1552 + .tv_nsec = inode_get_atime_nsec(inode) }; 1553 + 1554 + return ts; 1556 1555 } 1557 1556 1558 1557 static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, 1559 1558 struct timespec64 ts) 1560 1559 { 1561 - inode->__i_atime = ts; 1560 + inode->i_atime_sec = ts.tv_sec; 1561 + inode->i_atime_nsec = ts.tv_nsec; 1562 1562 return ts; 1563 1563 } 1564 1564 ··· 1571 1563 { 1572 1564 struct timespec64 ts = { .tv_sec = sec, 1573 1565 .tv_nsec = nsec }; 1566 + 1574 1567 return inode_set_atime_to_ts(inode, ts); 1575 1568 } 1576 1569 1577 1570 static inline 
time64_t inode_get_mtime_sec(const struct inode *inode) 1578 1571 { 1579 - return inode->__i_mtime.tv_sec; 1572 + return inode->i_mtime_sec; 1580 1573 } 1581 1574 1582 1575 static inline long inode_get_mtime_nsec(const struct inode *inode) 1583 1576 { 1584 - return inode->__i_mtime.tv_nsec; 1577 + return inode->i_mtime_nsec; 1585 1578 } 1586 1579 1587 1580 static inline struct timespec64 inode_get_mtime(const struct inode *inode) 1588 1581 { 1589 - return inode->__i_mtime; 1582 + struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), 1583 + .tv_nsec = inode_get_mtime_nsec(inode) }; 1584 + return ts; 1590 1585 } 1591 1586 1592 1587 static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, 1593 1588 struct timespec64 ts) 1594 1589 { 1595 - inode->__i_mtime = ts; 1590 + inode->i_mtime_sec = ts.tv_sec; 1591 + inode->i_mtime_nsec = ts.tv_nsec; 1596 1592 return ts; 1597 1593 } 1598 1594 ··· 1610 1598 1611 1599 static inline time64_t inode_get_ctime_sec(const struct inode *inode) 1612 1600 { 1613 - return inode->__i_ctime.tv_sec; 1601 + return inode->i_ctime_sec; 1614 1602 } 1615 1603 1616 1604 static inline long inode_get_ctime_nsec(const struct inode *inode) 1617 1605 { 1618 - return inode->__i_ctime.tv_nsec; 1606 + return inode->i_ctime_nsec; 1619 1607 } 1620 1608 1621 1609 static inline struct timespec64 inode_get_ctime(const struct inode *inode) 1622 1610 { 1623 - return inode->__i_ctime; 1611 + struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), 1612 + .tv_nsec = inode_get_ctime_nsec(inode) }; 1613 + 1614 + return ts; 1624 1615 } 1625 1616 1626 1617 static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, 1627 1618 struct timespec64 ts) 1628 1619 { 1629 - inode->__i_ctime = ts; 1620 + inode->i_ctime_sec = ts.tv_sec; 1621 + inode->i_ctime_nsec = ts.tv_nsec; 1630 1622 return ts; 1631 1623 } 1632 1624 ··· 1942 1926 extern bool may_open_dev(const struct path *path); 1943 1927 umode_t mode_strip_sgid(struct mnt_idmap 
*idmap, 1944 1928 const struct inode *dir, umode_t mode); 1929 + bool in_group_or_capable(struct mnt_idmap *idmap, 1930 + const struct inode *inode, vfsgid_t vfsgid); 1945 1931 1946 1932 /* 1947 1933 * This is the "filldir" function type, used by readdir() to let ··· 2703 2685 } 2704 2686 extern int filp_close(struct file *, fl_owner_t id); 2705 2687 2706 - extern struct filename *getname_flags(const char __user *, int, int *); 2688 + extern struct filename *getname_flags(const char __user *, int); 2707 2689 extern struct filename *getname_uflags(const char __user *, int); 2708 2690 extern struct filename *getname(const char __user *); 2709 2691 extern struct filename *getname_kernel(const char *); ··· 3454 3436 return 0; 3455 3437 } 3456 3438 3457 - static inline ino_t parent_ino(struct dentry *dentry) 3458 - { 3459 - ino_t res; 3460 - 3461 - /* 3462 - * Don't strictly need d_lock here? If the parent ino could change 3463 - * then surely we'd have a deeper race in the caller? 3464 - */ 3465 - spin_lock(&dentry->d_lock); 3466 - res = dentry->d_parent->d_inode->i_ino; 3467 - spin_unlock(&dentry->d_lock); 3468 - return res; 3469 - } 3470 - 3471 3439 /* Transaction based IO helpers */ 3472 3440 3473 3441 /* ··· 3578 3574 static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) 3579 3575 { 3580 3576 return ctx->actor(ctx, "..", 2, ctx->pos, 3581 - parent_ino(file->f_path.dentry), DT_DIR); 3577 + d_parent_ino(file->f_path.dentry), DT_DIR); 3582 3578 } 3583 3579 static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) 3584 3580 { ··· 3616 3612 int advice); 3617 3613 extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, 3618 3614 int advice); 3615 + 3616 + static inline bool vfs_empty_path(int dfd, const char __user *path) 3617 + { 3618 + char c; 3619 + 3620 + if (dfd < 0) 3621 + return false; 3622 + 3623 + /* We now allow NULL to be used for empty path. 
*/ 3624 + if (!path) 3625 + return true; 3626 + 3627 + if (unlikely(get_user(c, path))) 3628 + return false; 3629 + 3630 + return !c; 3631 + } 3619 3632 3620 3633 #endif /* _LINUX_FS_H */
+1 -7
include/linux/namei.h
··· 50 50 51 51 extern int path_pts(struct path *path); 52 52 53 - extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); 54 - 55 - static inline int user_path_at(int dfd, const char __user *name, unsigned flags, 56 - struct path *path) 57 - { 58 - return user_path_at_empty(dfd, name, flags, path, NULL); 59 - } 53 + extern int user_path_at(int, const char __user *, unsigned, struct path *); 60 54 61 55 struct dentry *lookup_one_qstr_excl(const struct qstr *name, 62 56 struct dentry *base,
+1 -2
io_uring/statx.c
··· 37 37 sx->flags = READ_ONCE(sqe->statx_flags); 38 38 39 39 sx->filename = getname_flags(path, 40 - getname_statx_lookup_flags(sx->flags), 41 - NULL); 40 + getname_statx_lookup_flags(sx->flags)); 42 41 43 42 if (IS_ERR(sx->filename)) { 44 43 int ret = PTR_ERR(sx->filename);
+2 -2
io_uring/xattr.c
··· 96 96 97 97 path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); 98 98 99 - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); 99 + ix->filename = getname_flags(path, LOOKUP_FOLLOW); 100 100 if (IS_ERR(ix->filename)) { 101 101 ret = PTR_ERR(ix->filename); 102 102 ix->filename = NULL; ··· 189 189 190 190 path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); 191 191 192 - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); 192 + ix->filename = getname_flags(path, LOOKUP_FOLLOW); 193 193 if (IS_ERR(ix->filename)) { 194 194 ret = PTR_ERR(ix->filename); 195 195 ix->filename = NULL;
+2 -1
ipc/mqueue.c
··· 903 903 904 904 audit_mq_open(oflag, mode, attr); 905 905 906 - if (IS_ERR(name = getname(u_name))) 906 + name = getname(u_name); 907 + if (IS_ERR(name)) 907 908 return PTR_ERR(name); 908 909 909 910 fd = get_unused_fd_flags(O_CLOEXEC);
+3 -23
kernel/fork.c
··· 616 616 617 617 exe_file = get_mm_exe_file(oldmm); 618 618 RCU_INIT_POINTER(mm->exe_file, exe_file); 619 - /* 620 - * We depend on the oldmm having properly denied write access to the 621 - * exe_file already. 622 - */ 623 - if (exe_file && deny_write_access(exe_file)) 624 - pr_warn_once("deny_write_access() failed in %s\n", __func__); 625 619 } 626 620 627 621 #ifdef CONFIG_MMU ··· 1406 1412 */ 1407 1413 old_exe_file = rcu_dereference_raw(mm->exe_file); 1408 1414 1409 - if (new_exe_file) { 1410 - /* 1411 - * We expect the caller (i.e., sys_execve) to already denied 1412 - * write access, so this is unlikely to fail. 1413 - */ 1414 - if (unlikely(deny_write_access(new_exe_file))) 1415 - return -EACCES; 1415 + if (new_exe_file) 1416 1416 get_file(new_exe_file); 1417 - } 1418 1417 rcu_assign_pointer(mm->exe_file, new_exe_file); 1419 - if (old_exe_file) { 1420 - allow_write_access(old_exe_file); 1418 + if (old_exe_file) 1421 1419 fput(old_exe_file); 1422 - } 1423 1420 return 0; 1424 1421 } 1425 1422 ··· 1449 1464 return ret; 1450 1465 } 1451 1466 1452 - ret = deny_write_access(new_exe_file); 1453 - if (ret) 1454 - return -EACCES; 1455 1467 get_file(new_exe_file); 1456 1468 1457 1469 /* set the new file */ ··· 1457 1475 rcu_assign_pointer(mm->exe_file, new_exe_file); 1458 1476 mmap_write_unlock(mm); 1459 1477 1460 - if (old_exe_file) { 1461 - allow_write_access(old_exe_file); 1478 + if (old_exe_file) 1462 1479 fput(old_exe_file); 1463 - } 1464 1480 return 0; 1465 1481 } 1466 1482
+5 -5
mm/khugepaged.c
··· 2000 2000 if (!is_shmem) { 2001 2001 filemap_nr_thps_inc(mapping); 2002 2002 /* 2003 - * Paired with smp_mb() in do_dentry_open() to ensure 2004 - * i_writecount is up to date and the update to nr_thps is 2005 - * visible. Ensures the page cache will be truncated if the 2003 + * Paired with the fence in do_dentry_open() -> get_write_access() 2004 + * to ensure i_writecount is up to date and the update to nr_thps 2005 + * is visible. Ensures the page cache will be truncated if the 2006 2006 * file is opened writable. 2007 2007 */ 2008 2008 smp_mb(); ··· 2190 2190 if (!is_shmem && result == SCAN_COPY_MC) { 2191 2191 filemap_nr_thps_dec(mapping); 2192 2192 /* 2193 - * Paired with smp_mb() in do_dentry_open() to 2194 - * ensure the update to nr_thps is visible. 2193 + * Paired with the fence in do_dentry_open() -> get_write_access() 2194 + * to ensure the update to nr_thps is visible. 2195 2195 */ 2196 2196 smp_mb(); 2197 2197 }
+6 -3
mm/shmem.c
··· 3177 3177 struct folio *folio; 3178 3178 3179 3179 /* 3180 - * Good, the fallocate(2) manpage permits EINTR: we may have 3181 - * been interrupted because we are using up too much memory. 3180 + * Check for fatal signal so that we abort early in OOM 3181 + * situations. We don't want to abort in case of non-fatal 3182 + * signals as large fallocate can take noticeable time and 3183 + * e.g. periodic timers may result in fallocate constantly 3184 + * restarting. 3182 3185 */ 3183 - if (signal_pending(current)) 3186 + if (fatal_signal_pending(current)) 3184 3187 error = -EINTR; 3185 3188 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 3186 3189 error = -ENOMEM;