Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs fixes from Christian Brauner:

- Fix an uninitialized variable in file_getattr().

The flags_valid field wasn't initialized before calling
vfs_fileattr_get(), triggering KMSAN uninit-value reports in fuse

- Fix writeback wakeup and logging timeouts when DETECT_HUNG_TASK is
not enabled.

sysctl_hung_task_timeout_secs is 0 in that case, causing spurious
"waiting for writeback completion for more than 1 seconds" warnings

- Fix a null-ptr-deref in do_statmount() when the mount is internal

- Add missing kernel-doc description for the @private parameter in
iomap_readahead()

- Fix mount namespace creation to hold namespace_sem across the mount
copy in create_new_namespace().

The previous drop-and-reacquire pattern was fragile and failed to
clean up mount propagation links if the real rootfs was a shared or
dependent mount

- Fix /proc mount iteration where m->index wasn't updated when
m->show() overflows, causing a restart to repeatedly show the same
mount entry in a rapidly expanding mount table

- Return EFSCORRUPTED instead of ENOSPC in minix_new_inode() when the
inode number is out of range

- Fix unshare(2) when CLONE_NEWNS is set and current->fs isn't shared.

copy_mnt_ns() received the live fs_struct, so if a subsequent
namespace creation failed the rollback would leave pwd and root
pointing to detached mounts. Always allocate a new fs_struct when
CLONE_NEWNS is requested

- fserror bug fixes:

- Remove the unused fsnotify_sb_error() helper now that all callers
have been converted to fserror_report_metadata

- Fix a lockdep splat in fserror_report() where igrab() takes
inode::i_lock which can be held in IRQ context.

Replace igrab() with a direct i_count bump since filesystems
should not report inodes that are about to be freed or not yet
exposed

- Handle error pointer in procfs for try_lookup_noperm()

- Fix an integer overflow in ep_loop_check_proc() where recursive calls
returning INT_MAX would overflow when +1 is added, breaking the
recursion depth check

- Fix a misleading break in pidfs

* tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
pidfs: avoid misleading break
eventpoll: Fix integer overflow in ep_loop_check_proc()
proc: Fix pointer error dereference
fserror: fix lockdep complaint when igrabbing inode
fsnotify: drop unused helper
unshare: fix unshare_fs() handling
minix: Correct errno in minix_new_inode
namespace: fix proc mount iteration
mount: hold namespace_sem across copy in create_new_namespace()
iomap: Describe @private in iomap_readahead()
statmount: Fix the null-ptr-deref in do_statmount()
writeback: Fix wakeup and logging timeouts for !DETECT_HUNG_TASK
fs: init flags_valid before calling vfs_fileattr_get

+142 -90
+3 -2
fs/eventpoll.c
··· 2061 2061 * @ep: the &struct eventpoll to be currently checked. 2062 2062 * @depth: Current depth of the path being checked. 2063 2063 * 2064 - * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. 2064 + * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found 2065 + * a loop or went too deep. 2065 2066 */ 2066 2067 static int ep_loop_check_proc(struct eventpoll *ep, int depth) 2067 2068 { ··· 2081 2080 struct eventpoll *ep_tovisit; 2082 2081 ep_tovisit = epi->ffd.file->private_data; 2083 2082 if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) 2084 - result = INT_MAX; 2083 + result = EP_MAX_NESTS+1; 2085 2084 else 2086 2085 result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); 2087 2086 if (result > EP_MAX_NESTS)
+1 -1
fs/file_attr.c
··· 378 378 struct path filepath __free(path_put) = {}; 379 379 unsigned int lookup_flags = 0; 380 380 struct file_attr fattr; 381 - struct file_kattr fa; 381 + struct file_kattr fa = {}; 382 382 int error; 383 383 384 384 BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
+5 -4
fs/fs-writeback.c
··· 198 198 199 199 static bool wb_wait_for_completion_cb(struct wb_completion *done) 200 200 { 201 + unsigned long timeout = sysctl_hung_task_timeout_secs; 201 202 unsigned long waited_secs = (jiffies - done->wait_start) / HZ; 202 203 203 204 done->progress_stamp = jiffies; 204 - if (waited_secs > sysctl_hung_task_timeout_secs) 205 + if (timeout && (waited_secs > timeout)) 205 206 pr_info("INFO: The task %s:%d has been waiting for writeback " 206 207 "completion for more than %lu seconds.", 207 208 current->comm, current->pid, waited_secs); ··· 1955 1954 .range_end = LLONG_MAX, 1956 1955 }; 1957 1956 unsigned long start_time = jiffies; 1957 + unsigned long timeout = sysctl_hung_task_timeout_secs; 1958 1958 long write_chunk; 1959 1959 long total_wrote = 0; /* count both pages and inodes */ 1960 1960 unsigned long dirtied_before = jiffies; ··· 2042 2040 __writeback_single_inode(inode, &wbc); 2043 2041 2044 2042 /* Report progress to inform the hung task detector of the progress. */ 2045 - if (work->done && work->done->progress_stamp && 2046 - (jiffies - work->done->progress_stamp) > HZ * 2047 - sysctl_hung_task_timeout_secs / 2) 2043 + if (work->done && work->done->progress_stamp && timeout && 2044 + (jiffies - work->done->progress_stamp) > HZ * timeout / 2) 2048 2045 wake_up_all(work->done->waitq); 2049 2046 2050 2047 wbc_detach_inode(&wbc);
+1
fs/iomap/buffered-io.c
··· 624 624 * iomap_readahead - Attempt to read pages from a file. 625 625 * @ops: The operations vector for the filesystem. 626 626 * @ctx: The ctx used for issuing readahead. 627 + * @private: The filesystem-specific information for issuing iomap_iter. 627 628 * 628 629 * This function is for filesystems to call to implement their readahead 629 630 * address_space operation.
+46
fs/iomap/ioend.c
··· 69 69 return folio_count; 70 70 } 71 71 72 + static DEFINE_SPINLOCK(failed_ioend_lock); 73 + static LIST_HEAD(failed_ioend_list); 74 + 75 + static void 76 + iomap_fail_ioends( 77 + struct work_struct *work) 78 + { 79 + struct iomap_ioend *ioend; 80 + struct list_head tmp; 81 + unsigned long flags; 82 + 83 + spin_lock_irqsave(&failed_ioend_lock, flags); 84 + list_replace_init(&failed_ioend_list, &tmp); 85 + spin_unlock_irqrestore(&failed_ioend_lock, flags); 86 + 87 + while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, 88 + io_list))) { 89 + list_del_init(&ioend->io_list); 90 + iomap_finish_ioend_buffered(ioend); 91 + cond_resched(); 92 + } 93 + } 94 + 95 + static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends); 96 + 97 + static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend) 98 + { 99 + unsigned long flags; 100 + 101 + /* 102 + * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions 103 + * in the fserror code. The caller no longer owns the ioend reference 104 + * after the spinlock drops. 105 + */ 106 + spin_lock_irqsave(&failed_ioend_lock, flags); 107 + if (list_empty(&failed_ioend_list)) 108 + WARN_ON_ONCE(!schedule_work(&failed_ioend_work)); 109 + list_add_tail(&ioend->io_list, &failed_ioend_list); 110 + spin_unlock_irqrestore(&failed_ioend_lock, flags); 111 + } 112 + 72 113 static void ioend_writeback_end_bio(struct bio *bio) 73 114 { 74 115 struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); 75 116 76 117 ioend->io_error = blk_status_to_errno(bio->bi_status); 118 + if (ioend->io_error) { 119 + iomap_fail_ioend_buffered(ioend); 120 + return; 121 + } 122 + 77 123 iomap_finish_ioend_buffered(ioend); 78 124 } 79 125
+1 -1
fs/minix/bitmap.c
··· 247 247 j += i * bits_per_zone; 248 248 if (!j || j > sbi->s_ninodes) { 249 249 iput(inode); 250 - return ERR_PTR(-ENOSPC); 250 + return ERR_PTR(-EFSCORRUPTED); 251 251 } 252 252 inode_init_owner(&nop_mnt_idmap, inode, dir, mode); 253 253 inode->i_ino = j;
+77 -62
fs/namespace.c
··· 1531 1531 static void *m_start(struct seq_file *m, loff_t *pos) 1532 1532 { 1533 1533 struct proc_mounts *p = m->private; 1534 + struct mount *mnt; 1534 1535 1535 1536 down_read(&namespace_sem); 1536 1537 1537 - return mnt_find_id_at(p->ns, *pos); 1538 + mnt = mnt_find_id_at(p->ns, *pos); 1539 + if (mnt) 1540 + *pos = mnt->mnt_id_unique; 1541 + return mnt; 1538 1542 } 1539 1543 1540 1544 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1541 1545 { 1542 - struct mount *next = NULL, *mnt = v; 1546 + struct mount *mnt = v; 1543 1547 struct rb_node *node = rb_next(&mnt->mnt_node); 1544 1548 1545 - ++*pos; 1546 1549 if (node) { 1547 - next = node_to_mount(node); 1550 + struct mount *next = node_to_mount(node); 1548 1551 *pos = next->mnt_id_unique; 1552 + return next; 1549 1553 } 1550 - return next; 1554 + 1555 + /* 1556 + * No more mounts. Set pos past current mount's ID so that if 1557 + * iteration restarts, mnt_find_id_at() returns NULL. 1558 + */ 1559 + *pos = mnt->mnt_id_unique + 1; 1560 + return NULL; 1551 1561 } 1552 1562 1553 1563 static void m_stop(struct seq_file *m, void *v) ··· 2801 2791 } 2802 2792 2803 2793 static void lock_mount_exact(const struct path *path, 2804 - struct pinned_mountpoint *mp); 2794 + struct pinned_mountpoint *mp, bool copy_mount, 2795 + unsigned int copy_flags); 2805 2796 2806 2797 #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ 2807 2798 struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ ··· 2810 2799 #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) 2811 2800 #define LOCK_MOUNT_EXACT(mp, path) \ 2812 2801 struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ 2813 - lock_mount_exact((path), &mp) 2802 + lock_mount_exact((path), &mp, false, 0) 2803 + #define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \ 2804 + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ 2805 + lock_mount_exact((path), &mp, true, (copy_flags)) 2814 2806 2815 2807 static int graft_tree(struct 
mount *mnt, const struct pinned_mountpoint *mp) 2816 2808 { ··· 3087 3073 return file; 3088 3074 } 3089 3075 3090 - DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *, 3091 - if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T)) 3092 - 3093 3076 static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) 3094 3077 { 3095 - struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL; 3096 - struct path to_path __free(path_put) = {}; 3097 3078 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 3098 3079 struct user_namespace *user_ns = current_user_ns(); 3099 - struct mount *new_ns_root; 3080 + struct mnt_namespace *new_ns; 3081 + struct mount *new_ns_root, *old_ns_root; 3082 + struct path to_path; 3100 3083 struct mount *mnt; 3101 3084 unsigned int copy_flags = 0; 3102 3085 bool locked = false; ··· 3105 3094 if (IS_ERR(new_ns)) 3106 3095 return ERR_CAST(new_ns); 3107 3096 3108 - scoped_guard(namespace_excl) { 3109 - new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); 3110 - if (IS_ERR(new_ns_root)) 3111 - return ERR_CAST(new_ns_root); 3097 + old_ns_root = ns->root; 3098 + to_path.mnt = &old_ns_root->mnt; 3099 + to_path.dentry = old_ns_root->mnt.mnt_root; 3112 3100 3113 - /* 3114 - * If the real rootfs had a locked mount on top of it somewhere 3115 - * in the stack, lock the new mount tree as well so it can't be 3116 - * exposed. 
3117 - */ 3118 - mnt = ns->root; 3119 - while (mnt->overmount) { 3120 - mnt = mnt->overmount; 3121 - if (mnt->mnt.mnt_flags & MNT_LOCKED) 3122 - locked = true; 3123 - } 3101 + VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type); 3102 + 3103 + LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags); 3104 + if (IS_ERR(mp.parent)) { 3105 + free_mnt_ns(new_ns); 3106 + return ERR_CAST(mp.parent); 3107 + } 3108 + new_ns_root = mp.parent; 3109 + 3110 + /* 3111 + * If the real rootfs had a locked mount on top of it somewhere 3112 + * in the stack, lock the new mount tree as well so it can't be 3113 + * exposed. 3114 + */ 3115 + mnt = old_ns_root; 3116 + while (mnt->overmount) { 3117 + mnt = mnt->overmount; 3118 + if (mnt->mnt.mnt_flags & MNT_LOCKED) 3119 + locked = true; 3124 3120 } 3125 3121 3126 3122 /* 3127 - * We dropped the namespace semaphore so we can actually lock 3128 - * the copy for mounting. The copied mount isn't attached to any 3129 - * mount namespace and it is thus excluded from any propagation. 3130 - * So realistically we're isolated and the mount can't be 3131 - * overmounted. 3132 - */ 3133 - 3134 - /* Borrow the reference from clone_mnt(). */ 3135 - to_path.mnt = &new_ns_root->mnt; 3136 - to_path.dentry = dget(new_ns_root->mnt.mnt_root); 3137 - 3138 - /* Now lock for actual mounting. */ 3139 - LOCK_MOUNT_EXACT(mp, &to_path); 3140 - if (unlikely(IS_ERR(mp.parent))) 3141 - return ERR_CAST(mp.parent); 3142 - 3143 - /* 3144 - * We don't emulate unshare()ing a mount namespace. We stick to the 3145 - * restrictions of creating detached bind-mounts. It has a lot 3146 - * saner and simpler semantics. 3123 + * We don't emulate unshare()ing a mount namespace. We stick 3124 + * to the restrictions of creating detached bind-mounts. It 3125 + * has a lot saner and simpler semantics. 
3147 3126 */ 3148 3127 mnt = __do_loopback(path, flags, copy_flags); 3149 - if (IS_ERR(mnt)) 3150 - return ERR_CAST(mnt); 3151 - 3152 3128 scoped_guard(mount_writer) { 3129 + if (IS_ERR(mnt)) { 3130 + emptied_ns = new_ns; 3131 + umount_tree(new_ns_root, 0); 3132 + return ERR_CAST(mnt); 3133 + } 3134 + 3153 3135 if (locked) 3154 3136 mnt->mnt.mnt_flags |= MNT_LOCKED; 3155 3137 /* 3156 - * Now mount the detached tree on top of the copy of the 3157 - * real rootfs we created. 3138 + * now mount the detached tree on top of the copy 3139 + * of the real rootfs we created. 3158 3140 */ 3159 3141 attach_mnt(mnt, new_ns_root, mp.mp); 3160 3142 if (user_ns != ns->user_ns) 3161 3143 lock_mnt_tree(new_ns_root); 3162 3144 } 3163 3145 3164 - /* Add all mounts to the new namespace. */ 3165 - for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) { 3166 - mnt_add_to_ns(new_ns, p); 3146 + for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) { 3147 + mnt_add_to_ns(new_ns, mnt); 3167 3148 new_ns->nr_mounts++; 3168 3149 } 3169 3150 3170 - new_ns->root = real_mount(no_free_ptr(to_path.mnt)); 3151 + new_ns->root = new_ns_root; 3171 3152 ns_tree_add_raw(new_ns); 3172 - return no_free_ptr(new_ns); 3153 + return new_ns; 3173 3154 } 3174 3155 3175 3156 static struct file *open_new_namespace(struct path *path, unsigned int flags) ··· 3843 3840 } 3844 3841 3845 3842 static void lock_mount_exact(const struct path *path, 3846 - struct pinned_mountpoint *mp) 3843 + struct pinned_mountpoint *mp, bool copy_mount, 3844 + unsigned int copy_flags) 3847 3845 { 3848 3846 struct dentry *dentry = path->dentry; 3849 3847 int err; 3848 + 3849 + /* Assert that inode_lock() locked the correct inode. 
*/ 3850 + VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path)); 3850 3851 3851 3852 inode_lock(dentry->d_inode); 3852 3853 namespace_lock(); 3853 3854 if (unlikely(cant_mount(dentry))) 3854 3855 err = -ENOENT; 3855 - else if (path_overmounted(path)) 3856 + else if (!copy_mount && path_overmounted(path)) 3856 3857 err = -EBUSY; 3857 3858 else 3858 3859 err = get_mountpoint(dentry, mp); ··· 3864 3857 namespace_unlock(); 3865 3858 inode_unlock(dentry->d_inode); 3866 3859 mp->parent = ERR_PTR(err); 3867 - } else { 3868 - mp->parent = real_mount(path->mnt); 3860 + return; 3869 3861 } 3862 + 3863 + if (copy_mount) 3864 + mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags); 3865 + else 3866 + mp->parent = real_mount(path->mnt); 3867 + if (unlikely(IS_ERR(mp->parent))) 3868 + __unlock_mount(mp); 3870 3869 } 3871 3870 3872 3871 int finish_automount(struct vfsmount *__m, const struct path *path) ··· 5691 5678 5692 5679 s->mnt = mnt_file->f_path.mnt; 5693 5680 ns = real_mount(s->mnt)->mnt_ns; 5681 + if (IS_ERR(ns)) 5682 + return PTR_ERR(ns); 5694 5683 if (!ns) 5695 5684 /* 5696 5685 * We can't set mount point and mnt_ns_id since we don't have a
+4 -6
fs/pidfs.c
··· 608 608 struct user_namespace *user_ns; 609 609 610 610 user_ns = task_cred_xxx(task, user_ns); 611 - if (!ns_ref_get(user_ns)) 612 - break; 613 - ns_common = to_ns_common(user_ns); 611 + if (ns_ref_get(user_ns)) 612 + ns_common = to_ns_common(user_ns); 614 613 } 615 614 #endif 616 615 break; ··· 619 620 struct pid_namespace *pid_ns; 620 621 621 622 pid_ns = task_active_pid_ns(task); 622 - if (!ns_ref_get(pid_ns)) 623 - break; 624 - ns_common = to_ns_common(pid_ns); 623 + if (ns_ref_get(pid_ns)) 624 + ns_common = to_ns_common(pid_ns); 625 625 } 626 626 #endif 627 627 break;
+3
fs/proc/base.c
··· 2128 2128 ino_t ino = 1; 2129 2129 2130 2130 child = try_lookup_noperm(&qname, dir); 2131 + if (IS_ERR(child)) 2132 + goto end_instantiate; 2133 + 2131 2134 if (!child) { 2132 2135 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); 2133 2136 child = d_alloc_parallel(dir, &qname, &wq);
-13
include/linux/fsnotify.h
··· 495 495 fsnotify_dentry(dentry, mask); 496 496 } 497 497 498 - static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, 499 - int error) 500 - { 501 - struct fs_error_report report = { 502 - .error = error, 503 - .inode = inode, 504 - .sb = sb, 505 - }; 506 - 507 - return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, 508 - NULL, NULL, NULL, 0); 509 - } 510 - 511 498 static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) 512 499 { 513 500 fsnotify_mnt(FS_MNT_ATTACH, ns, mnt);
+1 -1
kernel/fork.c
··· 3085 3085 return 0; 3086 3086 3087 3087 /* don't need lock here; in the worst case we'll do useless copy */ 3088 - if (fs->users == 1) 3088 + if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1) 3089 3089 return 0; 3090 3090 3091 3091 *new_fsp = copy_fs_struct(fs);