Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mount: hold namespace_sem across copy in create_new_namespace()

Fix an oversight when creating a new mount namespace. If someone had the
bright idea to make the real rootfs a shared or dependent mount and it
is later copied, the copy will become a peer of the old real rootfs mount
or a dependent mount of it. The namespace semaphore is dropped and we
use LOCK_MOUNT_EXACT to lock the new real root mount. If that fails or
the subsequent do_loopback() fails we rely on the copy of the real root
mount to be cleaned up by path_put(). The problem is that this doesn't
deal with mount propagation and will leave the mounts linked in the
propagation lists.

When creating a new mount namespace create_new_namespace() first
acquires namespace_sem to clone the nullfs root, drops it, then
reacquires it via LOCK_MOUNT_EXACT, which takes inode_lock first to
respect the inode_lock -> namespace_sem lock ordering. This
drop-and-reacquire pattern is fragile and was the source of the
propagation cleanup bug fixed in the preceding commit.

Extend lock_mount_exact() with a copy_mount mode that clones the mount
under the locks atomically. When copy_mount is true, path_overmounted()
is skipped since we're copying the mount, not mounting on top of it -
the nullfs root always has rootfs mounted on top so the check would
always fail. If clone_mnt() fails after get_mountpoint() has pinned the
mountpoint, __unlock_mount() is used to properly unpin the mountpoint
and release both locks.

This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which
takes inode_lock and namespace_sem once and holds them throughout the
clone and subsequent mount operations, eliminating the
drop-and-reacquire pattern entirely.

Reported-by: syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com
Fixes: 9b8a0ba68246 ("mount: add OPEN_TREE_NAMESPACE") # mainline only
Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com
Signed-off-by: Christian Brauner <brauner@kernel.org>

+60 -57
fs/namespace.c
··· 2791 2791 } 2792 2792 2793 2793 static void lock_mount_exact(const struct path *path, 2794 - struct pinned_mountpoint *mp); 2794 + struct pinned_mountpoint *mp, bool copy_mount, 2795 + unsigned int copy_flags); 2795 2796 2796 2797 #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ 2797 2798 struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ ··· 2800 2799 #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) 2801 2800 #define LOCK_MOUNT_EXACT(mp, path) \ 2802 2801 struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ 2803 - lock_mount_exact((path), &mp) 2802 + lock_mount_exact((path), &mp, false, 0) 2803 + #define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \ 2804 + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ 2805 + lock_mount_exact((path), &mp, true, (copy_flags)) 2804 2806 2805 2807 static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) 2806 2808 { ··· 3077 3073 return file; 3078 3074 } 3079 3075 3080 - DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *, 3081 - if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T)) 3082 - 3083 3076 static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) 3084 3077 { 3085 - struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL; 3086 - struct path to_path __free(path_put) = {}; 3087 3078 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 3088 3079 struct user_namespace *user_ns = current_user_ns(); 3089 - struct mount *new_ns_root; 3080 + struct mnt_namespace *new_ns; 3081 + struct mount *new_ns_root, *old_ns_root; 3082 + struct path to_path; 3090 3083 struct mount *mnt; 3091 3084 unsigned int copy_flags = 0; 3092 3085 bool locked = false; ··· 3095 3094 if (IS_ERR(new_ns)) 3096 3095 return ERR_CAST(new_ns); 3097 3096 3098 - scoped_guard(namespace_excl) { 3099 - new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); 3100 - if (IS_ERR(new_ns_root)) 3101 - return ERR_CAST(new_ns_root); 3097 + old_ns_root = 
ns->root; 3098 + to_path.mnt = &old_ns_root->mnt; 3099 + to_path.dentry = old_ns_root->mnt.mnt_root; 3102 3100 3103 - /* 3104 - * If the real rootfs had a locked mount on top of it somewhere 3105 - * in the stack, lock the new mount tree as well so it can't be 3106 - * exposed. 3107 - */ 3108 - mnt = ns->root; 3109 - while (mnt->overmount) { 3110 - mnt = mnt->overmount; 3111 - if (mnt->mnt.mnt_flags & MNT_LOCKED) 3112 - locked = true; 3113 - } 3101 + VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type); 3102 + 3103 + LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags); 3104 + if (IS_ERR(mp.parent)) { 3105 + free_mnt_ns(new_ns); 3106 + return ERR_CAST(mp.parent); 3107 + } 3108 + new_ns_root = mp.parent; 3109 + 3110 + /* 3111 + * If the real rootfs had a locked mount on top of it somewhere 3112 + * in the stack, lock the new mount tree as well so it can't be 3113 + * exposed. 3114 + */ 3115 + mnt = old_ns_root; 3116 + while (mnt->overmount) { 3117 + mnt = mnt->overmount; 3118 + if (mnt->mnt.mnt_flags & MNT_LOCKED) 3119 + locked = true; 3114 3120 } 3115 3121 3116 3122 /* 3117 - * We dropped the namespace semaphore so we can actually lock 3118 - * the copy for mounting. The copied mount isn't attached to any 3119 - * mount namespace and it is thus excluded from any propagation. 3120 - * So realistically we're isolated and the mount can't be 3121 - * overmounted. 3122 - */ 3123 - 3124 - /* Borrow the reference from clone_mnt(). */ 3125 - to_path.mnt = &new_ns_root->mnt; 3126 - to_path.dentry = dget(new_ns_root->mnt.mnt_root); 3127 - 3128 - /* Now lock for actual mounting. */ 3129 - LOCK_MOUNT_EXACT(mp, &to_path); 3130 - if (unlikely(IS_ERR(mp.parent))) 3131 - return ERR_CAST(mp.parent); 3132 - 3133 - /* 3134 - * We don't emulate unshare()ing a mount namespace. We stick to the 3135 - * restrictions of creating detached bind-mounts. It has a lot 3136 - * saner and simpler semantics. 3123 + * We don't emulate unshare()ing a mount namespace. 
We stick 3124 + * to the restrictions of creating detached bind-mounts. It 3125 + * has a lot saner and simpler semantics. 3137 3126 */ 3138 3127 mnt = __do_loopback(path, flags, copy_flags); 3139 - if (IS_ERR(mnt)) 3140 - return ERR_CAST(mnt); 3141 - 3142 3128 scoped_guard(mount_writer) { 3129 + if (IS_ERR(mnt)) { 3130 + emptied_ns = new_ns; 3131 + umount_tree(new_ns_root, 0); 3132 + return ERR_CAST(mnt); 3133 + } 3134 + 3143 3135 if (locked) 3144 3136 mnt->mnt.mnt_flags |= MNT_LOCKED; 3145 3137 /* 3146 - * Now mount the detached tree on top of the copy of the 3147 - * real rootfs we created. 3138 + * now mount the detached tree on top of the copy 3139 + * of the real rootfs we created. 3148 3140 */ 3149 3141 attach_mnt(mnt, new_ns_root, mp.mp); 3150 3142 if (user_ns != ns->user_ns) 3151 3143 lock_mnt_tree(new_ns_root); 3152 3144 } 3153 3145 3154 - /* Add all mounts to the new namespace. */ 3155 - for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) { 3156 - mnt_add_to_ns(new_ns, p); 3146 + for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) { 3147 + mnt_add_to_ns(new_ns, mnt); 3157 3148 new_ns->nr_mounts++; 3158 3149 } 3159 3150 3160 - new_ns->root = real_mount(no_free_ptr(to_path.mnt)); 3151 + new_ns->root = new_ns_root; 3161 3152 ns_tree_add_raw(new_ns); 3162 - return no_free_ptr(new_ns); 3153 + return new_ns; 3163 3154 } 3164 3155 3165 3156 static struct file *open_new_namespace(struct path *path, unsigned int flags) ··· 3833 3840 } 3834 3841 3835 3842 static void lock_mount_exact(const struct path *path, 3836 - struct pinned_mountpoint *mp) 3843 + struct pinned_mountpoint *mp, bool copy_mount, 3844 + unsigned int copy_flags) 3837 3845 { 3838 3846 struct dentry *dentry = path->dentry; 3839 3847 int err; 3848 + 3849 + /* Assert that inode_lock() locked the correct inode. 
*/ 3850 + VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path)); 3840 3851 3841 3852 inode_lock(dentry->d_inode); 3842 3853 namespace_lock(); 3843 3854 if (unlikely(cant_mount(dentry))) 3844 3855 err = -ENOENT; 3845 - else if (path_overmounted(path)) 3856 + else if (!copy_mount && path_overmounted(path)) 3846 3857 err = -EBUSY; 3847 3858 else 3848 3859 err = get_mountpoint(dentry, mp); ··· 3854 3857 namespace_unlock(); 3855 3858 inode_unlock(dentry->d_inode); 3856 3859 mp->parent = ERR_PTR(err); 3857 - } else { 3858 - mp->parent = real_mount(path->mnt); 3860 + return; 3859 3861 } 3862 + 3863 + if (copy_mount) 3864 + mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags); 3865 + else 3866 + mp->parent = real_mount(path->mnt); 3867 + if (unlikely(IS_ERR(mp->parent))) 3868 + __unlock_mount(mp); 3860 3869 } 3861 3870 3862 3871 int finish_automount(struct vfsmount *__m, const struct path *path)