Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-6.11.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs mount query updates from Christian Brauner:
"This contains work to extend the abilities of listmount() and
statmount() and various fixes and cleanups.

Features:

- Allow iterating through mounts via listmount() from newest to
oldest. This makes it possible for mount(8) to keep iterating the
mount table in reverse order so it gets newest mounts first.

- Relax permissions on listmount() and statmount().

It's not necessary to have capabilities in the initial namespace:
it is sufficient to have capabilities in the owning namespace of
the mount namespace we're located in to list unreachable mounts in
that namespace.

- Extend both listmount() and statmount() to list and stat mounts in
foreign mount namespaces.

Currently the only way to iterate over mount entries in mount
namespaces that aren't in the caller's mount namespace is by
crawling through /proc in order to find /proc/<pid>/mountinfo for
the relevant mount namespace.

This is both very clumsy and hugely inefficient. So extend struct
mnt_id_req with a new member that allows to specify the mount
namespace id of the mount namespace we want to look at.

Luckily internally we already have most of the infrastructure for
this so we just need to expose it to userspace. Give userspace a
way to retrieve the id of a mount namespace via statmount() and
through a new nsfs ioctl() on mount namespace file descriptor.

This comes with appropriate selftests.

- Expose mount options through statmount().

Currently if userspace wants to get mount options for a mount via
statmount(), they still have to open /proc/<pid>/mountinfo to
parse the mount options. Simply expose the information through
statmount() directly.

Afterwards it's possible to only rely on statmount() and
listmount() to retrieve all and more information than
/proc/<pid>/mountinfo provides.

This comes with appropriate selftests.

Fixes:

- Avoid copying to userspace under the namespace semaphore in
listmount.

Cleanups:

- Simplify the error handling in listmount by relying on our newly
added cleanup infrastructure.

- Refuse invalid mount ids early for both listmount and statmount"

* tag 'vfs-6.11.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs: reject invalid last mount id early
fs: refuse mnt id requests with invalid ids early
fs: find rootfs mount of the mount namespace
fs: only copy to userspace on success in listmount()
sefltests: extend the statmount test for mount options
fs: use guard for namespace_sem in statmount()
fs: export mount options via statmount()
fs: rename show_mnt_opts -> show_vfsmnt_opts
selftests: add a test for the foreign mnt ns extensions
fs: add an ioctl to get the mnt ns id from nsfs
fs: Allow statmount() in foreign mount namespace
fs: Allow listmount() in foreign mount namespace
fs: export the mount ns id via statmount
fs: keep an index of current mount namespaces
fs: relax permissions for statmount()
listmount: allow listing in reverse order
fs: relax permissions for listmount()
fs: simplify error handling
fs: don't copy to userspace under namespace semaphore
path: add cleanup helper

+926 -123
+2
fs/mount.h
··· 16 16 u64 event; 17 17 unsigned int nr_mounts; /* # of mounts in the namespace */ 18 18 unsigned int pending_mounts; 19 + struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ 20 + refcount_t passive; /* number references not pinning @mounts */ 19 21 } __randomize_layout; 20 22 21 23 struct mnt_pcp {
+374 -76
fs/namespace.c
··· 70 70 static DEFINE_IDA(mnt_group_ida); 71 71 72 72 /* Don't allow confusion with old 32bit mount ID */ 73 - static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32); 73 + #define MNT_UNIQUE_ID_OFFSET (1ULL << 32) 74 + static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET); 74 75 75 76 static struct hlist_head *mount_hashtable __ro_after_init; 76 77 static struct hlist_head *mountpoint_hashtable __ro_after_init; ··· 79 78 static DECLARE_RWSEM(namespace_sem); 80 79 static HLIST_HEAD(unmounted); /* protected by namespace_sem */ 81 80 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ 81 + static DEFINE_RWLOCK(mnt_ns_tree_lock); 82 + static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ 82 83 83 84 struct mount_kattr { 84 85 unsigned int attr_set; ··· 105 102 * tree or hash is modified or when a vfsmount structure is modified. 106 103 */ 107 104 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); 105 + 106 + static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) 107 + { 108 + u64 seq_b = ns->seq; 109 + 110 + if (seq < seq_b) 111 + return -1; 112 + if (seq > seq_b) 113 + return 1; 114 + return 0; 115 + } 116 + 117 + static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) 118 + { 119 + if (!node) 120 + return NULL; 121 + return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); 122 + } 123 + 124 + static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) 125 + { 126 + struct mnt_namespace *ns_a = node_to_mnt_ns(a); 127 + struct mnt_namespace *ns_b = node_to_mnt_ns(b); 128 + u64 seq_a = ns_a->seq; 129 + 130 + return mnt_ns_cmp(seq_a, ns_b) < 0; 131 + } 132 + 133 + static void mnt_ns_tree_add(struct mnt_namespace *ns) 134 + { 135 + guard(write_lock)(&mnt_ns_tree_lock); 136 + rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); 137 + } 138 + 139 + static void mnt_ns_release(struct mnt_namespace *ns) 140 + { 141 + lockdep_assert_not_held(&mnt_ns_tree_lock); 142 + 
143 + /* keep alive for {list,stat}mount() */ 144 + if (refcount_dec_and_test(&ns->passive)) { 145 + put_user_ns(ns->user_ns); 146 + kfree(ns); 147 + } 148 + } 149 + DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) 150 + 151 + static void mnt_ns_tree_remove(struct mnt_namespace *ns) 152 + { 153 + /* remove from global mount namespace list */ 154 + if (!is_anon_ns(ns)) { 155 + guard(write_lock)(&mnt_ns_tree_lock); 156 + rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); 157 + } 158 + 159 + mnt_ns_release(ns); 160 + } 161 + 162 + /* 163 + * Returns the mount namespace which either has the specified id, or has the 164 + * next smallest id afer the specified one. 165 + */ 166 + static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) 167 + { 168 + struct rb_node *node = mnt_ns_tree.rb_node; 169 + struct mnt_namespace *ret = NULL; 170 + 171 + lockdep_assert_held(&mnt_ns_tree_lock); 172 + 173 + while (node) { 174 + struct mnt_namespace *n = node_to_mnt_ns(node); 175 + 176 + if (mnt_ns_id <= n->seq) { 177 + ret = node_to_mnt_ns(node); 178 + if (mnt_ns_id == n->seq) 179 + break; 180 + node = node->rb_left; 181 + } else { 182 + node = node->rb_right; 183 + } 184 + } 185 + return ret; 186 + } 187 + 188 + /* 189 + * Lookup a mount namespace by id and take a passive reference count. Taking a 190 + * passive reference means the mount namespace can be emptied if e.g., the last 191 + * task holding an active reference exits. To access the mounts of the 192 + * namespace the @namespace_sem must first be acquired. If the namespace has 193 + * already shut down before acquiring @namespace_sem, {list,stat}mount() will 194 + * see that the mount rbtree of the namespace is empty. 
195 + */ 196 + static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) 197 + { 198 + struct mnt_namespace *ns; 199 + 200 + guard(read_lock)(&mnt_ns_tree_lock); 201 + ns = mnt_ns_find_id_at(mnt_ns_id); 202 + if (!ns || ns->seq != mnt_ns_id) 203 + return NULL; 204 + 205 + refcount_inc(&ns->passive); 206 + return ns; 207 + } 108 208 109 209 static inline void lock_mount_hash(void) 110 210 { ··· 1549 1443 node = node->rb_left; 1550 1444 } else { 1551 1445 node = node->rb_right; 1446 + } 1447 + } 1448 + return ret; 1449 + } 1450 + 1451 + /* 1452 + * Returns the mount which either has the specified mnt_id, or has the next 1453 + * greater id before the specified one. 1454 + */ 1455 + static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) 1456 + { 1457 + struct rb_node *node = ns->mounts.rb_node; 1458 + struct mount *ret = NULL; 1459 + 1460 + while (node) { 1461 + struct mount *m = node_to_mount(node); 1462 + 1463 + if (mnt_id >= m->mnt_id_unique) { 1464 + ret = node_to_mount(node); 1465 + if (mnt_id == m->mnt_id_unique) 1466 + break; 1467 + node = node->rb_right; 1468 + } else { 1469 + node = node->rb_left; 1552 1470 } 1553 1471 } 1554 1472 return ret; ··· 3829 3699 if (!is_anon_ns(ns)) 3830 3700 ns_free_inum(&ns->ns); 3831 3701 dec_mnt_namespaces(ns->ucounts); 3832 - put_user_ns(ns->user_ns); 3833 - kfree(ns); 3702 + mnt_ns_tree_remove(ns); 3834 3703 } 3835 3704 3836 3705 /* ··· 3868 3739 if (!anon) 3869 3740 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); 3870 3741 refcount_set(&new_ns->ns.count, 1); 3742 + refcount_set(&new_ns->passive, 1); 3871 3743 new_ns->mounts = RB_ROOT; 3744 + RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); 3872 3745 init_waitqueue_head(&new_ns->poll); 3873 3746 new_ns->user_ns = get_user_ns(user_ns); 3874 3747 new_ns->ucounts = ucounts; ··· 3947 3816 while (p->mnt.mnt_root != q->mnt.mnt_root) 3948 3817 p = next_mnt(skip_mnt_tree(p), old); 3949 3818 } 3819 + mnt_ns_tree_add(new_ns); 3950 3820 namespace_unlock(); 3951 
3821 3952 3822 if (rootmnt) ··· 4965 4833 return 0; 4966 4834 } 4967 4835 4836 + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) 4837 + { 4838 + s->sm.mask |= STATMOUNT_MNT_NS_ID; 4839 + s->sm.mnt_ns_id = ns->seq; 4840 + } 4841 + 4842 + static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) 4843 + { 4844 + struct vfsmount *mnt = s->mnt; 4845 + struct super_block *sb = mnt->mnt_sb; 4846 + int err; 4847 + 4848 + if (sb->s_op->show_options) { 4849 + size_t start = seq->count; 4850 + 4851 + err = sb->s_op->show_options(seq, mnt->mnt_root); 4852 + if (err) 4853 + return err; 4854 + 4855 + if (unlikely(seq_has_overflowed(seq))) 4856 + return -EAGAIN; 4857 + 4858 + if (seq->count == start) 4859 + return 0; 4860 + 4861 + /* skip leading comma */ 4862 + memmove(seq->buf + start, seq->buf + start + 1, 4863 + seq->count - start - 1); 4864 + seq->count--; 4865 + } 4866 + 4867 + return 0; 4868 + } 4869 + 4968 4870 static int statmount_string(struct kstatmount *s, u64 flag) 4969 4871 { 4970 4872 int ret; ··· 5018 4852 case STATMOUNT_MNT_POINT: 5019 4853 sm->mnt_point = seq->count; 5020 4854 ret = statmount_mnt_point(s, seq); 4855 + break; 4856 + case STATMOUNT_MNT_OPTS: 4857 + sm->mnt_opts = seq->count; 4858 + ret = statmount_mnt_opts(s, seq); 5021 4859 break; 5022 4860 default: 5023 4861 WARN_ON_ONCE(true); ··· 5063 4893 return 0; 5064 4894 } 5065 4895 5066 - static int do_statmount(struct kstatmount *s) 4896 + static struct mount *listmnt_next(struct mount *curr, bool reverse) 5067 4897 { 5068 - struct mount *m = real_mount(s->mnt); 4898 + struct rb_node *node; 4899 + 4900 + if (reverse) 4901 + node = rb_prev(&curr->mnt_node); 4902 + else 4903 + node = rb_next(&curr->mnt_node); 4904 + 4905 + return node_to_mount(node); 4906 + } 4907 + 4908 + static int grab_requested_root(struct mnt_namespace *ns, struct path *root) 4909 + { 4910 + struct mount *first, *child; 4911 + 4912 + rwsem_assert_held(&namespace_sem); 4913 + 4914 + /* 
We're looking at our own ns, just use get_fs_root. */ 4915 + if (ns == current->nsproxy->mnt_ns) { 4916 + get_fs_root(current->fs, root); 4917 + return 0; 4918 + } 4919 + 4920 + /* 4921 + * We have to find the first mount in our ns and use that, however it 4922 + * may not exist, so handle that properly. 4923 + */ 4924 + if (RB_EMPTY_ROOT(&ns->mounts)) 4925 + return -ENOENT; 4926 + 4927 + first = child = ns->root; 4928 + for (;;) { 4929 + child = listmnt_next(child, false); 4930 + if (!child) 4931 + return -ENOENT; 4932 + if (child->mnt_parent == first) 4933 + break; 4934 + } 4935 + 4936 + root->mnt = mntget(&child->mnt); 4937 + root->dentry = dget(root->mnt->mnt_root); 4938 + return 0; 4939 + } 4940 + 4941 + static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, 4942 + struct mnt_namespace *ns) 4943 + { 4944 + struct path root __free(path_put) = {}; 4945 + struct mount *m; 5069 4946 int err; 4947 + 4948 + /* Has the namespace already been emptied? */ 4949 + if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) 4950 + return -ENOENT; 4951 + 4952 + s->mnt = lookup_mnt_in_ns(mnt_id, ns); 4953 + if (!s->mnt) 4954 + return -ENOENT; 4955 + 4956 + err = grab_requested_root(ns, &root); 4957 + if (err) 4958 + return err; 5070 4959 5071 4960 /* 5072 4961 * Don't trigger audit denials. We just want to determine what 5073 4962 * mounts to show users. 
5074 4963 */ 5075 - if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && 5076 - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) 4964 + m = real_mount(s->mnt); 4965 + if (!is_path_reachable(m, m->mnt.mnt_root, &root) && 4966 + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) 5077 4967 return -EPERM; 5078 4968 5079 4969 err = security_sb_statfs(s->mnt->mnt_root); 5080 4970 if (err) 5081 4971 return err; 5082 4972 4973 + s->root = root; 5083 4974 if (s->mask & STATMOUNT_SB_BASIC) 5084 4975 statmount_sb_basic(s); 5085 4976 ··· 5159 4928 if (!err && s->mask & STATMOUNT_MNT_POINT) 5160 4929 err = statmount_string(s, STATMOUNT_MNT_POINT); 5161 4930 4931 + if (!err && s->mask & STATMOUNT_MNT_OPTS) 4932 + err = statmount_string(s, STATMOUNT_MNT_OPTS); 4933 + 4934 + if (!err && s->mask & STATMOUNT_MNT_NS_ID) 4935 + statmount_mnt_ns_id(s, ns); 4936 + 5162 4937 if (err) 5163 4938 return err; 5164 4939 ··· 5182 4945 return true; 5183 4946 } 5184 4947 4948 + #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ 4949 + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) 4950 + 5185 4951 static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, 5186 4952 struct statmount __user *buf, size_t bufsize, 5187 4953 size_t seq_size) ··· 5196 4956 ks->mask = kreq->param; 5197 4957 ks->buf = buf; 5198 4958 ks->bufsize = bufsize; 5199 - ks->seq.size = seq_size; 5200 - ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); 5201 - if (!ks->seq.buf) 5202 - return -ENOMEM; 4959 + 4960 + if (ks->mask & STATMOUNT_STRING_REQ) { 4961 + if (bufsize == sizeof(ks->sm)) 4962 + return -EOVERFLOW; 4963 + 4964 + ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); 4965 + if (!ks->seq.buf) 4966 + return -ENOMEM; 4967 + 4968 + ks->seq.size = seq_size; 4969 + } 4970 + 5203 4971 return 0; 5204 4972 } 5205 4973 ··· 5217 4969 int ret; 5218 4970 size_t usize; 5219 4971 5220 - BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0); 4972 + BUILD_BUG_ON(sizeof(struct 
mnt_id_req) != MNT_ID_REQ_SIZE_VER1); 5221 4973 5222 4974 ret = get_user(usize, &req->size); 5223 4975 if (ret) ··· 5232 4984 return ret; 5233 4985 if (kreq->spare != 0) 5234 4986 return -EINVAL; 4987 + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ 4988 + if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) 4989 + return -EINVAL; 5235 4990 return 0; 4991 + } 4992 + 4993 + /* 4994 + * If the user requested a specific mount namespace id, look that up and return 4995 + * that, or if not simply grab a passive reference on our mount namespace and 4996 + * return that. 4997 + */ 4998 + static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) 4999 + { 5000 + if (mnt_ns_id) 5001 + return lookup_mnt_ns(mnt_ns_id); 5002 + refcount_inc(&current->nsproxy->mnt_ns->passive); 5003 + return current->nsproxy->mnt_ns; 5236 5004 } 5237 5005 5238 5006 SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, 5239 5007 struct statmount __user *, buf, size_t, bufsize, 5240 5008 unsigned int, flags) 5241 5009 { 5242 - struct vfsmount *mnt; 5010 + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; 5011 + struct kstatmount *ks __free(kfree) = NULL; 5243 5012 struct mnt_id_req kreq; 5244 - struct kstatmount ks; 5245 5013 /* We currently support retrieval of 3 strings. 
*/ 5246 5014 size_t seq_size = 3 * PATH_MAX; 5247 5015 int ret; ··· 5269 5005 if (ret) 5270 5006 return ret; 5271 5007 5008 + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); 5009 + if (!ns) 5010 + return -ENOENT; 5011 + 5012 + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && 5013 + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) 5014 + return -ENOENT; 5015 + 5016 + ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); 5017 + if (!ks) 5018 + return -ENOMEM; 5019 + 5272 5020 retry: 5273 - ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); 5021 + ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size); 5274 5022 if (ret) 5275 5023 return ret; 5276 5024 5277 - down_read(&namespace_sem); 5278 - mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); 5279 - if (!mnt) { 5280 - up_read(&namespace_sem); 5281 - kvfree(ks.seq.buf); 5282 - return -ENOENT; 5283 - } 5284 - 5285 - ks.mnt = mnt; 5286 - get_fs_root(current->fs, &ks.root); 5287 - ret = do_statmount(&ks); 5288 - path_put(&ks.root); 5289 - up_read(&namespace_sem); 5025 + scoped_guard(rwsem_read, &namespace_sem) 5026 + ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); 5290 5027 5291 5028 if (!ret) 5292 - ret = copy_statmount_to_user(&ks); 5293 - kvfree(ks.seq.buf); 5029 + ret = copy_statmount_to_user(ks); 5030 + kvfree(ks->seq.buf); 5294 5031 if (retry_statmount(ret, &seq_size)) 5295 5032 goto retry; 5296 5033 return ret; 5297 5034 } 5298 5035 5299 - static struct mount *listmnt_next(struct mount *curr) 5036 + static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, 5037 + u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, 5038 + bool reverse) 5300 5039 { 5301 - return node_to_mount(rb_next(&curr->mnt_node)); 5302 - } 5303 - 5304 - static ssize_t do_listmount(struct mount *first, struct path *orig, 5305 - u64 mnt_parent_id, u64 __user *mnt_ids, 5306 - size_t nr_mnt_ids, const struct path *root) 5307 - { 5308 - struct mount *r; 5040 + struct path root __free(path_put) = {}; 
5041 + struct path orig; 5042 + struct mount *r, *first; 5309 5043 ssize_t ret; 5044 + 5045 + rwsem_assert_held(&namespace_sem); 5046 + 5047 + ret = grab_requested_root(ns, &root); 5048 + if (ret) 5049 + return ret; 5050 + 5051 + if (mnt_parent_id == LSMT_ROOT) { 5052 + orig = root; 5053 + } else { 5054 + orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); 5055 + if (!orig.mnt) 5056 + return -ENOENT; 5057 + orig.dentry = orig.mnt->mnt_root; 5058 + } 5310 5059 5311 5060 /* 5312 5061 * Don't trigger audit denials. We just want to determine what 5313 5062 * mounts to show users. 5314 5063 */ 5315 - if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) && 5316 - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) 5064 + if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && 5065 + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) 5317 5066 return -EPERM; 5318 5067 5319 - ret = security_sb_statfs(orig->dentry); 5068 + ret = security_sb_statfs(orig.dentry); 5320 5069 if (ret) 5321 5070 return ret; 5322 5071 5323 - for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) { 5072 + if (!last_mnt_id) { 5073 + if (reverse) 5074 + first = node_to_mount(rb_last(&ns->mounts)); 5075 + else 5076 + first = node_to_mount(rb_first(&ns->mounts)); 5077 + } else { 5078 + if (reverse) 5079 + first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); 5080 + else 5081 + first = mnt_find_id_at(ns, last_mnt_id + 1); 5082 + } 5083 + 5084 + for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) { 5324 5085 if (r->mnt_id_unique == mnt_parent_id) 5325 5086 continue; 5326 - if (!is_path_reachable(r, r->mnt.mnt_root, orig)) 5087 + if (!is_path_reachable(r, r->mnt.mnt_root, &orig)) 5327 5088 continue; 5328 - if (put_user(r->mnt_id_unique, mnt_ids)) 5329 - return -EFAULT; 5089 + *mnt_ids = r->mnt_id_unique; 5330 5090 mnt_ids++; 5331 5091 nr_mnt_ids--; 5332 5092 ret++; ··· 5358 5070 return ret; 5359 5071 } 5360 5072 5361 - SYSCALL_DEFINE4(listmount, const struct 
mnt_id_req __user *, req, u64 __user *, 5362 - mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) 5073 + SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, 5074 + u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) 5363 5075 { 5364 - struct mnt_namespace *ns = current->nsproxy->mnt_ns; 5076 + u64 *kmnt_ids __free(kvfree) = NULL; 5077 + const size_t maxcount = 1000000; 5078 + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; 5365 5079 struct mnt_id_req kreq; 5366 - struct mount *first; 5367 - struct path root, orig; 5368 - u64 mnt_parent_id, last_mnt_id; 5369 - const size_t maxcount = (size_t)-1 >> 3; 5080 + u64 last_mnt_id; 5370 5081 ssize_t ret; 5371 5082 5372 - if (flags) 5083 + if (flags & ~LISTMOUNT_REVERSE) 5373 5084 return -EINVAL; 5374 5085 5086 + /* 5087 + * If the mount namespace really has more than 1 million mounts the 5088 + * caller must iterate over the mount namespace (and reconsider their 5089 + * system design...). 5090 + */ 5375 5091 if (unlikely(nr_mnt_ids > maxcount)) 5376 - return -EFAULT; 5092 + return -EOVERFLOW; 5377 5093 5378 5094 if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) 5379 5095 return -EFAULT; ··· 5385 5093 ret = copy_mnt_id_req(req, &kreq); 5386 5094 if (ret) 5387 5095 return ret; 5388 - mnt_parent_id = kreq.mnt_id; 5096 + 5389 5097 last_mnt_id = kreq.param; 5098 + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. 
*/ 5099 + if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) 5100 + return -EINVAL; 5390 5101 5391 - down_read(&namespace_sem); 5392 - get_fs_root(current->fs, &root); 5393 - if (mnt_parent_id == LSMT_ROOT) { 5394 - orig = root; 5395 - } else { 5396 - ret = -ENOENT; 5397 - orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); 5398 - if (!orig.mnt) 5399 - goto err; 5400 - orig.dentry = orig.mnt->mnt_root; 5401 - } 5402 - if (!last_mnt_id) 5403 - first = node_to_mount(rb_first(&ns->mounts)); 5404 - else 5405 - first = mnt_find_id_at(ns, last_mnt_id + 1); 5102 + kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), 5103 + GFP_KERNEL_ACCOUNT); 5104 + if (!kmnt_ids) 5105 + return -ENOMEM; 5406 5106 5407 - ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root); 5408 - err: 5409 - path_put(&root); 5410 - up_read(&namespace_sem); 5107 + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); 5108 + if (!ns) 5109 + return -ENOENT; 5110 + 5111 + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && 5112 + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) 5113 + return -ENOENT; 5114 + 5115 + scoped_guard(rwsem_read, &namespace_sem) 5116 + ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, 5117 + nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); 5118 + if (ret <= 0) 5119 + return ret; 5120 + 5121 + if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) 5122 + return -EFAULT; 5123 + 5411 5124 return ret; 5412 5125 } 5413 - 5414 5126 5415 5127 static void __init init_mount_tree(void) 5416 5128 { ··· 5443 5147 5444 5148 set_fs_pwd(current->fs, &root); 5445 5149 set_fs_root(current->fs, &root); 5150 + 5151 + mnt_ns_tree_add(ns); 5446 5152 } 5447 5153 5448 5154 void __init mnt_init(void)
+14
fs/nsfs.c
··· 12 12 #include <linux/nsfs.h> 13 13 #include <linux/uaccess.h> 14 14 15 + #include "mount.h" 15 16 #include "internal.h" 16 17 17 18 static struct vfsmount *nsfs_mnt; ··· 144 143 argp = (uid_t __user *) arg; 145 144 uid = from_kuid_munged(current_user_ns(), user_ns->owner); 146 145 return put_user(uid, argp); 146 + case NS_GET_MNTNS_ID: { 147 + struct mnt_namespace *mnt_ns; 148 + __u64 __user *idp; 149 + __u64 id; 150 + 151 + if (ns->ops->type != CLONE_NEWNS) 152 + return -EINVAL; 153 + 154 + mnt_ns = container_of(ns, struct mnt_namespace, ns); 155 + idp = (__u64 __user *)arg; 156 + id = mnt_ns->seq; 157 + return put_user(id, idp); 158 + } 147 159 default: 148 160 return -ENOTTY; 149 161 }
+3 -3
fs/proc_namespace.c
··· 61 61 return security_sb_show_options(m, sb); 62 62 } 63 63 64 - static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) 64 + static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt) 65 65 { 66 66 static const struct proc_fs_opts mnt_opts[] = { 67 67 { MNT_NOSUID, ",nosuid" }, ··· 124 124 err = show_sb_opts(m, sb); 125 125 if (err) 126 126 goto out; 127 - show_mnt_opts(m, mnt); 127 + show_vfsmnt_opts(m, mnt); 128 128 if (sb->s_op->show_options) 129 129 err = sb->s_op->show_options(m, mnt_path.dentry); 130 130 seq_puts(m, " 0 0\n"); ··· 153 153 goto out; 154 154 155 155 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); 156 - show_mnt_opts(m, mnt); 156 + show_vfsmnt_opts(m, mnt); 157 157 158 158 /* Tagged fields ("foo:X" or "bar") */ 159 159 if (IS_MNT_SHARED(r))
+9
include/linux/path.h
··· 24 24 *path = (struct path) { }; 25 25 } 26 26 27 + /* 28 + * Cleanup macro for use with __free(path_put). Avoids dereference and 29 + * copying @path unlike DEFINE_FREE(). path_put() will handle the empty 30 + * path correctly just ensure @path is initialized: 31 + * 32 + * struct path path __free(path_put) = {}; 33 + */ 34 + #define __free_path_put path_put 35 + 27 36 #endif /* _LINUX_PATH_H */
+8 -2
include/uapi/linux/mount.h
··· 154 154 */ 155 155 struct statmount { 156 156 __u32 size; /* Total size, including strings */ 157 - __u32 __spare1; 157 + __u32 mnt_opts; /* [str] Mount options of the mount */ 158 158 __u64 mask; /* What results were written */ 159 159 __u32 sb_dev_major; /* Device ID */ 160 160 __u32 sb_dev_minor; ··· 172 172 __u64 propagate_from; /* Propagation from in current namespace */ 173 173 __u32 mnt_root; /* [str] Root of mount relative to root of fs */ 174 174 __u32 mnt_point; /* [str] Mountpoint relative to current root */ 175 - __u64 __spare2[50]; 175 + __u64 mnt_ns_id; /* ID of the mount namespace */ 176 + __u64 __spare2[49]; 176 177 char str[]; /* Variable size part containing strings */ 177 178 }; 178 179 ··· 189 188 __u32 spare; 190 189 __u64 mnt_id; 191 190 __u64 param; 191 + __u64 mnt_ns_id; 192 192 }; 193 193 194 194 /* List of all mnt_id_req versions. */ 195 195 #define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ 196 + #define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ 196 197 197 198 /* 198 199 * @mask bits for statmount(2) ··· 205 202 #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ 206 203 #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ 207 204 #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ 205 + #define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ 206 + #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ 208 207 209 208 /* 210 209 * Special @mnt_id values that can be passed to listmount 211 210 */ 212 211 #define LSMT_ROOT 0xffffffffffffffff /* root mount */ 212 + #define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ 213 213 214 214 #endif /* _UAPI_LINUX_MOUNT_H */
+2
include/uapi/linux/nsfs.h
··· 15 15 #define NS_GET_NSTYPE _IO(NSIO, 0x3) 16 16 /* Get owner UID (in the caller's user namespace) for a user namespace */ 17 17 #define NS_GET_OWNER_UID _IO(NSIO, 0x4) 18 + /* Get the id for a mount namespace */ 19 + #define NS_GET_MNTNS_ID _IO(NSIO, 0x5) 18 20 19 21 #endif /* __LINUX_NSFS_H */
+1 -1
tools/testing/selftests/filesystems/statmount/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-or-later 2 2 3 3 CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) 4 - TEST_GEN_PROGS := statmount_test 4 + TEST_GEN_PROGS := statmount_test statmount_test_ns 5 5 6 6 include ../../lib.mk
+46
tools/testing/selftests/filesystems/statmount/statmount.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef __STATMOUNT_H 4 + #define __STATMOUNT_H 5 + 6 + #include <stdint.h> 7 + #include <linux/mount.h> 8 + #include <asm/unistd.h> 9 + 10 + static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, 11 + struct statmount *buf, size_t bufsize, 12 + unsigned int flags) 13 + { 14 + struct mnt_id_req req = { 15 + .size = MNT_ID_REQ_SIZE_VER0, 16 + .mnt_id = mnt_id, 17 + .param = mask, 18 + }; 19 + 20 + if (mnt_ns_id) { 21 + req.size = MNT_ID_REQ_SIZE_VER1; 22 + req.mnt_ns_id = mnt_ns_id; 23 + } 24 + 25 + return syscall(__NR_statmount, &req, buf, bufsize, flags); 26 + } 27 + 28 + static ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, 29 + uint64_t last_mnt_id, uint64_t list[], size_t num, 30 + unsigned int flags) 31 + { 32 + struct mnt_id_req req = { 33 + .size = MNT_ID_REQ_SIZE_VER0, 34 + .mnt_id = mnt_id, 35 + .param = last_mnt_id, 36 + }; 37 + 38 + if (mnt_ns_id) { 39 + req.size = MNT_ID_REQ_SIZE_VER1; 40 + req.mnt_ns_id = mnt_ns_id; 41 + } 42 + 43 + return syscall(__NR_listmount, &req, list, num, flags); 44 + } 45 + 46 + #endif /* __STATMOUNT_H */
+103 -41
tools/testing/selftests/filesystems/statmount/statmount_test.c
··· 4 4 5 5 #include <assert.h> 6 6 #include <stddef.h> 7 - #include <stdint.h> 8 7 #include <sched.h> 9 8 #include <fcntl.h> 10 9 #include <sys/param.h> 11 10 #include <sys/mount.h> 12 11 #include <sys/stat.h> 13 12 #include <sys/statfs.h> 14 - #include <linux/mount.h> 15 13 #include <linux/stat.h> 16 - #include <asm/unistd.h> 17 14 15 + #include "statmount.h" 18 16 #include "../../kselftest.h" 19 17 20 18 static const char *const known_fs[] = { ··· 34 36 "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", 35 37 "zonefs", NULL }; 36 38 37 - static int statmount(uint64_t mnt_id, uint64_t mask, struct statmount *buf, 38 - size_t bufsize, unsigned int flags) 39 - { 40 - struct mnt_id_req req = { 41 - .size = MNT_ID_REQ_SIZE_VER0, 42 - .mnt_id = mnt_id, 43 - .param = mask, 44 - }; 45 - 46 - return syscall(__NR_statmount, &req, buf, bufsize, flags); 47 - } 48 - 49 39 static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) 50 40 { 51 41 size_t bufsize = 1 << 15; ··· 42 56 int ret; 43 57 44 58 for (;;) { 45 - ret = statmount(mnt_id, mask, tmp, bufsize, flags); 59 + ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags); 46 60 if (ret != -1) 47 61 break; 48 62 if (tofree) ··· 107 121 static int orig_root; 108 122 static uint64_t root_id, parent_id; 109 123 static uint32_t old_root_id, old_parent_id; 110 - 124 + static FILE *f_mountinfo; 111 125 112 126 static void cleanup_namespace(void) 113 127 { ··· 132 146 uid_t uid = getuid(); 133 147 gid_t gid = getgid(); 134 148 135 - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); 149 + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); 136 150 if (ret == -1) 137 151 ksft_exit_fail_msg("unsharing mountns and userns: %s\n", 138 152 strerror(errno)); ··· 142 156 write_file("/proc/self/setgroups", "deny"); 143 157 sprintf(buf, "0 %d 1", gid); 144 158 write_file("/proc/self/gid_map", buf); 159 + 160 + f_mountinfo = fopen("/proc/self/mountinfo", "re"); 161 + if (!f_mountinfo) 162 + 
ksft_exit_fail_msg("failed to open mountinfo: %s\n", 163 + strerror(errno)); 145 164 146 165 ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); 147 166 if (ret == -1) ··· 207 216 return 0; 208 217 } 209 218 210 - static ssize_t listmount(uint64_t mnt_id, uint64_t last_mnt_id, 211 - uint64_t list[], size_t num, unsigned int flags) 212 - { 213 - struct mnt_id_req req = { 214 - .size = MNT_ID_REQ_SIZE_VER0, 215 - .mnt_id = mnt_id, 216 - .param = last_mnt_id, 217 - }; 218 - 219 - return syscall(__NR_listmount, &req, list, num, flags); 220 - } 221 - 222 219 static void test_listmount_empty_root(void) 223 220 { 224 221 ssize_t res; 225 222 const unsigned int size = 32; 226 223 uint64_t list[size]; 227 224 228 - res = listmount(LSMT_ROOT, 0, list, size, 0); 225 + res = listmount(LSMT_ROOT, 0, 0, list, size, 0); 229 226 if (res == -1) { 230 227 ksft_test_result_fail("listmount: %s\n", strerror(errno)); 231 228 return; ··· 238 259 struct statmount sm; 239 260 int ret; 240 261 241 - ret = statmount(root_id, 0, &sm, sizeof(sm), 0); 262 + ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0); 242 263 if (ret == -1) { 243 264 ksft_test_result_fail("statmount zero mask: %s\n", 244 265 strerror(errno)); ··· 264 285 int ret; 265 286 uint64_t mask = STATMOUNT_MNT_BASIC; 266 287 267 - ret = statmount(root_id, mask, &sm, sizeof(sm), 0); 288 + ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); 268 289 if (ret == -1) { 269 290 ksft_test_result_fail("statmount mnt basic: %s\n", 270 291 strerror(errno)); ··· 324 345 struct statx sx; 325 346 struct statfs sf; 326 347 327 - ret = statmount(root_id, mask, &sm, sizeof(sm), 0); 348 + ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); 328 349 if (ret == -1) { 329 350 ksft_test_result_fail("statmount sb basic: %s\n", 330 351 strerror(errno)); ··· 449 470 free(sm); 450 471 } 451 472 473 + static void test_statmount_mnt_opts(void) 474 + { 475 + struct statmount *sm; 476 + const char *statmount_opts; 477 + char *line = NULL; 478 + size_t 
len = 0; 479 + 480 + sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, 481 + 0); 482 + if (!sm) { 483 + ksft_test_result_fail("statmount mnt opts: %s\n", 484 + strerror(errno)); 485 + return; 486 + } 487 + 488 + while (getline(&line, &len, f_mountinfo) != -1) { 489 + int i; 490 + char *p, *p2; 491 + unsigned int old_mnt_id; 492 + 493 + old_mnt_id = atoi(line); 494 + if (old_mnt_id != sm->mnt_id_old) 495 + continue; 496 + 497 + for (p = line, i = 0; p && i < 5; i++) 498 + p = strchr(p + 1, ' '); 499 + if (!p) 500 + continue; 501 + 502 + p2 = strchr(p + 1, ' '); 503 + if (!p2) 504 + continue; 505 + *p2 = '\0'; 506 + p = strchr(p2 + 1, '-'); 507 + if (!p) 508 + continue; 509 + for (p++, i = 0; p && i < 2; i++) 510 + p = strchr(p + 1, ' '); 511 + if (!p) 512 + continue; 513 + p++; 514 + 515 + /* skip generic superblock options */ 516 + if (strncmp(p, "ro", 2) == 0) 517 + p += 2; 518 + else if (strncmp(p, "rw", 2) == 0) 519 + p += 2; 520 + if (*p == ',') 521 + p++; 522 + if (strncmp(p, "sync", 4) == 0) 523 + p += 4; 524 + if (*p == ',') 525 + p++; 526 + if (strncmp(p, "dirsync", 7) == 0) 527 + p += 7; 528 + if (*p == ',') 529 + p++; 530 + if (strncmp(p, "lazytime", 8) == 0) 531 + p += 8; 532 + if (*p == ',') 533 + p++; 534 + p2 = strrchr(p, '\n'); 535 + if (p2) 536 + *p2 = '\0'; 537 + 538 + statmount_opts = sm->str + sm->mnt_opts; 539 + if (strcmp(statmount_opts, p) != 0) 540 + ksft_test_result_fail( 541 + "unexpected mount options: '%s' != '%s'\n", 542 + statmount_opts, p); 543 + else 544 + ksft_test_result_pass("statmount mount options\n"); 545 + free(sm); 546 + free(line); 547 + return; 548 + } 549 + 550 + ksft_test_result_fail("didnt't find mount entry\n"); 551 + free(sm); 552 + free(line); 553 + } 554 + 452 555 static void test_statmount_string(uint64_t mask, size_t off, const char *name) 453 556 { 454 557 struct statmount *sm; ··· 567 506 exactsize = sm->size; 568 507 shortsize = sizeof(*sm) + i; 569 508 570 - ret = statmount(root_id, mask, 
sm, exactsize, 0); 509 + ret = statmount(root_id, 0, mask, sm, exactsize, 0); 571 510 if (ret == -1) { 572 511 ksft_test_result_fail("statmount exact size: %s\n", 573 512 strerror(errno)); 574 513 goto out; 575 514 } 576 515 errno = 0; 577 - ret = statmount(root_id, mask, sm, shortsize, 0); 516 + ret = statmount(root_id, 0, mask, sm, shortsize, 0); 578 517 if (ret != -1 || errno != EOVERFLOW) { 579 518 ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", 580 519 strerror(errno)); ··· 602 541 if (res == -1) 603 542 return; 604 543 605 - num = res = listmount(LSMT_ROOT, 0, list, size, 0); 544 + num = res = listmount(LSMT_ROOT, 0, 0, list, size, 0); 606 545 if (res == -1) { 607 546 ksft_test_result_fail("listmount: %s\n", strerror(errno)); 608 547 return; ··· 614 553 } 615 554 616 555 for (i = 0; i < size - step;) { 617 - res = listmount(LSMT_ROOT, i ? list2[i - 1] : 0, list2 + i, step, 0); 556 + res = listmount(LSMT_ROOT, 0, i ? list2[i - 1] : 0, list2 + i, step, 0); 618 557 if (res == -1) 619 558 ksft_test_result_fail("short listmount: %s\n", 620 559 strerror(errno)); ··· 646 585 int ret; 647 586 uint64_t all_mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | 648 587 STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT | 649 - STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE; 588 + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE | STATMOUNT_MNT_NS_ID; 650 589 651 590 ksft_print_header(); 652 591 653 - ret = statmount(0, 0, NULL, 0, 0); 592 + ret = statmount(0, 0, 0, NULL, 0, 0); 654 593 assert(ret == -1); 655 594 if (errno == ENOSYS) 656 595 ksft_exit_skip("statmount() syscall not supported\n"); 657 596 658 597 setup_namespace(); 659 598 660 - ksft_set_plan(14); 599 + ksft_set_plan(15); 661 600 test_listmount_empty_root(); 662 601 test_statmount_zero_mask(); 663 602 test_statmount_mnt_basic(); ··· 665 604 test_statmount_mnt_root(); 666 605 test_statmount_mnt_point(); 667 606 test_statmount_fs_type(); 607 + test_statmount_mnt_opts(); 668 608 
test_statmount_string(STATMOUNT_MNT_ROOT, str_off(mnt_root), "mount root"); 669 609 test_statmount_string(STATMOUNT_MNT_POINT, str_off(mnt_point), "mount point"); 670 610 test_statmount_string(STATMOUNT_FS_TYPE, str_off(fs_type), "fs type");
+364
tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + #define _GNU_SOURCE 4 + 5 + #include <assert.h> 6 + #include <fcntl.h> 7 + #include <limits.h> 8 + #include <sched.h> 9 + #include <stdlib.h> 10 + #include <sys/mount.h> 11 + #include <sys/stat.h> 12 + #include <sys/wait.h> 13 + #include <linux/nsfs.h> 14 + #include <linux/stat.h> 15 + 16 + #include "statmount.h" 17 + #include "../../kselftest.h" 18 + 19 + #define NSID_PASS 0 20 + #define NSID_FAIL 1 21 + #define NSID_SKIP 2 22 + #define NSID_ERROR 3 23 + 24 + static void handle_result(int ret, const char *testname) 25 + { 26 + if (ret == NSID_PASS) 27 + ksft_test_result_pass("%s\n", testname); 28 + else if (ret == NSID_FAIL) 29 + ksft_test_result_fail("%s\n", testname); 30 + else if (ret == NSID_ERROR) 31 + ksft_exit_fail_msg("%s\n", testname); 32 + else 33 + ksft_test_result_skip("%s\n", testname); 34 + } 35 + 36 + static inline int wait_for_pid(pid_t pid) 37 + { 38 + int status, ret; 39 + 40 + again: 41 + ret = waitpid(pid, &status, 0); 42 + if (ret == -1) { 43 + if (errno == EINTR) 44 + goto again; 45 + 46 + ksft_print_msg("waitpid returned -1, errno=%d\n", errno); 47 + return -1; 48 + } 49 + 50 + if (!WIFEXITED(status)) { 51 + ksft_print_msg( 52 + "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n", 53 + WIFSIGNALED(status), WTERMSIG(status)); 54 + return -1; 55 + } 56 + 57 + ret = WEXITSTATUS(status); 58 + return ret; 59 + } 60 + 61 + static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id) 62 + { 63 + int fd = open(mnt_ns, O_RDONLY); 64 + 65 + if (fd < 0) { 66 + ksft_print_msg("failed to open for ns %s: %s\n", 67 + mnt_ns, strerror(errno)); 68 + sleep(60); 69 + return NSID_ERROR; 70 + } 71 + 72 + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) { 73 + ksft_print_msg("failed to get the nsid for ns %s: %s\n", 74 + mnt_ns, strerror(errno)); 75 + return NSID_ERROR; 76 + } 77 + close(fd); 78 + return NSID_PASS; 79 + } 80 + 81 + static int get_mnt_id(const char *path, uint64_t *mnt_id) 82 + { 83 + 
struct statx sx; 84 + int ret; 85 + 86 + ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx); 87 + if (ret == -1) { 88 + ksft_print_msg("retrieving unique mount ID for %s: %s\n", path, 89 + strerror(errno)); 90 + return NSID_ERROR; 91 + } 92 + 93 + if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) { 94 + ksft_print_msg("no unique mount ID available for %s\n", path); 95 + return NSID_ERROR; 96 + } 97 + 98 + *mnt_id = sx.stx_mnt_id; 99 + return NSID_PASS; 100 + } 101 + 102 + static int write_file(const char *path, const char *val) 103 + { 104 + int fd = open(path, O_WRONLY); 105 + size_t len = strlen(val); 106 + int ret; 107 + 108 + if (fd == -1) { 109 + ksft_print_msg("opening %s for write: %s\n", path, strerror(errno)); 110 + return NSID_ERROR; 111 + } 112 + 113 + ret = write(fd, val, len); 114 + if (ret == -1) { 115 + ksft_print_msg("writing to %s: %s\n", path, strerror(errno)); 116 + return NSID_ERROR; 117 + } 118 + if (ret != len) { 119 + ksft_print_msg("short write to %s\n", path); 120 + return NSID_ERROR; 121 + } 122 + 123 + ret = close(fd); 124 + if (ret == -1) { 125 + ksft_print_msg("closing %s\n", path); 126 + return NSID_ERROR; 127 + } 128 + 129 + return NSID_PASS; 130 + } 131 + 132 + static int setup_namespace(void) 133 + { 134 + int ret; 135 + char buf[32]; 136 + uid_t uid = getuid(); 137 + gid_t gid = getgid(); 138 + 139 + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); 140 + if (ret == -1) 141 + ksft_exit_fail_msg("unsharing mountns and userns: %s\n", 142 + strerror(errno)); 143 + 144 + sprintf(buf, "0 %d 1", uid); 145 + ret = write_file("/proc/self/uid_map", buf); 146 + if (ret != NSID_PASS) 147 + return ret; 148 + ret = write_file("/proc/self/setgroups", "deny"); 149 + if (ret != NSID_PASS) 150 + return ret; 151 + sprintf(buf, "0 %d 1", gid); 152 + ret = write_file("/proc/self/gid_map", buf); 153 + if (ret != NSID_PASS) 154 + return ret; 155 + 156 + ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); 157 + if (ret == -1) { 158 + 
ksft_print_msg("making mount tree private: %s\n", 159 + strerror(errno)); 160 + return NSID_ERROR; 161 + } 162 + 163 + return NSID_PASS; 164 + } 165 + 166 + static int _test_statmount_mnt_ns_id(void) 167 + { 168 + struct statmount sm; 169 + uint64_t mnt_ns_id; 170 + uint64_t root_id; 171 + int ret; 172 + 173 + ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id); 174 + if (ret != NSID_PASS) 175 + return ret; 176 + 177 + ret = get_mnt_id("/", &root_id); 178 + if (ret != NSID_PASS) 179 + return ret; 180 + 181 + ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); 182 + if (ret == -1) { 183 + ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); 184 + return NSID_ERROR; 185 + } 186 + 187 + if (sm.size != sizeof(sm)) { 188 + ksft_print_msg("unexpected size: %u != %u\n", sm.size, 189 + (uint32_t)sizeof(sm)); 190 + return NSID_FAIL; 191 + } 192 + if (sm.mask != STATMOUNT_MNT_NS_ID) { 193 + ksft_print_msg("statmount mnt ns id unavailable\n"); 194 + return NSID_SKIP; 195 + } 196 + 197 + if (sm.mnt_ns_id != mnt_ns_id) { 198 + ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n", 199 + (unsigned long long)sm.mnt_ns_id, 200 + (unsigned long long)mnt_ns_id); 201 + return NSID_FAIL; 202 + } 203 + 204 + return NSID_PASS; 205 + } 206 + 207 + static void test_statmount_mnt_ns_id(void) 208 + { 209 + pid_t pid; 210 + int ret; 211 + 212 + pid = fork(); 213 + if (pid < 0) 214 + ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno)); 215 + 216 + /* We're the original pid, wait for the result. 
*/ 217 + if (pid != 0) { 218 + ret = wait_for_pid(pid); 219 + handle_result(ret, "test statmount ns id"); 220 + return; 221 + } 222 + 223 + ret = setup_namespace(); 224 + if (ret != NSID_PASS) 225 + exit(ret); 226 + ret = _test_statmount_mnt_ns_id(); 227 + exit(ret); 228 + } 229 + 230 + static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts) 231 + { 232 + uint64_t list[256]; 233 + uint64_t mnt_ns_id; 234 + uint64_t nr_mounts; 235 + char buf[256]; 236 + int ret; 237 + 238 + /* Get the mount ns id for our child. */ 239 + snprintf(buf, sizeof(buf), "/proc/%lu/ns/mnt", (unsigned long)pid); 240 + ret = get_mnt_ns_id(buf, &mnt_ns_id); 241 + 242 + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); 243 + if (nr_mounts == (uint64_t)-1) { 244 + ksft_print_msg("listmount: %s\n", strerror(errno)); 245 + return NSID_ERROR; 246 + } 247 + 248 + if (nr_mounts != child_nr_mounts) { 249 + ksft_print_msg("listmount results is %zi != %zi\n", nr_mounts, 250 + child_nr_mounts); 251 + return NSID_FAIL; 252 + } 253 + 254 + /* Validate that all of our entries match our mnt_ns_id. 
*/ 255 + for (int i = 0; i < nr_mounts; i++) { 256 + struct statmount sm; 257 + 258 + ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm, 259 + sizeof(sm), 0); 260 + if (ret < 0) { 261 + ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); 262 + return NSID_ERROR; 263 + } 264 + 265 + if (sm.mask != STATMOUNT_MNT_NS_ID) { 266 + ksft_print_msg("statmount mnt ns id unavailable\n"); 267 + return NSID_SKIP; 268 + } 269 + 270 + if (sm.mnt_ns_id != mnt_ns_id) { 271 + ksft_print_msg("listmount gave us the wrong ns id: 0x%llx != 0x%llx\n", 272 + (unsigned long long)sm.mnt_ns_id, 273 + (unsigned long long)mnt_ns_id); 274 + return NSID_FAIL; 275 + } 276 + } 277 + 278 + return NSID_PASS; 279 + } 280 + 281 + static void test_listmount_ns(void) 282 + { 283 + uint64_t nr_mounts; 284 + char pval; 285 + int child_ready_pipe[2]; 286 + int parent_ready_pipe[2]; 287 + pid_t pid; 288 + int ret, child_ret; 289 + 290 + if (pipe(child_ready_pipe) < 0) 291 + ksft_exit_fail_msg("failed to create the child pipe: %s\n", 292 + strerror(errno)); 293 + if (pipe(parent_ready_pipe) < 0) 294 + ksft_exit_fail_msg("failed to create the parent pipe: %s\n", 295 + strerror(errno)); 296 + 297 + pid = fork(); 298 + if (pid < 0) 299 + ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno)); 300 + 301 + if (pid == 0) { 302 + char cval; 303 + uint64_t list[256]; 304 + 305 + close(child_ready_pipe[0]); 306 + close(parent_ready_pipe[1]); 307 + 308 + ret = setup_namespace(); 309 + if (ret != NSID_PASS) 310 + exit(ret); 311 + 312 + nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 256, 0); 313 + if (nr_mounts == (uint64_t)-1) { 314 + ksft_print_msg("listmount: %s\n", strerror(errno)); 315 + exit(NSID_FAIL); 316 + } 317 + 318 + /* 319 + * Tell our parent how many mounts we have, and then wait for it 320 + * to tell us we're done. 
321 + */ 322 + write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)); 323 + read(parent_ready_pipe[0], &cval, sizeof(cval)); 324 + exit(NSID_PASS); 325 + } 326 + 327 + close(child_ready_pipe[1]); 328 + close(parent_ready_pipe[0]); 329 + 330 + /* Wait until the child has created everything. */ 331 + if (read(child_ready_pipe[0], &nr_mounts, sizeof(nr_mounts)) != 332 + sizeof(nr_mounts)) 333 + ret = NSID_ERROR; 334 + 335 + ret = validate_external_listmount(pid, nr_mounts); 336 + 337 + if (write(parent_ready_pipe[1], &pval, sizeof(pval)) != sizeof(pval)) 338 + ret = NSID_ERROR; 339 + 340 + child_ret = wait_for_pid(pid); 341 + if (child_ret != NSID_PASS) 342 + ret = child_ret; 343 + handle_result(ret, "test listmount ns id"); 344 + } 345 + 346 + int main(void) 347 + { 348 + int ret; 349 + 350 + ksft_print_header(); 351 + ret = statmount(0, 0, 0, NULL, 0, 0); 352 + assert(ret == -1); 353 + if (errno == ENOSYS) 354 + ksft_exit_skip("statmount() syscall not supported\n"); 355 + 356 + ksft_set_plan(2); 357 + test_statmount_mnt_ns_id(); 358 + test_listmount_ns(); 359 + 360 + if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) 361 + ksft_exit_fail(); 362 + else 363 + ksft_exit_pass(); 364 + }