Merge tag 'namespace-6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

+3 -5

block/blk-integrity.c

··· 58 58 int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, 59 59 struct logical_block_metadata_cap __user *argp) 60 60 { 61 - struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); 61 + struct blk_integrity *bi; 62 62 struct logical_block_metadata_cap meta_cap = {}; 63 63 size_t usize = _IOC_SIZE(cmd); 64 64 65 - if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) || 66 - _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) || 67 - _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) || 68 - _IOC_SIZE(cmd) < LBMD_SIZE_VER0) 65 + if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0)) 69 66 return -ENOIOCTLCMD; 70 67 68 + bi = blk_get_integrity(bdev->bd_disk); 71 69 if (!bi) 72 70 goto out; 73 71

+6

fs/fhandle.c

··· 11 11 #include <linux/personality.h> 12 12 #include <linux/uaccess.h> 13 13 #include <linux/compat.h> 14 + #include <linux/nsfs.h> 14 15 #include "internal.h" 15 16 #include "mount.h" 16 17 ··· 187 186 188 187 if (fd == FD_PIDFS_ROOT) { 189 188 pidfs_get_root(root); 189 + return 0; 190 + } 191 + 192 + if (fd == FD_NSFS_ROOT) { 193 + nsfs_get_root(root); 190 194 return 0; 191 195 } 192 196

+1

fs/internal.h

··· 355 355 int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 356 356 struct iattr *attr); 357 357 void pidfs_get_root(struct path *path); 358 + void nsfs_get_root(struct path *path);

+3 -9

fs/mount.h

··· 17 17 }; 18 18 struct user_namespace *user_ns; 19 19 struct ucounts *ucounts; 20 - u64 seq; /* Sequence number to prevent loops */ 21 - union { 22 - wait_queue_head_t poll; 23 - struct rcu_head mnt_ns_rcu; 24 - }; 20 + wait_queue_head_t poll; 25 21 u64 seq_origin; /* Sequence number of origin mount namespace */ 26 22 u64 event; 27 23 #ifdef CONFIG_FSNOTIFY ··· 26 30 #endif 27 31 unsigned int nr_mounts; /* # of mounts in the namespace */ 28 32 unsigned int pending_mounts; 29 - struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ 30 - struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ 31 33 refcount_t passive; /* number references not pinning @mounts */ 32 34 } __randomize_layout; 33 35 ··· 143 149 144 150 static inline void get_mnt_ns(struct mnt_namespace *ns) 145 151 { 146 - refcount_inc(&ns->ns.count); 152 + ns_ref_inc(ns); 147 153 } 148 154 149 155 extern seqlock_t mount_lock; ··· 167 173 168 174 static inline bool is_anon_ns(struct mnt_namespace *ns) 169 175 { 170 - return ns->seq == 0; 176 + return ns->ns.ns_id == 0; 171 177 } 172 178 173 179 static inline bool anon_ns_root(const struct mount *m)

+60 -136

fs/namespace.c

··· 33 33 #include <linux/shmem_fs.h> 34 34 #include <linux/mnt_idmapping.h> 35 35 #include <linux/pidfs.h> 36 + #include <linux/nstree.h> 36 37 37 38 #include "pnode.h" 38 39 #include "internal.h" ··· 90 89 static HLIST_HEAD(unmounted); /* protected by namespace_sem */ 91 90 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ 92 91 static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ 93 - static DEFINE_SEQLOCK(mnt_ns_tree_lock); 94 92 95 93 #ifdef CONFIG_FSNOTIFY 96 94 LIST_HEAD(notify_list); /* protected by namespace_sem */ 97 95 #endif 98 - static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ 99 - static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ 100 96 101 97 enum mount_kattr_flags_t { 102 98 MOUNT_KATTR_RECURSE = (1 << 0), ··· 126 128 127 129 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) 128 130 { 131 + struct ns_common *ns; 132 + 129 133 if (!node) 130 134 return NULL; 131 - return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); 132 - } 133 - 134 - static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) 135 - { 136 - struct mnt_namespace *ns_a = node_to_mnt_ns(a); 137 - struct mnt_namespace *ns_b = node_to_mnt_ns(b); 138 - u64 seq_a = ns_a->seq; 139 - u64 seq_b = ns_b->seq; 140 - 141 - if (seq_a < seq_b) 142 - return -1; 143 - if (seq_a > seq_b) 144 - return 1; 145 - return 0; 146 - } 147 - 148 - static inline void mnt_ns_tree_write_lock(void) 149 - { 150 - write_seqlock(&mnt_ns_tree_lock); 151 - } 152 - 153 - static inline void mnt_ns_tree_write_unlock(void) 154 - { 155 - write_sequnlock(&mnt_ns_tree_lock); 156 - } 157 - 158 - static void mnt_ns_tree_add(struct mnt_namespace *ns) 159 - { 160 - struct rb_node *node, *prev; 161 - 162 - mnt_ns_tree_write_lock(); 163 - node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); 164 - /* 165 - * If there's no previous entry simply add it after the 166 - * head and if 
there is add it after the previous entry. 167 - */ 168 - prev = rb_prev(&ns->mnt_ns_tree_node); 169 - if (!prev) 170 - list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); 171 - else 172 - list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); 173 - mnt_ns_tree_write_unlock(); 174 - 175 - WARN_ON_ONCE(node); 135 + ns = rb_entry(node, struct ns_common, ns_tree_node); 136 + return container_of(ns, struct mnt_namespace, ns); 176 137 } 177 138 178 139 static void mnt_ns_release(struct mnt_namespace *ns) ··· 147 190 148 191 static void mnt_ns_release_rcu(struct rcu_head *rcu) 149 192 { 150 - mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); 193 + mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); 151 194 } 152 195 153 196 static void mnt_ns_tree_remove(struct mnt_namespace *ns) 154 197 { 155 198 /* remove from global mount namespace list */ 156 - if (!is_anon_ns(ns)) { 157 - mnt_ns_tree_write_lock(); 158 - rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); 159 - list_bidir_del_rcu(&ns->mnt_ns_list); 160 - mnt_ns_tree_write_unlock(); 161 - } 199 + if (ns_tree_active(ns)) 200 + ns_tree_remove(ns); 162 201 163 - call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); 164 - } 165 - 166 - static int mnt_ns_find(const void *key, const struct rb_node *node) 167 - { 168 - const u64 mnt_ns_id = *(u64 *)key; 169 - const struct mnt_namespace *ns = node_to_mnt_ns(node); 170 - 171 - if (mnt_ns_id < ns->seq) 172 - return -1; 173 - if (mnt_ns_id > ns->seq) 174 - return 1; 175 - return 0; 202 + call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); 176 203 } 177 204 178 205 /* ··· 175 234 */ 176 235 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) 177 236 { 178 - struct mnt_namespace *ns; 179 - struct rb_node *node; 180 - unsigned int seq; 237 + struct mnt_namespace *mnt_ns; 238 + struct ns_common *ns; 181 239 182 240 guard(rcu)(); 183 - do { 184 - seq = read_seqbegin(&mnt_ns_tree_lock); 185 - node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); 186 - 
if (node) 187 - break; 188 - } while (read_seqretry(&mnt_ns_tree_lock, seq)); 189 - 190 - if (!node) 241 + ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); 242 + if (!ns) 191 243 return NULL; 192 244 193 245 /* 194 246 * The last reference count is put with RCU delay so we can 195 247 * unconditionally acquire a reference here. 196 248 */ 197 - ns = node_to_mnt_ns(node); 198 - refcount_inc(&ns->passive); 199 - return ns; 249 + mnt_ns = container_of(ns, struct mnt_namespace, ns); 250 + refcount_inc(&mnt_ns->passive); 251 + return mnt_ns; 200 252 } 201 253 202 254 static inline void lock_mount_hash(void) ··· 960 1026 return false; 961 1027 962 1028 seq = mnt->mnt_ns->seq_origin; 963 - return !seq || (seq == current->nsproxy->mnt_ns->seq); 1029 + return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); 964 1030 } 965 1031 966 1032 /* ··· 2095 2161 2096 2162 struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) 2097 2163 { 2164 + struct ns_common *ns; 2165 + 2098 2166 guard(rcu)(); 2099 2167 2100 2168 for (;;) { 2101 - struct list_head *list; 2169 + ns = ns_tree_adjoined_rcu(mntns, previous); 2170 + if (IS_ERR(ns)) 2171 + return ERR_CAST(ns); 2102 2172 2103 - if (previous) 2104 - list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); 2105 - else 2106 - list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); 2107 - if (list_is_head(list, &mnt_ns_list)) 2108 - return ERR_PTR(-ENOENT); 2109 - 2110 - mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); 2173 + mntns = to_mnt_ns(ns); 2111 2174 2112 2175 /* 2113 2176 * The last passive reference count is put with RCU ··· 2119 2188 * the mount namespace and it might already be on its 2120 2189 * deathbed. 
2121 2190 */ 2122 - if (!refcount_inc_not_zero(&mntns->ns.count)) 2191 + if (!ns_ref_get(mntns)) 2123 2192 continue; 2124 2193 2125 2194 return mntns; ··· 2144 2213 if (!mnt_ns) 2145 2214 return false; 2146 2215 2147 - return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 2216 + return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; 2148 2217 } 2149 2218 2150 2219 struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, ··· 3020 3089 if (is_anon_ns(src_mnt_ns)) 3021 3090 ns->seq_origin = src_mnt_ns->seq_origin; 3022 3091 else 3023 - ns->seq_origin = src_mnt_ns->seq; 3092 + ns->seq_origin = src_mnt_ns->ns.ns_id; 3024 3093 } 3025 3094 3026 3095 mnt = __do_loopback(path, recursive); ··· 4093 4162 static void free_mnt_ns(struct mnt_namespace *ns) 4094 4163 { 4095 4164 if (!is_anon_ns(ns)) 4096 - ns_free_inum(&ns->ns); 4165 + ns_common_free(ns); 4097 4166 dec_mnt_namespaces(ns->ucounts); 4098 4167 mnt_ns_tree_remove(ns); 4099 4168 } 4100 - 4101 - /* 4102 - * Assign a sequence number so we can detect when we attempt to bind 4103 - * mount a reference to an older mount namespace into the current 4104 - * mount namespace, preventing reference counting loops. A 64bit 4105 - * number incrementing at 10Ghz will take 12,427 years to wrap which 4106 - * is effectively never, so we can ignore the possibility. 
4107 - */ 4108 - static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); 4109 4169 4110 4170 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) 4111 4171 { ··· 4113 4191 dec_mnt_namespaces(ucounts); 4114 4192 return ERR_PTR(-ENOMEM); 4115 4193 } 4116 - if (!anon) { 4117 - ret = ns_alloc_inum(&new_ns->ns); 4118 - if (ret) { 4119 - kfree(new_ns); 4120 - dec_mnt_namespaces(ucounts); 4121 - return ERR_PTR(ret); 4122 - } 4194 + 4195 + if (anon) 4196 + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); 4197 + else 4198 + ret = ns_common_init(new_ns); 4199 + if (ret) { 4200 + kfree(new_ns); 4201 + dec_mnt_namespaces(ucounts); 4202 + return ERR_PTR(ret); 4123 4203 } 4124 - new_ns->ns.ops = &mntns_operations; 4125 4204 if (!anon) 4126 - new_ns->seq = atomic64_inc_return(&mnt_ns_seq); 4127 - refcount_set(&new_ns->ns.count, 1); 4205 + ns_tree_gen_id(&new_ns->ns); 4128 4206 refcount_set(&new_ns->passive, 1); 4129 4207 new_ns->mounts = RB_ROOT; 4130 - INIT_LIST_HEAD(&new_ns->mnt_ns_list); 4131 - RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); 4132 4208 init_waitqueue_head(&new_ns->poll); 4133 4209 new_ns->user_ns = get_user_ns(user_ns); 4134 4210 new_ns->ucounts = ucounts; ··· 4165 4245 new = copy_tree(old, old->mnt.mnt_root, copy_flags); 4166 4246 if (IS_ERR(new)) { 4167 4247 namespace_unlock(); 4168 - ns_free_inum(&new_ns->ns); 4248 + ns_common_free(ns); 4169 4249 dec_mnt_namespaces(new_ns->ucounts); 4170 4250 mnt_ns_release(new_ns); 4171 4251 return ERR_CAST(new); ··· 4212 4292 if (pwdmnt) 4213 4293 mntput(pwdmnt); 4214 4294 4215 - mnt_ns_tree_add(new_ns); 4295 + ns_tree_add_raw(new_ns); 4216 4296 return new_ns; 4217 4297 } 4218 4298 ··· 4938 5018 return -EINVAL; 4939 5019 4940 5020 ns = get_proc_ns(file_inode(fd_file(f))); 4941 - if (ns->ops->type != CLONE_NEWUSER) 5021 + if (ns->ns_type != CLONE_NEWUSER) 4942 5022 return -EINVAL; 4943 5023 4944 5024 /* ··· 5331 5411 static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) 5332 
5412 { 5333 5413 s->sm.mask |= STATMOUNT_MNT_NS_ID; 5334 - s->sm.mnt_ns_id = ns->seq; 5414 + s->sm.mnt_ns_id = ns->ns.ns_id; 5335 5415 } 5336 5416 5337 5417 static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) ··· 5838 5918 return ERR_PTR(-EINVAL); 5839 5919 5840 5920 ns = get_proc_ns(file_inode(fd_file(f))); 5841 - if (ns->ops->type != CLONE_NEWNS) 5921 + if (ns->ns_type != CLONE_NEWNS) 5842 5922 return ERR_PTR(-EINVAL); 5843 5923 5844 5924 mnt_ns = to_mnt_ns(ns); ··· 6051 6131 return ret; 6052 6132 } 6053 6133 6134 + struct mnt_namespace init_mnt_ns = { 6135 + .ns.inum = ns_init_inum(&init_mnt_ns), 6136 + .ns.ops = &mntns_operations, 6137 + .user_ns = &init_user_ns, 6138 + .ns.__ns_ref = REFCOUNT_INIT(1), 6139 + .ns.ns_type = ns_common_type(&init_mnt_ns), 6140 + .passive = REFCOUNT_INIT(1), 6141 + .mounts = RB_ROOT, 6142 + .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), 6143 + }; 6144 + 6054 6145 static void __init init_mount_tree(void) 6055 6146 { 6056 6147 struct vfsmount *mnt; 6057 6148 struct mount *m; 6058 - struct mnt_namespace *ns; 6059 6149 struct path root; 6060 6150 6061 6151 mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); 6062 6152 if (IS_ERR(mnt)) 6063 6153 panic("Can't create rootfs"); 6064 6154 6065 - ns = alloc_mnt_ns(&init_user_ns, true); 6066 - if (IS_ERR(ns)) 6067 - panic("Can't allocate initial namespace"); 6068 - ns->seq = atomic64_inc_return(&mnt_ns_seq); 6069 - ns->ns.inum = PROC_MNT_INIT_INO; 6070 6155 m = real_mount(mnt); 6071 - ns->root = m; 6072 - ns->nr_mounts = 1; 6073 - mnt_add_to_ns(ns, m); 6074 - init_task.nsproxy->mnt_ns = ns; 6075 - get_mnt_ns(ns); 6156 + init_mnt_ns.root = m; 6157 + init_mnt_ns.nr_mounts = 1; 6158 + mnt_add_to_ns(&init_mnt_ns, m); 6159 + init_task.nsproxy->mnt_ns = &init_mnt_ns; 6160 + get_mnt_ns(&init_mnt_ns); 6076 6161 6077 6162 root.mnt = mnt; 6078 6163 root.dentry = mnt->mnt_root; ··· 6085 6160 set_fs_pwd(current->fs, &root); 6086 6161 
set_fs_root(current->fs, &root); 6087 6162 6088 - mnt_ns_tree_add(ns); 6163 + ns_tree_add(&init_mnt_ns); 6089 6164 } 6090 6165 6091 6166 void __init mnt_init(void) ··· 6125 6200 6126 6201 void put_mnt_ns(struct mnt_namespace *ns) 6127 6202 { 6128 - if (!refcount_dec_and_test(&ns->ns.count)) 6203 + if (!ns_ref_put(ns)) 6129 6204 return; 6130 6205 namespace_lock(); 6131 6206 emptied_ns = ns; ··· 6374 6449 6375 6450 const struct proc_ns_operations mntns_operations = { 6376 6451 .name = "mnt", 6377 - .type = CLONE_NEWNS, 6378 6452 .get = mntns_get, 6379 6453 .put = mntns_put, 6380 6454 .install = mntns_install,

+191 -20

fs/nsfs.c

··· 13 13 #include <linux/nsfs.h> 14 14 #include <linux/uaccess.h> 15 15 #include <linux/mnt_namespace.h> 16 + #include <linux/ipc_namespace.h> 17 + #include <linux/time_namespace.h> 18 + #include <linux/utsname.h> 19 + #include <linux/exportfs.h> 20 + #include <linux/nstree.h> 21 + #include <net/net_namespace.h> 16 22 17 23 #include "mount.h" 18 24 #include "internal.h" 19 25 20 26 static struct vfsmount *nsfs_mnt; 27 + 28 + static struct path nsfs_root_path = {}; 29 + 30 + void nsfs_get_root(struct path *path) 31 + { 32 + *path = nsfs_root_path; 33 + path_get(path); 34 + } 21 35 22 36 static long ns_ioctl(struct file *filp, unsigned int ioctl, 23 37 unsigned long arg); ··· 153 139 * the size value will be set to the size the kernel knows about. 154 140 */ 155 141 kinfo->size = min(usize, sizeof(*kinfo)); 156 - kinfo->mnt_ns_id = mnt_ns->seq; 142 + kinfo->mnt_ns_id = mnt_ns->ns.ns_id; 157 143 kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); 158 144 /* Subtract the root mount of the mount namespace. */ 159 145 if (kinfo->nr_mounts) ··· 177 163 case NS_GET_TGID_FROM_PIDNS: 178 164 case NS_GET_PID_IN_PIDNS: 179 165 case NS_GET_TGID_IN_PIDNS: 180 - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); 166 + case NS_GET_ID: 167 + return true; 181 168 } 182 169 183 170 /* Extensible ioctls require some extra handling. 
*/ 184 171 switch (_IOC_NR(cmd)) { 185 172 case _IOC_NR(NS_MNT_GET_INFO): 173 + return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0); 186 174 case _IOC_NR(NS_MNT_GET_NEXT): 175 + return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0); 187 176 case _IOC_NR(NS_MNT_GET_PREV): 188 - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); 177 + return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0); 189 178 } 190 179 191 180 return false; ··· 219 202 return -EINVAL; 220 203 return open_related_ns(ns, ns->ops->get_parent); 221 204 case NS_GET_NSTYPE: 222 - return ns->ops->type; 205 + return ns->ns_type; 223 206 case NS_GET_OWNER_UID: 224 - if (ns->ops->type != CLONE_NEWUSER) 207 + if (ns->ns_type != CLONE_NEWUSER) 225 208 return -EINVAL; 226 209 user_ns = container_of(ns, struct user_namespace, ns); 227 210 argp = (uid_t __user *) arg; 228 211 uid = from_kuid_munged(current_user_ns(), user_ns->owner); 229 212 return put_user(uid, argp); 230 - case NS_GET_MNTNS_ID: { 231 - __u64 __user *idp; 232 - __u64 id; 233 - 234 - if (ns->ops->type != CLONE_NEWNS) 235 - return -EINVAL; 236 - 237 - mnt_ns = container_of(ns, struct mnt_namespace, ns); 238 - idp = (__u64 __user *)arg; 239 - id = mnt_ns->seq; 240 - return put_user(id, idp); 241 - } 242 213 case NS_GET_PID_FROM_PIDNS: 243 214 fallthrough; 244 215 case NS_GET_TGID_FROM_PIDNS: ··· 234 229 case NS_GET_PID_IN_PIDNS: 235 230 fallthrough; 236 231 case NS_GET_TGID_IN_PIDNS: { 237 - if (ns->ops->type != CLONE_NEWPID) 232 + if (ns->ns_type != CLONE_NEWPID) 238 233 return -EINVAL; 239 234 240 235 ret = -ESRCH; ··· 272 267 ret = -ESRCH; 273 268 return ret; 274 269 } 270 + case NS_GET_MNTNS_ID: 271 + if (ns->ns_type != CLONE_NEWNS) 272 + return -EINVAL; 273 + fallthrough; 274 + case NS_GET_ID: { 275 + __u64 __user *idp; 276 + __u64 id; 277 + 278 + idp = (__u64 __user *)arg; 279 + id = ns->ns_id; 280 + return put_user(id, idp); 281 + } 275 282 } 276 283 277 284 /* extensible ioctls */ 
··· 293 276 struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; 294 277 size_t usize = _IOC_SIZE(ioctl); 295 278 296 - if (ns->ops->type != CLONE_NEWNS) 279 + if (ns->ns_type != CLONE_NEWNS) 297 280 return -EINVAL; 298 281 299 282 if (!uinfo) ··· 314 297 struct file *f __free(fput) = NULL; 315 298 size_t usize = _IOC_SIZE(ioctl); 316 299 317 - if (ns->ops->type != CLONE_NEWNS) 300 + if (ns->ns_type != CLONE_NEWNS) 318 301 return -EINVAL; 319 302 320 303 if (usize < MNT_NS_INFO_SIZE_VER0) ··· 432 415 .put_data = nsfs_put_data, 433 416 }; 434 417 418 + #define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) 419 + #define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) 420 + 421 + static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, 422 + struct inode *parent) 423 + { 424 + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; 425 + struct ns_common *ns = inode->i_private; 426 + int len = *max_len; 427 + 428 + if (parent) 429 + return FILEID_INVALID; 430 + 431 + if (len < NSFS_FID_SIZE_U32_VER0) { 432 + *max_len = NSFS_FID_SIZE_U32_LATEST; 433 + return FILEID_INVALID; 434 + } else if (len > NSFS_FID_SIZE_U32_LATEST) { 435 + *max_len = NSFS_FID_SIZE_U32_LATEST; 436 + } 437 + 438 + fid->ns_id = ns->ns_id; 439 + fid->ns_type = ns->ns_type; 440 + fid->ns_inum = inode->i_ino; 441 + return FILEID_NSFS; 442 + } 443 + 444 + static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, 445 + int fh_len, int fh_type) 446 + { 447 + struct path path __free(path_put) = {}; 448 + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; 449 + struct user_namespace *owning_ns = NULL; 450 + struct ns_common *ns; 451 + int ret; 452 + 453 + if (fh_len < NSFS_FID_SIZE_U32_VER0) 454 + return NULL; 455 + 456 + /* Check that any trailing bytes are zero. 
*/ 457 + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && 458 + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, 459 + fh_len - NSFS_FID_SIZE_U32_LATEST)) 460 + return NULL; 461 + 462 + switch (fh_type) { 463 + case FILEID_NSFS: 464 + break; 465 + default: 466 + return NULL; 467 + } 468 + 469 + scoped_guard(rcu) { 470 + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); 471 + if (!ns) 472 + return NULL; 473 + 474 + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); 475 + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); 476 + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); 477 + 478 + if (!__ns_ref_get(ns)) 479 + return NULL; 480 + } 481 + 482 + switch (ns->ns_type) { 483 + #ifdef CONFIG_CGROUPS 484 + case CLONE_NEWCGROUP: 485 + if (!current_in_namespace(to_cg_ns(ns))) 486 + owning_ns = to_cg_ns(ns)->user_ns; 487 + break; 488 + #endif 489 + #ifdef CONFIG_IPC_NS 490 + case CLONE_NEWIPC: 491 + if (!current_in_namespace(to_ipc_ns(ns))) 492 + owning_ns = to_ipc_ns(ns)->user_ns; 493 + break; 494 + #endif 495 + case CLONE_NEWNS: 496 + if (!current_in_namespace(to_mnt_ns(ns))) 497 + owning_ns = to_mnt_ns(ns)->user_ns; 498 + break; 499 + #ifdef CONFIG_NET_NS 500 + case CLONE_NEWNET: 501 + if (!current_in_namespace(to_net_ns(ns))) 502 + owning_ns = to_net_ns(ns)->user_ns; 503 + break; 504 + #endif 505 + #ifdef CONFIG_PID_NS 506 + case CLONE_NEWPID: 507 + if (!current_in_namespace(to_pid_ns(ns))) { 508 + owning_ns = to_pid_ns(ns)->user_ns; 509 + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { 510 + ns->ops->put(ns); 511 + return ERR_PTR(-EPERM); 512 + } 513 + break; 514 + #endif 515 + #ifdef CONFIG_TIME_NS 516 + case CLONE_NEWTIME: 517 + if (!current_in_namespace(to_time_ns(ns))) 518 + owning_ns = to_time_ns(ns)->user_ns; 519 + break; 520 + #endif 521 + #ifdef CONFIG_USER_NS 522 + case CLONE_NEWUSER: 523 + if (!current_in_namespace(to_user_ns(ns))) 524 + owning_ns = to_user_ns(ns); 525 + break; 526 + #endif 527 + #ifdef CONFIG_UTS_NS 528 + case CLONE_NEWUTS: 529 + if 
(!current_in_namespace(to_uts_ns(ns))) 530 + owning_ns = to_uts_ns(ns)->user_ns; 531 + break; 532 + #endif 533 + default: 534 + return ERR_PTR(-EOPNOTSUPP); 535 + } 536 + 537 + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { 538 + ns->ops->put(ns); 539 + return ERR_PTR(-EPERM); 540 + } 541 + 542 + /* path_from_stashed() unconditionally consumes the reference. */ 543 + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); 544 + if (ret) 545 + return ERR_PTR(ret); 546 + 547 + return no_free_ptr(path.dentry); 548 + } 549 + 550 + static int nsfs_export_permission(struct handle_to_path_ctx *ctx, 551 + unsigned int oflags) 552 + { 553 + /* nsfs_fh_to_dentry() performs all permission checks. */ 554 + return 0; 555 + } 556 + 557 + static struct file *nsfs_export_open(struct path *path, unsigned int oflags) 558 + { 559 + return file_open_root(path, "", oflags, 0); 560 + } 561 + 562 + static const struct export_operations nsfs_export_operations = { 563 + .encode_fh = nsfs_encode_fh, 564 + .fh_to_dentry = nsfs_fh_to_dentry, 565 + .open = nsfs_export_open, 566 + .permission = nsfs_export_permission, 567 + }; 568 + 435 569 static int nsfs_init_fs_context(struct fs_context *fc) 436 570 { 437 571 struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); 438 572 if (!ctx) 439 573 return -ENOMEM; 440 574 ctx->ops = &nsfs_ops; 575 + ctx->eops = &nsfs_export_operations; 441 576 ctx->dops = &ns_dentry_operations; 442 577 fc->s_fs_info = (void *)&nsfs_stashed_ops; 443 578 return 0; ··· 607 438 if (IS_ERR(nsfs_mnt)) 608 439 panic("can't set nsfs up\n"); 609 440 nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; 441 + nsfs_root_path.mnt = nsfs_mnt; 442 + nsfs_root_path.dentry = nsfs_mnt->mnt_root; 610 443 }

+1 -1

fs/pidfs.c

··· 440 440 * erroneously mistook the file descriptor for a pidfd. 441 441 * This is not perfect but will catch most cases. 442 442 */ 443 - return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); 443 + return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); 444 444 } 445 445 446 446 return false;

+1 -1

fs/proc/root.c

··· 143 143 if (!proc_ns_file(ns_filp)) 144 144 return invalfc(fc, "pidns argument is not an nsfs file"); 145 145 ns = get_proc_ns(file_inode(ns_filp)); 146 - if (ns->ops->type != CLONE_NEWPID) 146 + if (ns->ns_type != CLONE_NEWPID) 147 147 return invalfc(fc, "pidns argument is not a pidns file"); 148 148 target = container_of(ns, struct pid_namespace, ns); 149 149

+1 -46

include/linux/cgroup.h

··· 27 27 #include <linux/kernel_stat.h> 28 28 29 29 #include <linux/cgroup-defs.h> 30 + #include <linux/cgroup_namespace.h> 30 31 31 32 struct kernel_clone_args; 32 33 ··· 783 782 static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} 784 783 785 784 #endif /* CONFIG_CGROUP_DATA */ 786 - 787 - struct cgroup_namespace { 788 - struct ns_common ns; 789 - struct user_namespace *user_ns; 790 - struct ucounts *ucounts; 791 - struct css_set *root_cset; 792 - }; 793 - 794 - extern struct cgroup_namespace init_cgroup_ns; 795 - 796 - #ifdef CONFIG_CGROUPS 797 - 798 - void free_cgroup_ns(struct cgroup_namespace *ns); 799 - 800 - struct cgroup_namespace *copy_cgroup_ns(u64 flags, 801 - struct user_namespace *user_ns, 802 - struct cgroup_namespace *old_ns); 803 - 804 - int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, 805 - struct cgroup_namespace *ns); 806 - 807 - static inline void get_cgroup_ns(struct cgroup_namespace *ns) 808 - { 809 - refcount_inc(&ns->ns.count); 810 - } 811 - 812 - static inline void put_cgroup_ns(struct cgroup_namespace *ns) 813 - { 814 - if (refcount_dec_and_test(&ns->ns.count)) 815 - free_cgroup_ns(ns); 816 - } 817 - 818 - #else /* !CONFIG_CGROUPS */ 819 - 820 - static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } 821 - static inline struct cgroup_namespace * 822 - copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, 823 - struct cgroup_namespace *old_ns) 824 - { 825 - return old_ns; 826 - } 827 - 828 - static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } 829 - static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } 830 - 831 - #endif /* !CONFIG_CGROUPS */ 832 785 833 786 #ifdef CONFIG_CGROUPS 834 787

+58

include/linux/cgroup_namespace.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_CGROUP_NAMESPACE_H 3 + #define _LINUX_CGROUP_NAMESPACE_H 4 + 5 + #include <linux/ns_common.h> 6 + 7 + struct cgroup_namespace { 8 + struct ns_common ns; 9 + struct user_namespace *user_ns; 10 + struct ucounts *ucounts; 11 + struct css_set *root_cset; 12 + }; 13 + 14 + extern struct cgroup_namespace init_cgroup_ns; 15 + 16 + #ifdef CONFIG_CGROUPS 17 + 18 + static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) 19 + { 20 + return container_of(ns, struct cgroup_namespace, ns); 21 + } 22 + 23 + void free_cgroup_ns(struct cgroup_namespace *ns); 24 + 25 + struct cgroup_namespace *copy_cgroup_ns(u64 flags, 26 + struct user_namespace *user_ns, 27 + struct cgroup_namespace *old_ns); 28 + 29 + int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, 30 + struct cgroup_namespace *ns); 31 + 32 + static inline void get_cgroup_ns(struct cgroup_namespace *ns) 33 + { 34 + ns_ref_inc(ns); 35 + } 36 + 37 + static inline void put_cgroup_ns(struct cgroup_namespace *ns) 38 + { 39 + if (ns_ref_put(ns)) 40 + free_cgroup_ns(ns); 41 + } 42 + 43 + #else /* !CONFIG_CGROUPS */ 44 + 45 + static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } 46 + static inline struct cgroup_namespace * 47 + copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, 48 + struct cgroup_namespace *old_ns) 49 + { 50 + return old_ns; 51 + } 52 + 53 + static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } 54 + static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } 55 + 56 + #endif /* !CONFIG_CGROUPS */ 57 + 58 + #endif /* _LINUX_CGROUP_NAMESPACE_H */

+6

include/linux/exportfs.h

··· 123 123 FILEID_BCACHEFS_WITH_PARENT = 0xb2, 124 124 125 125 /* 126 + * 127 + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. 128 + */ 129 + FILEID_NSFS = 0xf1, 130 + 131 + /* 126 132 * 64 bit unique kernfs id 127 133 */ 128 134 FILEID_KERNFS = 0xfe,

+14

include/linux/fs.h

··· 4018 4018 4019 4019 int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); 4020 4020 4021 + static inline bool extensible_ioctl_valid(unsigned int cmd_a, 4022 + unsigned int cmd_b, size_t min_size) 4023 + { 4024 + if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) 4025 + return false; 4026 + if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) 4027 + return false; 4028 + if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) 4029 + return false; 4030 + if (_IOC_SIZE(cmd_a) < min_size) 4031 + return false; 4032 + return true; 4033 + } 4034 + 4021 4035 #endif /* _LINUX_FS_H */

+7 -2

include/linux/ipc_namespace.h

··· 129 129 #endif 130 130 131 131 #if defined(CONFIG_IPC_NS) 132 + static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) 133 + { 134 + return container_of(ns, struct ipc_namespace, ns); 135 + } 136 + 132 137 extern struct ipc_namespace *copy_ipcs(u64 flags, 133 138 struct user_namespace *user_ns, struct ipc_namespace *ns); 134 139 135 140 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) 136 141 { 137 142 if (ns) 138 - refcount_inc(&ns->ns.count); 143 + ns_ref_inc(ns); 139 144 return ns; 140 145 } 141 146 142 147 static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) 143 148 { 144 149 if (ns) { 145 - if (refcount_inc_not_zero(&ns->ns.count)) 150 + if (ns_ref_get(ns)) 146 151 return ns; 147 152 } 148 153

+2

include/linux/mnt_namespace.h

··· 11 11 struct user_namespace; 12 12 struct ns_common; 13 13 14 + extern struct mnt_namespace init_mnt_ns; 15 + 14 16 extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, 15 17 struct user_namespace *, struct fs_struct *); 16 18 extern void put_mnt_ns(struct mnt_namespace *ns);

+138 -1

include/linux/ns_common.h

··· 3 3 #define _LINUX_NS_COMMON_H 4 4 5 5 #include <linux/refcount.h> 6 + #include <linux/rbtree.h> 7 + #include <uapi/linux/sched.h> 6 8 7 9 struct proc_ns_operations; 8 10 11 + struct cgroup_namespace; 12 + struct ipc_namespace; 13 + struct mnt_namespace; 14 + struct net; 15 + struct pid_namespace; 16 + struct time_namespace; 17 + struct user_namespace; 18 + struct uts_namespace; 19 + 20 + extern struct cgroup_namespace init_cgroup_ns; 21 + extern struct ipc_namespace init_ipc_ns; 22 + extern struct mnt_namespace init_mnt_ns; 23 + extern struct net init_net; 24 + extern struct pid_namespace init_pid_ns; 25 + extern struct time_namespace init_time_ns; 26 + extern struct user_namespace init_user_ns; 27 + extern struct uts_namespace init_uts_ns; 28 + 29 + extern const struct proc_ns_operations netns_operations; 30 + extern const struct proc_ns_operations utsns_operations; 31 + extern const struct proc_ns_operations ipcns_operations; 32 + extern const struct proc_ns_operations pidns_operations; 33 + extern const struct proc_ns_operations pidns_for_children_operations; 34 + extern const struct proc_ns_operations userns_operations; 35 + extern const struct proc_ns_operations mntns_operations; 36 + extern const struct proc_ns_operations cgroupns_operations; 37 + extern const struct proc_ns_operations timens_operations; 38 + extern const struct proc_ns_operations timens_for_children_operations; 39 + 9 40 struct ns_common { 41 + u32 ns_type; 10 42 struct dentry *stashed; 11 43 const struct proc_ns_operations *ops; 12 44 unsigned int inum; 13 - refcount_t count; 45 + refcount_t __ns_ref; /* do not use directly */ 46 + union { 47 + struct { 48 + u64 ns_id; 49 + struct rb_node ns_tree_node; 50 + struct list_head ns_list_node; 51 + }; 52 + struct rcu_head ns_rcu; 53 + }; 14 54 }; 55 + 56 + int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); 57 + void __ns_common_free(struct ns_common *ns); 58 + 59 + #define 
to_ns_common(__ns) \ 60 + _Generic((__ns), \ 61 + struct cgroup_namespace *: &(__ns)->ns, \ 62 + const struct cgroup_namespace *: &(__ns)->ns, \ 63 + struct ipc_namespace *: &(__ns)->ns, \ 64 + const struct ipc_namespace *: &(__ns)->ns, \ 65 + struct mnt_namespace *: &(__ns)->ns, \ 66 + const struct mnt_namespace *: &(__ns)->ns, \ 67 + struct net *: &(__ns)->ns, \ 68 + const struct net *: &(__ns)->ns, \ 69 + struct pid_namespace *: &(__ns)->ns, \ 70 + const struct pid_namespace *: &(__ns)->ns, \ 71 + struct time_namespace *: &(__ns)->ns, \ 72 + const struct time_namespace *: &(__ns)->ns, \ 73 + struct user_namespace *: &(__ns)->ns, \ 74 + const struct user_namespace *: &(__ns)->ns, \ 75 + struct uts_namespace *: &(__ns)->ns, \ 76 + const struct uts_namespace *: &(__ns)->ns) 77 + 78 + #define ns_init_inum(__ns) \ 79 + _Generic((__ns), \ 80 + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ 81 + struct ipc_namespace *: IPC_NS_INIT_INO, \ 82 + struct mnt_namespace *: MNT_NS_INIT_INO, \ 83 + struct net *: NET_NS_INIT_INO, \ 84 + struct pid_namespace *: PID_NS_INIT_INO, \ 85 + struct time_namespace *: TIME_NS_INIT_INO, \ 86 + struct user_namespace *: USER_NS_INIT_INO, \ 87 + struct uts_namespace *: UTS_NS_INIT_INO) 88 + 89 + #define ns_init_ns(__ns) \ 90 + _Generic((__ns), \ 91 + struct cgroup_namespace *: &init_cgroup_ns, \ 92 + struct ipc_namespace *: &init_ipc_ns, \ 93 + struct mnt_namespace *: &init_mnt_ns, \ 94 + struct net *: &init_net, \ 95 + struct pid_namespace *: &init_pid_ns, \ 96 + struct time_namespace *: &init_time_ns, \ 97 + struct user_namespace *: &init_user_ns, \ 98 + struct uts_namespace *: &init_uts_ns) 99 + 100 + #define to_ns_operations(__ns) \ 101 + _Generic((__ns), \ 102 + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ 103 + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ 104 + struct mnt_namespace *: &mntns_operations, \ 105 + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? 
&netns_operations : NULL), \ 106 + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ 107 + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ 108 + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ 109 + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) 110 + 111 + #define ns_common_type(__ns) \ 112 + _Generic((__ns), \ 113 + struct cgroup_namespace *: CLONE_NEWCGROUP, \ 114 + struct ipc_namespace *: CLONE_NEWIPC, \ 115 + struct mnt_namespace *: CLONE_NEWNS, \ 116 + struct net *: CLONE_NEWNET, \ 117 + struct pid_namespace *: CLONE_NEWPID, \ 118 + struct time_namespace *: CLONE_NEWTIME, \ 119 + struct user_namespace *: CLONE_NEWUSER, \ 120 + struct uts_namespace *: CLONE_NEWUTS) 121 + 122 + #define ns_common_init(__ns) \ 123 + __ns_common_init(to_ns_common(__ns), \ 124 + ns_common_type(__ns), \ 125 + to_ns_operations(__ns), \ 126 + (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) 127 + 128 + #define ns_common_init_inum(__ns, __inum) \ 129 + __ns_common_init(to_ns_common(__ns), \ 130 + ns_common_type(__ns), \ 131 + to_ns_operations(__ns), \ 132 + __inum) 133 + 134 + #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) 135 + 136 + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) 137 + { 138 + return refcount_dec_and_test(&ns->__ns_ref); 139 + } 140 + 141 + static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) 142 + { 143 + return refcount_inc_not_zero(&ns->__ns_ref); 144 + } 145 + 146 + #define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) 147 + #define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) 148 + #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) 149 + #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) 150 + #define ns_ref_put_and_lock(__ns, __lock) \ 151 + refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) 15 152 16 153 #endif

+40

include/linux/nsfs.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 3 + 4 + #ifndef _LINUX_NSFS_H 5 + #define _LINUX_NSFS_H 6 + 7 + #include <linux/ns_common.h> 8 + #include <linux/cred.h> 9 + #include <linux/pid_namespace.h> 10 + 11 + struct path; 12 + struct task_struct; 13 + struct proc_ns_operations; 14 + 15 + int ns_get_path(struct path *path, struct task_struct *task, 16 + const struct proc_ns_operations *ns_ops); 17 + typedef struct ns_common *ns_get_path_helper_t(void *); 18 + int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, 19 + void *private_data); 20 + 21 + bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); 22 + 23 + int ns_get_name(char *buf, size_t size, struct task_struct *task, 24 + const struct proc_ns_operations *ns_ops); 25 + void nsfs_init(void); 26 + 27 + #define __current_namespace_from_type(__ns) \ 28 + _Generic((__ns), \ 29 + struct cgroup_namespace *: current->nsproxy->cgroup_ns, \ 30 + struct ipc_namespace *: current->nsproxy->ipc_ns, \ 31 + struct net *: current->nsproxy->net_ns, \ 32 + struct pid_namespace *: task_active_pid_ns(current), \ 33 + struct mnt_namespace *: current->nsproxy->mnt_ns, \ 34 + struct time_namespace *: current->nsproxy->time_ns, \ 35 + struct user_namespace *: current_user_ns(), \ 36 + struct uts_namespace *: current->nsproxy->uts_ns) 37 + 38 + #define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) 39 + 40 + #endif /* _LINUX_NSFS_H */

-11

include/linux/nsproxy.h

··· 42 42 }; 43 43 extern struct nsproxy init_nsproxy; 44 44 45 - #define to_ns_common(__ns) \ 46 - _Generic((__ns), \ 47 - struct cgroup_namespace *: &(__ns->ns), \ 48 - struct ipc_namespace *: &(__ns->ns), \ 49 - struct net *: &(__ns->ns), \ 50 - struct pid_namespace *: &(__ns->ns), \ 51 - struct mnt_namespace *: &(__ns->ns), \ 52 - struct time_namespace *: &(__ns->ns), \ 53 - struct user_namespace *: &(__ns->ns), \ 54 - struct uts_namespace *: &(__ns->ns)) 55 - 56 45 /* 57 46 * A structure to encompass all bits needed to install 58 47 * a partial or complete new set of namespaces.

+78

include/linux/nstree.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_NSTREE_H 3 + #define _LINUX_NSTREE_H 4 + 5 + #include <linux/ns_common.h> 6 + #include <linux/nsproxy.h> 7 + #include <linux/rbtree.h> 8 + #include <linux/seqlock.h> 9 + #include <linux/rculist.h> 10 + #include <linux/cookie.h> 11 + 12 + extern struct ns_tree cgroup_ns_tree; 13 + extern struct ns_tree ipc_ns_tree; 14 + extern struct ns_tree mnt_ns_tree; 15 + extern struct ns_tree net_ns_tree; 16 + extern struct ns_tree pid_ns_tree; 17 + extern struct ns_tree time_ns_tree; 18 + extern struct ns_tree user_ns_tree; 19 + extern struct ns_tree uts_ns_tree; 20 + 21 + #define to_ns_tree(__ns) \ 22 + _Generic((__ns), \ 23 + struct cgroup_namespace *: &(cgroup_ns_tree), \ 24 + struct ipc_namespace *: &(ipc_ns_tree), \ 25 + struct net *: &(net_ns_tree), \ 26 + struct pid_namespace *: &(pid_ns_tree), \ 27 + struct mnt_namespace *: &(mnt_ns_tree), \ 28 + struct time_namespace *: &(time_ns_tree), \ 29 + struct user_namespace *: &(user_ns_tree), \ 30 + struct uts_namespace *: &(uts_ns_tree)) 31 + 32 + u64 ns_tree_gen_id(struct ns_common *ns); 33 + void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); 34 + void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); 35 + struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); 36 + struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, 37 + struct ns_tree *ns_tree, 38 + bool previous); 39 + 40 + static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) 41 + { 42 + ns_tree_gen_id(ns); 43 + __ns_tree_add_raw(ns, ns_tree); 44 + } 45 + 46 + /** 47 + * ns_tree_add_raw - Add a namespace to a namespace 48 + * @ns: Namespace to add 49 + * 50 + * This function adds a namespace to the appropriate namespace tree 51 + * without assigning a id. 
52 + */ 53 + #define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) 54 + 55 + /** 56 + * ns_tree_add - Add a namespace to a namespace tree 57 + * @ns: Namespace to add 58 + * 59 + * This function assigns a new id to the namespace and adds it to the 60 + * appropriate namespace tree and list. 61 + */ 62 + #define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) 63 + 64 + /** 65 + * ns_tree_remove - Remove a namespace from a namespace tree 66 + * @ns: Namespace to remove 67 + * 68 + * This function removes a namespace from the appropriate namespace 69 + * tree and list. 70 + */ 71 + #define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) 72 + 73 + #define ns_tree_adjoined_rcu(__ns, __previous) \ 74 + __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) 75 + 76 + #define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) 77 + 78 + #endif /* _LINUX_NSTREE_H */

+6 -1

include/linux/pid_namespace.h

··· 54 54 #define PIDNS_ADDING (1U << 31) 55 55 56 56 #ifdef CONFIG_PID_NS 57 + static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) 58 + { 59 + return container_of(ns, struct pid_namespace, ns); 60 + } 61 + 57 62 static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) 58 63 { 59 64 if (ns != &init_pid_ns) 60 - refcount_inc(&ns->ns.count); 65 + ns_ref_inc(ns); 61 66 return ns; 62 67 } 63 68

+1 -21

include/linux/proc_ns.h

··· 5 5 #ifndef _LINUX_PROC_NS_H 6 6 #define _LINUX_PROC_NS_H 7 7 8 - #include <linux/ns_common.h> 8 + #include <linux/nsfs.h> 9 9 #include <uapi/linux/nsfs.h> 10 10 11 11 struct pid_namespace; ··· 17 17 struct proc_ns_operations { 18 18 const char *name; 19 19 const char *real_ns_name; 20 - int type; 21 20 struct ns_common *(*get)(struct task_struct *task); 22 21 void (*put)(struct ns_common *ns); 23 22 int (*install)(struct nsset *nsset, struct ns_common *ns); ··· 65 66 66 67 #endif /* CONFIG_PROC_FS */ 67 68 68 - static inline int ns_alloc_inum(struct ns_common *ns) 69 - { 70 - WRITE_ONCE(ns->stashed, NULL); 71 - return proc_alloc_inum(&ns->inum); 72 - } 73 - 74 - #define ns_free_inum(ns) proc_free_inum((ns)->inum) 75 - 76 69 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) 77 - extern int ns_get_path(struct path *path, struct task_struct *task, 78 - const struct proc_ns_operations *ns_ops); 79 - typedef struct ns_common *ns_get_path_helper_t(void *); 80 - extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, 81 - void *private_data); 82 - 83 - extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); 84 - 85 - extern int ns_get_name(char *buf, size_t size, struct task_struct *task, 86 - const struct proc_ns_operations *ns_ops); 87 - extern void nsfs_init(void); 88 70 89 71 #endif /* _LINUX_PROC_NS_H */

+11 -2

include/linux/time_namespace.h

··· 33 33 extern struct time_namespace init_time_ns; 34 34 35 35 #ifdef CONFIG_TIME_NS 36 + static inline struct time_namespace *to_time_ns(struct ns_common *ns) 37 + { 38 + return container_of(ns, struct time_namespace, ns); 39 + } 40 + void __init time_ns_init(void); 36 41 extern int vdso_join_timens(struct task_struct *task, 37 42 struct time_namespace *ns); 38 43 extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); 39 44 40 45 static inline struct time_namespace *get_time_ns(struct time_namespace *ns) 41 46 { 42 - refcount_inc(&ns->ns.count); 47 + ns_ref_inc(ns); 43 48 return ns; 44 49 } 45 50 ··· 57 52 58 53 static inline void put_time_ns(struct time_namespace *ns) 59 54 { 60 - if (refcount_dec_and_test(&ns->ns.count)) 55 + if (ns_ref_put(ns)) 61 56 free_time_ns(ns); 62 57 } 63 58 ··· 113 108 } 114 109 115 110 #else 111 + static inline void __init time_ns_init(void) 112 + { 113 + } 114 + 116 115 static inline int vdso_join_timens(struct task_struct *task, 117 116 struct time_namespace *ns) 118 117 {

+7 -2

include/linux/user_namespace.h

··· 168 168 169 169 #ifdef CONFIG_USER_NS 170 170 171 + static inline struct user_namespace *to_user_ns(struct ns_common *ns) 172 + { 173 + return container_of(ns, struct user_namespace, ns); 174 + } 175 + 171 176 static inline struct user_namespace *get_user_ns(struct user_namespace *ns) 172 177 { 173 178 if (ns) 174 - refcount_inc(&ns->ns.count); 179 + ns_ref_inc(ns); 175 180 return ns; 176 181 } 177 182 ··· 186 181 187 182 static inline void put_user_ns(struct user_namespace *ns) 188 183 { 189 - if (ns && refcount_dec_and_test(&ns->ns.count)) 184 + if (ns && ns_ref_put(ns)) 190 185 __put_user_ns(ns); 191 186 } 192 187

+65

include/linux/uts_namespace.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_UTS_NAMESPACE_H 3 + #define _LINUX_UTS_NAMESPACE_H 4 + 5 + #include <linux/ns_common.h> 6 + #include <uapi/linux/utsname.h> 7 + 8 + struct user_namespace; 9 + extern struct user_namespace init_user_ns; 10 + 11 + struct uts_namespace { 12 + struct new_utsname name; 13 + struct user_namespace *user_ns; 14 + struct ucounts *ucounts; 15 + struct ns_common ns; 16 + } __randomize_layout; 17 + 18 + extern struct uts_namespace init_uts_ns; 19 + 20 + #ifdef CONFIG_UTS_NS 21 + static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) 22 + { 23 + return container_of(ns, struct uts_namespace, ns); 24 + } 25 + 26 + static inline void get_uts_ns(struct uts_namespace *ns) 27 + { 28 + ns_ref_inc(ns); 29 + } 30 + 31 + extern struct uts_namespace *copy_utsname(u64 flags, 32 + struct user_namespace *user_ns, struct uts_namespace *old_ns); 33 + extern void free_uts_ns(struct uts_namespace *ns); 34 + 35 + static inline void put_uts_ns(struct uts_namespace *ns) 36 + { 37 + if (ns_ref_put(ns)) 38 + free_uts_ns(ns); 39 + } 40 + 41 + void uts_ns_init(void); 42 + #else 43 + static inline void get_uts_ns(struct uts_namespace *ns) 44 + { 45 + } 46 + 47 + static inline void put_uts_ns(struct uts_namespace *ns) 48 + { 49 + } 50 + 51 + static inline struct uts_namespace *copy_utsname(u64 flags, 52 + struct user_namespace *user_ns, struct uts_namespace *old_ns) 53 + { 54 + if (flags & CLONE_NEWUTS) 55 + return ERR_PTR(-EINVAL); 56 + 57 + return old_ns; 58 + } 59 + 60 + static inline void uts_ns_init(void) 61 + { 62 + } 63 + #endif 64 + 65 + #endif /* _LINUX_UTS_NAMESPACE_H */

+1 -52

include/linux/utsname.h

··· 7 7 #include <linux/nsproxy.h> 8 8 #include <linux/ns_common.h> 9 9 #include <linux/err.h> 10 - #include <uapi/linux/utsname.h> 10 + #include <linux/uts_namespace.h> 11 11 12 12 enum uts_proc { 13 13 UTS_PROC_ARCH, ··· 17 17 UTS_PROC_HOSTNAME, 18 18 UTS_PROC_DOMAINNAME, 19 19 }; 20 - 21 - struct user_namespace; 22 - extern struct user_namespace init_user_ns; 23 - 24 - struct uts_namespace { 25 - struct new_utsname name; 26 - struct user_namespace *user_ns; 27 - struct ucounts *ucounts; 28 - struct ns_common ns; 29 - } __randomize_layout; 30 - extern struct uts_namespace init_uts_ns; 31 - 32 - #ifdef CONFIG_UTS_NS 33 - static inline void get_uts_ns(struct uts_namespace *ns) 34 - { 35 - refcount_inc(&ns->ns.count); 36 - } 37 - 38 - extern struct uts_namespace *copy_utsname(u64 flags, 39 - struct user_namespace *user_ns, struct uts_namespace *old_ns); 40 - extern void free_uts_ns(struct uts_namespace *ns); 41 - 42 - static inline void put_uts_ns(struct uts_namespace *ns) 43 - { 44 - if (refcount_dec_and_test(&ns->ns.count)) 45 - free_uts_ns(ns); 46 - } 47 - 48 - void uts_ns_init(void); 49 - #else 50 - static inline void get_uts_ns(struct uts_namespace *ns) 51 - { 52 - } 53 - 54 - static inline void put_uts_ns(struct uts_namespace *ns) 55 - { 56 - } 57 - 58 - static inline struct uts_namespace *copy_utsname(u64 flags, 59 - struct user_namespace *user_ns, struct uts_namespace *old_ns) 60 - { 61 - if (flags & CLONE_NEWUTS) 62 - return ERR_PTR(-EINVAL); 63 - 64 - return old_ns; 65 - } 66 - 67 - static inline void uts_ns_init(void) 68 - { 69 - } 70 - #endif 71 20 72 21 #ifdef CONFIG_PROC_SYSCTL 73 22 extern void uts_proc_notify(enum uts_proc proc);

+9 -4

include/net/net_namespace.h

··· 262 262 #ifdef CONFIG_NET_NS 263 263 void __put_net(struct net *net); 264 264 265 + static inline struct net *to_net_ns(struct ns_common *ns) 266 + { 267 + return container_of(ns, struct net, ns); 268 + } 269 + 265 270 /* Try using get_net_track() instead */ 266 271 static inline struct net *get_net(struct net *net) 267 272 { 268 - refcount_inc(&net->ns.count); 273 + ns_ref_inc(net); 269 274 return net; 270 275 } 271 276 ··· 281 276 * exists. If the reference count is zero this 282 277 * function fails and returns NULL. 283 278 */ 284 - if (!refcount_inc_not_zero(&net->ns.count)) 279 + if (!ns_ref_get(net)) 285 280 net = NULL; 286 281 return net; 287 282 } ··· 289 284 /* Try using put_net_track() instead */ 290 285 static inline void put_net(struct net *net) 291 286 { 292 - if (refcount_dec_and_test(&net->ns.count)) 287 + if (ns_ref_put(net)) 293 288 __put_net(net); 294 289 } 295 290 ··· 301 296 302 297 static inline int check_net(const struct net *net) 303 298 { 304 - return refcount_read(&net->ns.count) != 0; 299 + return ns_ref_read(net) != 0; 305 300 } 306 301 307 302 void net_drop_ns(void *);

+1

include/uapi/linux/fcntl.h

··· 111 111 #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ 112 112 113 113 #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ 114 + #define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ 114 115 #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ 115 116 116 117 /* Generic flags for the *at(2) family of syscalls. */

+16 -2

include/uapi/linux/nsfs.h

··· 16 16 #define NS_GET_NSTYPE _IO(NSIO, 0x3) 17 17 /* Get owner UID (in the caller's user namespace) for a user namespace */ 18 18 #define NS_GET_OWNER_UID _IO(NSIO, 0x4) 19 - /* Get the id for a mount namespace */ 20 - #define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) 21 19 /* Translate pid from target pid namespace into the caller's pid namespace. */ 22 20 #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) 23 21 /* Return thread-group leader id of pid in the callers pid namespace. */ ··· 40 42 /* Get previous namespace. */ 41 43 #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) 42 44 45 + /* Retrieve namespace identifiers. */ 46 + #define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) 47 + #define NS_GET_ID _IOR(NSIO, 13, __u64) 48 + 43 49 enum init_ns_ino { 44 50 IPC_NS_INIT_INO = 0xEFFFFFFFU, 45 51 UTS_NS_INIT_INO = 0xEFFFFFFEU, ··· 53 51 TIME_NS_INIT_INO = 0xEFFFFFFAU, 54 52 NET_NS_INIT_INO = 0xEFFFFFF9U, 55 53 MNT_NS_INIT_INO = 0xEFFFFFF8U, 54 + #ifdef __KERNEL__ 55 + MNT_NS_ANON_INO = 0xEFFFFFF7U, 56 + #endif 56 57 }; 58 + 59 + struct nsfs_file_handle { 60 + __u64 ns_id; 61 + __u32 ns_type; 62 + __u32 ns_inum; 63 + }; 64 + 65 + #define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ 66 + #define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ 57 67 58 68 #endif /* __LINUX_NSFS_H */

+2

init/main.c

··· 103 103 #include <linux/randomize_kstack.h> 104 104 #include <linux/pidfs.h> 105 105 #include <linux/ptdump.h> 106 + #include <linux/time_namespace.h> 106 107 #include <net/net_namespace.h> 107 108 108 109 #include <asm/io.h> ··· 1073 1072 fork_init(); 1074 1073 proc_caches_init(); 1075 1074 uts_ns_init(); 1075 + time_ns_init(); 1076 1076 key_init(); 1077 1077 security_init(); 1078 1078 dbg_late_init();

+3 -2

init/version-timestamp.c

··· 8 8 #include <linux/utsname.h> 9 9 10 10 struct uts_namespace init_uts_ns = { 11 - .ns.count = REFCOUNT_INIT(2), 11 + .ns.ns_type = ns_common_type(&init_uts_ns), 12 + .ns.__ns_ref = REFCOUNT_INIT(2), 12 13 .name = { 13 14 .sysname = UTS_SYSNAME, 14 15 .nodename = UTS_NODENAME, ··· 19 18 .domainname = UTS_DOMAINNAME, 20 19 }, 21 20 .user_ns = &init_user_ns, 22 - .ns.inum = PROC_UTS_INIT_INO, 21 + .ns.inum = ns_init_inum(&init_uts_ns), 23 22 #ifdef CONFIG_UTS_NS 24 23 .ns.ops = &utsns_operations, 25 24 #endif

+4 -2

ipc/msgutil.c

··· 15 15 #include <linux/proc_ns.h> 16 16 #include <linux/uaccess.h> 17 17 #include <linux/sched.h> 18 + #include <linux/nstree.h> 18 19 19 20 #include "util.h" 20 21 ··· 27 26 * and not CONFIG_IPC_NS. 28 27 */ 29 28 struct ipc_namespace init_ipc_ns = { 30 - .ns.count = REFCOUNT_INIT(1), 29 + .ns.__ns_ref = REFCOUNT_INIT(1), 31 30 .user_ns = &init_user_ns, 32 - .ns.inum = PROC_IPC_INIT_INO, 31 + .ns.inum = ns_init_inum(&init_ipc_ns), 33 32 #ifdef CONFIG_IPC_NS 34 33 .ns.ops = &ipcns_operations, 35 34 #endif 35 + .ns.ns_type = ns_common_type(&init_ipc_ns), 36 36 }; 37 37 38 38 struct msg_msgseg {

+7 -12

ipc/namespace.c

··· 15 15 #include <linux/mount.h> 16 16 #include <linux/user_namespace.h> 17 17 #include <linux/proc_ns.h> 18 + #include <linux/nstree.h> 18 19 #include <linux/sched/task.h> 19 20 20 21 #include "util.h" ··· 62 61 if (ns == NULL) 63 62 goto fail_dec; 64 63 65 - err = ns_alloc_inum(&ns->ns); 64 + err = ns_common_init(ns); 66 65 if (err) 67 66 goto fail_free; 68 - ns->ns.ops = &ipcns_operations; 69 67 70 - refcount_set(&ns->ns.count, 1); 71 68 ns->user_ns = get_user_ns(user_ns); 72 69 ns->ucounts = ucounts; 73 70 ··· 86 87 87 88 sem_init_ns(ns); 88 89 shm_init_ns(ns); 90 + ns_tree_add(ns); 89 91 90 92 return ns; 91 93 ··· 97 97 98 98 fail_put: 99 99 put_user_ns(ns->user_ns); 100 - ns_free_inum(&ns->ns); 100 + ns_common_free(ns); 101 101 fail_free: 102 102 kfree(ns); 103 103 fail_dec: ··· 161 161 162 162 dec_ipc_namespaces(ns->ucounts); 163 163 put_user_ns(ns->user_ns); 164 - ns_free_inum(&ns->ns); 164 + ns_common_free(ns); 165 165 kfree(ns); 166 166 } 167 167 ··· 199 199 */ 200 200 void put_ipc_ns(struct ipc_namespace *ns) 201 201 { 202 - if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { 202 + if (ns_ref_put_and_lock(ns, &mq_lock)) { 203 203 mq_clear_sbinfo(ns); 204 204 spin_unlock(&mq_lock); 205 205 206 + ns_tree_remove(ns); 206 207 if (llist_add(&ns->mnt_llist, &free_ipc_list)) 207 208 schedule_work(&free_ipc_work); 208 209 } 209 - } 210 - 211 - static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) 212 - { 213 - return container_of(ns, struct ipc_namespace, ns); 214 210 } 215 211 216 212 static struct ns_common *ipcns_get(struct task_struct *task) ··· 248 252 249 253 const struct proc_ns_operations ipcns_operations = { 250 254 .name = "ipc", 251 - .type = CLONE_NEWIPC, 252 255 .get = ipcns_get, 253 256 .put = ipcns_put, 254 257 .install = ipcns_install,

+2

ipc/shm.c

··· 45 45 #include <linux/mount.h> 46 46 #include <linux/ipc_namespace.h> 47 47 #include <linux/rhashtable.h> 48 + #include <linux/nstree.h> 48 49 49 50 #include <linux/uaccess.h> 50 51 ··· 149 148 static int __init ipc_ns_init(void) 150 149 { 151 150 shm_init_ns(&init_ipc_ns); 151 + ns_tree_add(&init_ipc_ns); 152 152 return 0; 153 153 } 154 154

+1 -1

kernel/Makefile

··· 8 8 sysctl.o capability.o ptrace.o user.o \ 9 9 signal.o sys.o umh.o workqueue.o pid.o task_work.o \ 10 10 extable.o params.o \ 11 - kthread.o sys_ni.o nsproxy.o \ 11 + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ 12 12 notifier.o ksysfs.o cred.o reboot.o \ 13 13 async.o range.o smpboot.o ucount.o regset.o ksyms_common.o 14 14

+5 -2

kernel/cgroup/cgroup.c

··· 59 59 #include <linux/sched/cputime.h> 60 60 #include <linux/sched/deadline.h> 61 61 #include <linux/psi.h> 62 + #include <linux/nstree.h> 62 63 #include <net/sock.h> 63 64 64 65 #define CREATE_TRACE_POINTS ··· 242 241 243 242 /* cgroup namespace for init task */ 244 243 struct cgroup_namespace init_cgroup_ns = { 245 - .ns.count = REFCOUNT_INIT(2), 244 + .ns.__ns_ref = REFCOUNT_INIT(2), 246 245 .user_ns = &init_user_ns, 247 246 .ns.ops = &cgroupns_operations, 248 - .ns.inum = PROC_CGROUP_INIT_INO, 247 + .ns.inum = ns_init_inum(&init_cgroup_ns), 249 248 .root_cset = &init_css_set, 249 + .ns.ns_type = ns_common_type(&init_cgroup_ns), 250 250 }; 251 251 252 252 static struct file_system_type cgroup2_fs_type; ··· 6338 6336 WARN_ON(register_filesystem(&cpuset_fs_type)); 6339 6337 #endif 6340 6338 6339 + ns_tree_add(&init_cgroup_ns); 6341 6340 return 0; 6342 6341 } 6343 6342

+10 -17

kernel/cgroup/namespace.c

··· 5 5 #include <linux/slab.h> 6 6 #include <linux/nsproxy.h> 7 7 #include <linux/proc_ns.h> 8 - 8 + #include <linux/nstree.h> 9 9 10 10 /* cgroup namespaces */ 11 11 ··· 21 21 22 22 static struct cgroup_namespace *alloc_cgroup_ns(void) 23 23 { 24 - struct cgroup_namespace *new_ns; 24 + struct cgroup_namespace *new_ns __free(kfree) = NULL; 25 25 int ret; 26 26 27 27 new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); 28 28 if (!new_ns) 29 29 return ERR_PTR(-ENOMEM); 30 - ret = ns_alloc_inum(&new_ns->ns); 31 - if (ret) { 32 - kfree(new_ns); 30 + ret = ns_common_init(new_ns); 31 + if (ret) 33 32 return ERR_PTR(ret); 34 - } 35 - refcount_set(&new_ns->ns.count, 1); 36 - new_ns->ns.ops = &cgroupns_operations; 37 - return new_ns; 33 + ns_tree_add(new_ns); 34 + return no_free_ptr(new_ns); 38 35 } 39 36 40 37 void free_cgroup_ns(struct cgroup_namespace *ns) 41 38 { 39 + ns_tree_remove(ns); 42 40 put_css_set(ns->root_cset); 43 41 dec_cgroup_namespaces(ns->ucounts); 44 42 put_user_ns(ns->user_ns); 45 - ns_free_inum(&ns->ns); 46 - kfree(ns); 43 + ns_common_free(ns); 44 + /* Concurrent nstree traversal depends on a grace period. */ 45 + kfree_rcu(ns, ns.ns_rcu); 47 46 } 48 47 EXPORT_SYMBOL(free_cgroup_ns); 49 48 ··· 87 88 new_ns->root_cset = cset; 88 89 89 90 return new_ns; 90 - } 91 - 92 - static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) 93 - { 94 - return container_of(ns, struct cgroup_namespace, ns); 95 91 } 96 92 97 93 static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) ··· 137 143 138 144 const struct proc_ns_operations cgroupns_operations = { 139 145 .name = "cgroup", 140 - .type = CLONE_NEWCGROUP, 141 146 .get = cgroupns_get, 142 147 .put = cgroupns_put, 143 148 .install = cgroupns_install,

+77

kernel/nscommon.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + 3 + #include <linux/ns_common.h> 4 + #include <linux/proc_ns.h> 5 + #include <linux/vfsdebug.h> 6 + 7 + #ifdef CONFIG_DEBUG_VFS 8 + static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) 9 + { 10 + switch (ns->ns_type) { 11 + #ifdef CONFIG_CGROUPS 12 + case CLONE_NEWCGROUP: 13 + VFS_WARN_ON_ONCE(ops != &cgroupns_operations); 14 + break; 15 + #endif 16 + #ifdef CONFIG_IPC_NS 17 + case CLONE_NEWIPC: 18 + VFS_WARN_ON_ONCE(ops != &ipcns_operations); 19 + break; 20 + #endif 21 + case CLONE_NEWNS: 22 + VFS_WARN_ON_ONCE(ops != &mntns_operations); 23 + break; 24 + #ifdef CONFIG_NET_NS 25 + case CLONE_NEWNET: 26 + VFS_WARN_ON_ONCE(ops != &netns_operations); 27 + break; 28 + #endif 29 + #ifdef CONFIG_PID_NS 30 + case CLONE_NEWPID: 31 + VFS_WARN_ON_ONCE(ops != &pidns_operations); 32 + break; 33 + #endif 34 + #ifdef CONFIG_TIME_NS 35 + case CLONE_NEWTIME: 36 + VFS_WARN_ON_ONCE(ops != &timens_operations); 37 + break; 38 + #endif 39 + #ifdef CONFIG_USER_NS 40 + case CLONE_NEWUSER: 41 + VFS_WARN_ON_ONCE(ops != &userns_operations); 42 + break; 43 + #endif 44 + #ifdef CONFIG_UTS_NS 45 + case CLONE_NEWUTS: 46 + VFS_WARN_ON_ONCE(ops != &utsns_operations); 47 + break; 48 + #endif 49 + } 50 + } 51 + #endif 52 + 53 + int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) 54 + { 55 + refcount_set(&ns->__ns_ref, 1); 56 + ns->stashed = NULL; 57 + ns->ops = ops; 58 + ns->ns_id = 0; 59 + ns->ns_type = ns_type; 60 + RB_CLEAR_NODE(&ns->ns_tree_node); 61 + INIT_LIST_HEAD(&ns->ns_list_node); 62 + 63 + #ifdef CONFIG_DEBUG_VFS 64 + ns_debug(ns, ops); 65 + #endif 66 + 67 + if (inum) { 68 + ns->inum = inum; 69 + return 0; 70 + } 71 + return proc_alloc_inum(&ns->inum); 72 + } 73 + 74 + void __ns_common_free(struct ns_common *ns) 75 + { 76 + proc_free_inum(ns->inum); 77 + }

+2 -2

kernel/nsproxy.c

··· 545 545 546 546 if (proc_ns_file(fd_file(f))) { 547 547 ns = get_proc_ns(file_inode(fd_file(f))); 548 - if (flags && (ns->ops->type != flags)) 548 + if (flags && (ns->ns_type != flags)) 549 549 err = -EINVAL; 550 - flags = ns->ops->type; 550 + flags = ns->ns_type; 551 551 } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { 552 552 err = check_setns_flags(flags); 553 553 } else {

+247

kernel/nstree.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + 3 + #include <linux/nstree.h> 4 + #include <linux/proc_ns.h> 5 + #include <linux/vfsdebug.h> 6 + 7 + /** 8 + * struct ns_tree - Namespace tree 9 + * @ns_tree: Rbtree of namespaces of a particular type 10 + * @ns_list: Sequentially walkable list of all namespaces of this type 11 + * @ns_tree_lock: Seqlock to protect the tree and list 12 + * @type: type of namespaces in this tree 13 + */ 14 + struct ns_tree { 15 + struct rb_root ns_tree; 16 + struct list_head ns_list; 17 + seqlock_t ns_tree_lock; 18 + int type; 19 + }; 20 + 21 + struct ns_tree mnt_ns_tree = { 22 + .ns_tree = RB_ROOT, 23 + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), 24 + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), 25 + .type = CLONE_NEWNS, 26 + }; 27 + 28 + struct ns_tree net_ns_tree = { 29 + .ns_tree = RB_ROOT, 30 + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), 31 + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), 32 + .type = CLONE_NEWNET, 33 + }; 34 + EXPORT_SYMBOL_GPL(net_ns_tree); 35 + 36 + struct ns_tree uts_ns_tree = { 37 + .ns_tree = RB_ROOT, 38 + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), 39 + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), 40 + .type = CLONE_NEWUTS, 41 + }; 42 + 43 + struct ns_tree user_ns_tree = { 44 + .ns_tree = RB_ROOT, 45 + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), 46 + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), 47 + .type = CLONE_NEWUSER, 48 + }; 49 + 50 + struct ns_tree ipc_ns_tree = { 51 + .ns_tree = RB_ROOT, 52 + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), 53 + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), 54 + .type = CLONE_NEWIPC, 55 + }; 56 + 57 + struct ns_tree pid_ns_tree = { 58 + .ns_tree = RB_ROOT, 59 + .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), 60 + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), 61 + .type = CLONE_NEWPID, 62 + }; 63 + 64 + struct ns_tree cgroup_ns_tree = { 65 + .ns_tree 
= RB_ROOT, 66 + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), 67 + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), 68 + .type = CLONE_NEWCGROUP, 69 + }; 70 + 71 + struct ns_tree time_ns_tree = { 72 + .ns_tree = RB_ROOT, 73 + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), 74 + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), 75 + .type = CLONE_NEWTIME, 76 + }; 77 + 78 + DEFINE_COOKIE(namespace_cookie); 79 + 80 + static inline struct ns_common *node_to_ns(const struct rb_node *node) 81 + { 82 + if (!node) 83 + return NULL; 84 + return rb_entry(node, struct ns_common, ns_tree_node); 85 + } 86 + 87 + static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) 88 + { 89 + struct ns_common *ns_a = node_to_ns(a); 90 + struct ns_common *ns_b = node_to_ns(b); 91 + u64 ns_id_a = ns_a->ns_id; 92 + u64 ns_id_b = ns_b->ns_id; 93 + 94 + if (ns_id_a < ns_id_b) 95 + return -1; 96 + if (ns_id_a > ns_id_b) 97 + return 1; 98 + return 0; 99 + } 100 + 101 + void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) 102 + { 103 + struct rb_node *node, *prev; 104 + 105 + VFS_WARN_ON_ONCE(!ns->ns_id); 106 + 107 + write_seqlock(&ns_tree->ns_tree_lock); 108 + 109 + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); 110 + 111 + node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); 112 + /* 113 + * If there's no previous entry simply add it after the 114 + * head and if there is add it after the previous entry. 
115 + */ 116 + prev = rb_prev(&ns->ns_tree_node); 117 + if (!prev) 118 + list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); 119 + else 120 + list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); 121 + 122 + write_sequnlock(&ns_tree->ns_tree_lock); 123 + 124 + VFS_WARN_ON_ONCE(node); 125 + } 126 + 127 + void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) 128 + { 129 + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); 130 + VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); 131 + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); 132 + 133 + write_seqlock(&ns_tree->ns_tree_lock); 134 + rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); 135 + list_bidir_del_rcu(&ns->ns_list_node); 136 + RB_CLEAR_NODE(&ns->ns_tree_node); 137 + write_sequnlock(&ns_tree->ns_tree_lock); 138 + } 139 + EXPORT_SYMBOL_GPL(__ns_tree_remove); 140 + 141 + static int ns_find(const void *key, const struct rb_node *node) 142 + { 143 + const u64 ns_id = *(u64 *)key; 144 + const struct ns_common *ns = node_to_ns(node); 145 + 146 + if (ns_id < ns->ns_id) 147 + return -1; 148 + if (ns_id > ns->ns_id) 149 + return 1; 150 + return 0; 151 + } 152 + 153 + 154 + static struct ns_tree *ns_tree_from_type(int ns_type) 155 + { 156 + switch (ns_type) { 157 + case CLONE_NEWCGROUP: 158 + return &cgroup_ns_tree; 159 + case CLONE_NEWIPC: 160 + return &ipc_ns_tree; 161 + case CLONE_NEWNS: 162 + return &mnt_ns_tree; 163 + case CLONE_NEWNET: 164 + return &net_ns_tree; 165 + case CLONE_NEWPID: 166 + return &pid_ns_tree; 167 + case CLONE_NEWUSER: 168 + return &user_ns_tree; 169 + case CLONE_NEWUTS: 170 + return &uts_ns_tree; 171 + case CLONE_NEWTIME: 172 + return &time_ns_tree; 173 + } 174 + 175 + return NULL; 176 + } 177 + 178 + struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) 179 + { 180 + struct ns_tree *ns_tree; 181 + struct rb_node *node; 182 + unsigned int seq; 183 + 184 + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); 185 + 186 + ns_tree = 
ns_tree_from_type(ns_type); 187 + if (!ns_tree) 188 + return NULL; 189 + 190 + do { 191 + seq = read_seqbegin(&ns_tree->ns_tree_lock); 192 + node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); 193 + if (node) 194 + break; 195 + } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); 196 + 197 + if (!node) 198 + return NULL; 199 + 200 + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); 201 + 202 + return node_to_ns(node); 203 + } 204 + 205 + /** 206 + * ns_tree_adjoined_rcu - find the next/previous namespace in the same 207 + * tree 208 + * @ns: namespace to start from 209 + * @previous: if true find the previous namespace, otherwise the next 210 + * 211 + * Find the next or previous namespace in the same tree as @ns. If 212 + * there is no next/previous namespace, -ENOENT is returned. 213 + */ 214 + struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, 215 + struct ns_tree *ns_tree, bool previous) 216 + { 217 + struct list_head *list; 218 + 219 + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); 220 + 221 + if (previous) 222 + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); 223 + else 224 + list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); 225 + if (list_is_head(list, &ns_tree->ns_list)) 226 + return ERR_PTR(-ENOENT); 227 + 228 + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); 229 + 230 + return list_entry_rcu(list, struct ns_common, ns_list_node); 231 + } 232 + 233 + /** 234 + * ns_tree_gen_id - generate a new namespace id 235 + * @ns: namespace to generate id for 236 + * 237 + * Generates a new namespace id and assigns it to the namespace. All 238 + * namespaces types share the same id space and thus can be compared 239 + * directly. IOW, when two ids of two namespace are equal, they are 240 + * identical. 
241 + */ 242 + u64 ns_tree_gen_id(struct ns_common *ns) 243 + { 244 + guard(preempt)(); 245 + ns->ns_id = gen_cookie_next(&namespace_cookie); 246 + return ns->ns_id; 247 + }

+3 -2

kernel/pid.c

··· 71 71 * the scheme scales to up to 4 million PIDs, runtime. 72 72 */ 73 73 struct pid_namespace init_pid_ns = { 74 - .ns.count = REFCOUNT_INIT(2), 74 + .ns.__ns_ref = REFCOUNT_INIT(2), 75 75 .idr = IDR_INIT(init_pid_ns.idr), 76 76 .pid_allocated = PIDNS_ADDING, 77 77 .level = 0, 78 78 .child_reaper = &init_task, 79 79 .user_ns = &init_user_ns, 80 - .ns.inum = PROC_PID_INIT_INO, 80 + .ns.inum = ns_init_inum(&init_pid_ns), 81 81 #ifdef CONFIG_PID_NS 82 82 .ns.ops = &pidns_operations, 83 83 #endif ··· 85 85 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) 86 86 .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, 87 87 #endif 88 + .ns.ns_type = ns_common_type(&init_pid_ns), 88 89 }; 89 90 EXPORT_SYMBOL_GPL(init_pid_ns); 90 91

+9 -14

kernel/pid_namespace.c

··· 23 23 #include <linux/sched/task.h> 24 24 #include <linux/sched/signal.h> 25 25 #include <linux/idr.h> 26 + #include <linux/nstree.h> 26 27 #include <uapi/linux/wait.h> 27 28 #include "pid_sysctl.h" 28 29 ··· 103 102 if (ns->pid_cachep == NULL) 104 103 goto out_free_idr; 105 104 106 - err = ns_alloc_inum(&ns->ns); 105 + err = ns_common_init(ns); 107 106 if (err) 108 107 goto out_free_idr; 109 - ns->ns.ops = &pidns_operations; 110 108 111 109 ns->pid_max = PID_MAX_LIMIT; 112 110 err = register_pidns_sysctls(ns); 113 111 if (err) 114 112 goto out_free_inum; 115 113 116 - refcount_set(&ns->ns.count, 1); 117 114 ns->level = level; 118 115 ns->parent = get_pid_ns(parent_pid_ns); 119 116 ns->user_ns = get_user_ns(user_ns); ··· 123 124 ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); 124 125 #endif 125 126 127 + ns_tree_add(ns); 126 128 return ns; 127 129 128 130 out_free_inum: 129 - ns_free_inum(&ns->ns); 131 + ns_common_free(ns); 130 132 out_free_idr: 131 133 idr_destroy(&ns->idr); 132 134 kmem_cache_free(pid_ns_cachep, ns); ··· 149 149 150 150 static void destroy_pid_namespace(struct pid_namespace *ns) 151 151 { 152 + ns_tree_remove(ns); 152 153 unregister_pidns_sysctls(ns); 153 154 154 - ns_free_inum(&ns->ns); 155 + ns_common_free(ns); 155 156 156 157 idr_destroy(&ns->idr); 157 158 call_rcu(&ns->rcu, delayed_free_pidns); ··· 169 168 parent = ns->parent; 170 169 destroy_pid_namespace(ns); 171 170 ns = parent; 172 - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); 171 + } while (ns != &init_pid_ns && ns_ref_put(ns)); 173 172 } 174 173 175 174 struct pid_namespace *copy_pid_ns(u64 flags, ··· 184 183 185 184 void put_pid_ns(struct pid_namespace *ns) 186 185 { 187 - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) 186 + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) 188 187 schedule_work(&ns->work); 189 188 } 190 189 EXPORT_SYMBOL_GPL(put_pid_ns); ··· 345 344 return 0; 346 345 } 347 346 348 - static inline 
struct pid_namespace *to_pid_ns(struct ns_common *ns) 349 - { 350 - return container_of(ns, struct pid_namespace, ns); 351 - } 352 - 353 347 static struct ns_common *pidns_get(struct task_struct *task) 354 348 { 355 349 struct pid_namespace *ns; ··· 449 453 450 454 const struct proc_ns_operations pidns_operations = { 451 455 .name = "pid", 452 - .type = CLONE_NEWPID, 453 456 .get = pidns_get, 454 457 .put = pidns_put, 455 458 .install = pidns_install, ··· 459 464 const struct proc_ns_operations pidns_for_children_operations = { 460 465 .name = "pid_for_children", 461 466 .real_ns_name = "pid", 462 - .type = CLONE_NEWPID, 463 467 .get = pidns_for_children_get, 464 468 .put = pidns_put, 465 469 .install = pidns_install, ··· 475 481 #endif 476 482 477 483 register_pid_ns_sysctl_table_vm(); 484 + ns_tree_add(&init_pid_ns); 478 485 return 0; 479 486 } 480 487

+16 -16

kernel/time/namespace.c

··· 12 12 #include <linux/seq_file.h> 13 13 #include <linux/proc_ns.h> 14 14 #include <linux/export.h> 15 + #include <linux/nstree.h> 15 16 #include <linux/time.h> 16 17 #include <linux/slab.h> 17 18 #include <linux/cred.h> ··· 89 88 goto fail; 90 89 91 90 err = -ENOMEM; 92 - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); 91 + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); 93 92 if (!ns) 94 93 goto fail_dec; 95 - 96 - refcount_set(&ns->ns.count, 1); 97 94 98 95 ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 99 96 if (!ns->vvar_page) 100 97 goto fail_free; 101 98 102 - err = ns_alloc_inum(&ns->ns); 99 + err = ns_common_init(ns); 103 100 if (err) 104 101 goto fail_free_page; 105 102 106 103 ns->ucounts = ucounts; 107 - ns->ns.ops = &timens_operations; 108 104 ns->user_ns = get_user_ns(user_ns); 109 105 ns->offsets = old_ns->offsets; 110 106 ns->frozen_offsets = false; 107 + ns_tree_add(ns); 111 108 return ns; 112 109 113 110 fail_free_page: ··· 252 253 253 254 void free_time_ns(struct time_namespace *ns) 254 255 { 256 + ns_tree_remove(ns); 255 257 dec_time_namespaces(ns->ucounts); 256 258 put_user_ns(ns->user_ns); 257 - ns_free_inum(&ns->ns); 259 + ns_common_free(ns); 258 260 __free_page(ns->vvar_page); 259 - kfree(ns); 260 - } 261 - 262 - static struct time_namespace *to_time_ns(struct ns_common *ns) 263 - { 264 - return container_of(ns, struct time_namespace, ns); 261 + /* Concurrent nstree traversal depends on a grace period. 
*/ 262 + kfree_rcu(ns, ns.ns_rcu); 265 263 } 266 264 267 265 static struct ns_common *timens_get(struct task_struct *task) ··· 462 466 463 467 const struct proc_ns_operations timens_operations = { 464 468 .name = "time", 465 - .type = CLONE_NEWTIME, 466 469 .get = timens_get, 467 470 .put = timens_put, 468 471 .install = timens_install, ··· 471 476 const struct proc_ns_operations timens_for_children_operations = { 472 477 .name = "time_for_children", 473 478 .real_ns_name = "time", 474 - .type = CLONE_NEWTIME, 475 479 .get = timens_for_children_get, 476 480 .put = timens_put, 477 481 .install = timens_install, ··· 478 484 }; 479 485 480 486 struct time_namespace init_time_ns = { 481 - .ns.count = REFCOUNT_INIT(3), 487 + .ns.ns_type = ns_common_type(&init_time_ns), 488 + .ns.__ns_ref = REFCOUNT_INIT(3), 482 489 .user_ns = &init_user_ns, 483 - .ns.inum = PROC_TIME_INIT_INO, 490 + .ns.inum = ns_init_inum(&init_time_ns), 484 491 .ns.ops = &timens_operations, 485 492 .frozen_offsets = true, 486 493 }; 494 + 495 + void __init time_ns_init(void) 496 + { 497 + ns_tree_add(&init_time_ns); 498 + }

+3 -2

kernel/user.c

··· 65 65 .nr_extents = 1, 66 66 }, 67 67 }, 68 - .ns.count = REFCOUNT_INIT(3), 68 + .ns.ns_type = ns_common_type(&init_user_ns), 69 + .ns.__ns_ref = REFCOUNT_INIT(3), 69 70 .owner = GLOBAL_ROOT_UID, 70 71 .group = GLOBAL_ROOT_GID, 71 - .ns.inum = PROC_USER_INIT_INO, 72 + .ns.inum = ns_init_inum(&init_user_ns), 72 73 #ifdef CONFIG_USER_NS 73 74 .ns.ops = &userns_operations, 74 75 #endif

+11 -13

kernel/user_namespace.c

··· 21 21 #include <linux/fs_struct.h> 22 22 #include <linux/bsearch.h> 23 23 #include <linux/sort.h> 24 + #include <linux/nstree.h> 24 25 25 26 static struct kmem_cache *user_ns_cachep __ro_after_init; 26 27 static DEFINE_MUTEX(userns_state_mutex); ··· 125 124 goto fail_dec; 126 125 127 126 ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); 128 - ret = ns_alloc_inum(&ns->ns); 127 + 128 + ret = ns_common_init(ns); 129 129 if (ret) 130 130 goto fail_free; 131 - ns->ns.ops = &userns_operations; 132 131 133 - refcount_set(&ns->ns.count, 1); 134 132 /* Leave the new->user_ns reference with the new user namespace. */ 135 133 ns->parent = parent_ns; 136 134 ns->level = parent_ns->level + 1; ··· 159 159 goto fail_keyring; 160 160 161 161 set_cred_user_ns(new, ns); 162 + ns_tree_add(ns); 162 163 return 0; 163 164 fail_keyring: 164 165 #ifdef CONFIG_PERSISTENT_KEYRINGS 165 166 key_put(ns->persistent_keyring_register); 166 167 #endif 167 - ns_free_inum(&ns->ns); 168 + ns_common_free(ns); 168 169 fail_free: 169 170 kmem_cache_free(user_ns_cachep, ns); 170 171 fail_dec: ··· 202 201 do { 203 202 struct ucounts *ucounts = ns->ucounts; 204 203 parent = ns->parent; 204 + ns_tree_remove(ns); 205 205 if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { 206 206 kfree(ns->gid_map.forward); 207 207 kfree(ns->gid_map.reverse); ··· 220 218 #endif 221 219 retire_userns_sysctls(ns); 222 220 key_free_user_ns(ns); 223 - ns_free_inum(&ns->ns); 224 - kmem_cache_free(user_ns_cachep, ns); 221 + ns_common_free(ns); 222 + /* Concurrent nstree traversal depends on a grace period. 
*/ 223 + kfree_rcu(ns, ns.ns_rcu); 225 224 dec_user_namespaces(ucounts); 226 225 ns = parent; 227 - } while (refcount_dec_and_test(&parent->ns.count)); 226 + } while (ns_ref_put(parent)); 228 227 } 229 228 230 229 void __put_user_ns(struct user_namespace *ns) ··· 1325 1322 } 1326 1323 EXPORT_SYMBOL(current_in_userns); 1327 1324 1328 - static inline struct user_namespace *to_user_ns(struct ns_common *ns) 1329 - { 1330 - return container_of(ns, struct user_namespace, ns); 1331 - } 1332 - 1333 1325 static struct ns_common *userns_get(struct task_struct *task) 1334 1326 { 1335 1327 struct user_namespace *user_ns; ··· 1400 1402 1401 1403 const struct proc_ns_operations userns_operations = { 1402 1404 .name = "user", 1403 - .type = CLONE_NEWUSER, 1404 1405 .get = userns_get, 1405 1406 .put = userns_put, 1406 1407 .install = userns_install, ··· 1410 1413 static __init int user_namespaces_init(void) 1411 1414 { 1412 1415 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); 1416 + ns_tree_add(&init_user_ns); 1413 1417 return 0; 1414 1418 } 1415 1419 subsys_initcall(user_namespaces_init);

+9 -22

kernel/utsname.c

··· 13 13 #include <linux/cred.h> 14 14 #include <linux/user_namespace.h> 15 15 #include <linux/proc_ns.h> 16 + #include <linux/nstree.h> 16 17 #include <linux/sched/task.h> 17 18 18 19 static struct kmem_cache *uts_ns_cache __ro_after_init; ··· 26 25 static void dec_uts_namespaces(struct ucounts *ucounts) 27 26 { 28 27 dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); 29 - } 30 - 31 - static struct uts_namespace *create_uts_ns(void) 32 - { 33 - struct uts_namespace *uts_ns; 34 - 35 - uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); 36 - if (uts_ns) 37 - refcount_set(&uts_ns->ns.count, 1); 38 - return uts_ns; 39 28 } 40 29 41 30 /* ··· 46 55 goto fail; 47 56 48 57 err = -ENOMEM; 49 - ns = create_uts_ns(); 58 + ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL); 50 59 if (!ns) 51 60 goto fail_dec; 52 61 53 - err = ns_alloc_inum(&ns->ns); 62 + err = ns_common_init(ns); 54 63 if (err) 55 64 goto fail_free; 56 65 57 66 ns->ucounts = ucounts; 58 - ns->ns.ops = &utsns_operations; 59 - 60 67 down_read(&uts_sem); 61 68 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 62 69 ns->user_ns = get_user_ns(user_ns); 63 70 up_read(&uts_sem); 71 + ns_tree_add(ns); 64 72 return ns; 65 73 66 74 fail_free: ··· 95 105 96 106 void free_uts_ns(struct uts_namespace *ns) 97 107 { 108 + ns_tree_remove(ns); 98 109 dec_uts_namespaces(ns->ucounts); 99 110 put_user_ns(ns->user_ns); 100 - ns_free_inum(&ns->ns); 101 - kmem_cache_free(uts_ns_cache, ns); 102 - } 103 - 104 - static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) 105 - { 106 - return container_of(ns, struct uts_namespace, ns); 111 + ns_common_free(ns); 112 + /* Concurrent nstree traversal depends on a grace period. 
*/ 113 + kfree_rcu(ns, ns.ns_rcu); 107 114 } 108 115 109 116 static struct ns_common *utsns_get(struct task_struct *task) ··· 146 159 147 160 const struct proc_ns_operations utsns_operations = { 148 161 .name = "uts", 149 - .type = CLONE_NEWUTS, 150 162 .get = utsns_get, 151 163 .put = utsns_put, 152 164 .install = utsns_install, ··· 160 174 offsetof(struct uts_namespace, name), 161 175 sizeof_field(struct uts_namespace, name), 162 176 NULL); 177 + ns_tree_add(&init_uts_ns); 163 178 }

+3 -3

net/core/net-sysfs.c

··· 1328 1328 struct netdev_rx_queue *queue = &dev->_rx[i]; 1329 1329 struct kobject *kobj = &queue->kobj; 1330 1330 1331 - if (!refcount_read(&dev_net(dev)->ns.count)) 1331 + if (!check_net(dev_net(dev))) 1332 1332 kobj->uevent_suppress = 1; 1333 1333 if (dev->sysfs_rx_queue_group) 1334 1334 sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); ··· 2061 2061 while (--i >= new_num) { 2062 2062 struct netdev_queue *queue = dev->_tx + i; 2063 2063 2064 - if (!refcount_read(&dev_net(dev)->ns.count)) 2064 + if (!check_net(dev_net(dev))) 2065 2065 queue->kobj.uevent_suppress = 1; 2066 2066 2067 2067 if (netdev_uses_bql(dev)) ··· 2315 2315 { 2316 2316 struct device *dev = &ndev->dev; 2317 2317 2318 - if (!refcount_read(&dev_net(ndev)->ns.count)) 2318 + if (!check_net(dev_net(ndev))) 2319 2319 dev_set_uevent_suppress(dev, 1); 2320 2320 2321 2321 kobject_get(&dev->kobj);

+26 -32

net/core/net_namespace.c

··· 20 20 #include <linux/sched/task.h> 21 21 #include <linux/uidgid.h> 22 22 #include <linux/proc_fs.h> 23 + #include <linux/nstree.h> 23 24 24 25 #include <net/aligned_data.h> 25 26 #include <net/sock.h> ··· 315 314 { 316 315 int id; 317 316 318 - if (refcount_read(&net->ns.count) == 0) 317 + if (!check_net(net)) 319 318 return NETNSA_NSID_NOT_ASSIGNED; 320 319 321 320 spin_lock(&net->nsid_lock); ··· 398 397 } 399 398 400 399 /* init code that must occur even if setup_net() is not called. */ 401 - static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) 400 + static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) 402 401 { 402 + int ret; 403 + 404 + ret = ns_common_init(net); 405 + if (ret) 406 + return ret; 407 + 403 408 refcount_set(&net->passive, 1); 404 - refcount_set(&net->ns.count, 1); 405 409 ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); 406 410 ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); 407 411 ··· 426 420 INIT_LIST_HEAD(&net->ptype_all); 427 421 INIT_LIST_HEAD(&net->ptype_specific); 428 422 preinit_net_sysctl(net); 423 + return 0; 429 424 } 430 425 431 426 /* ··· 439 432 LIST_HEAD(net_exit_list); 440 433 int error = 0; 441 434 442 - net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); 435 + net->net_cookie = ns_tree_gen_id(&net->ns); 443 436 444 437 list_for_each_entry(ops, &pernet_list, list) { 445 438 error = ops_init(ops, net); ··· 449 442 down_write(&net_rwsem); 450 443 list_add_tail_rcu(&net->list, &net_namespace_list); 451 444 up_write(&net_rwsem); 445 + ns_tree_add_raw(net); 452 446 out: 453 447 return error; 454 448 ··· 567 559 goto dec_ucounts; 568 560 } 569 561 570 - preinit_net(net, user_ns); 562 + rv = preinit_net(net, user_ns); 563 + if (rv < 0) 564 + goto dec_ucounts; 571 565 net->ucounts = ucounts; 572 566 get_user_ns(user_ns); 573 567 ··· 583 573 584 574 if (rv < 0) { 585 575 put_userns: 576 + ns_common_free(net); 586 577 
#ifdef CONFIG_KEYS 587 578 key_remove_domain(net->key_domain); 588 579 #endif ··· 670 659 671 660 /* Don't let anyone else find us. */ 672 661 down_write(&net_rwsem); 673 - llist_for_each_entry(net, net_kill_list, cleanup_list) 662 + llist_for_each_entry(net, net_kill_list, cleanup_list) { 663 + ns_tree_remove(net); 674 664 list_del_rcu(&net->list); 665 + } 675 666 /* Cache last net. After we unlock rtnl, no one new net 676 667 * added to net_namespace_list can assign nsid pointer 677 668 * to a net from net_kill_list (see peernet2id_alloc()). ··· 706 693 /* Finally it is safe to free my network namespace structure */ 707 694 list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { 708 695 list_del_init(&net->exit_list); 696 + ns_common_free(net); 709 697 dec_net_namespaces(net->ucounts); 710 698 #ifdef CONFIG_KEYS 711 699 key_remove_domain(net->key_domain); ··· 826 812 827 813 static __net_init int net_ns_net_init(struct net *net) 828 814 { 829 - #ifdef CONFIG_NET_NS 830 - net->ns.ops = &netns_operations; 831 - #endif 832 - net->ns.inum = PROC_NET_INIT_INO; 833 - if (net != &init_net) { 834 - int ret = ns_alloc_inum(&net->ns); 835 - if (ret) 836 - return ret; 837 - } 838 815 net_ns_net_debugfs(net); 839 816 return 0; 840 817 } 841 818 842 - static __net_exit void net_ns_net_exit(struct net *net) 843 - { 844 - /* 845 - * Initial network namespace doesn't exit so we don't need any 846 - * special checks here. 847 - */ 848 - ns_free_inum(&net->ns); 849 - } 850 - 851 819 static struct pernet_operations __net_initdata net_ns_ops = { 852 820 .init = net_ns_net_init, 853 - .exit = net_ns_net_exit, 854 821 }; 855 822 856 823 static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { ··· 1277 1282 #ifdef CONFIG_KEYS 1278 1283 init_net.key_domain = &init_net_key_domain; 1279 1284 #endif 1280 - preinit_net(&init_net, &init_user_ns); 1285 + /* 1286 + * This currently cannot fail as the initial network namespace 1287 + * has a static inode number. 
1288 + */ 1289 + if (preinit_net(&init_net, &init_user_ns)) 1290 + panic("Could not preinitialize the initial network namespace"); 1281 1291 1282 1292 down_write(&pernet_ops_rwsem); 1283 1293 if (setup_net(&init_net)) ··· 1517 1517 return net ? &net->ns : NULL; 1518 1518 } 1519 1519 1520 - static inline struct net *to_net_ns(struct ns_common *ns) 1521 - { 1522 - return container_of(ns, struct net, ns); 1523 - } 1524 - 1525 1520 static void netns_put(struct ns_common *ns) 1526 1521 { 1527 1522 put_net(to_net_ns(ns)); ··· 1543 1548 1544 1549 const struct proc_ns_operations netns_operations = { 1545 1550 .name = "net", 1546 - .type = CLONE_NEWNET, 1547 1551 .get = netns_get, 1548 1552 .put = netns_put, 1549 1553 .install = netns_install,

+2 -2

net/ipv4/inet_timewait_sock.c

··· 329 329 TCPF_NEW_SYN_RECV)) 330 330 continue; 331 331 332 - if (refcount_read(&sock_net(sk)->ns.count)) 332 + if (check_net(sock_net(sk))) 333 333 continue; 334 334 335 335 if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 336 336 continue; 337 337 338 - if (refcount_read(&sock_net(sk)->ns.count)) { 338 + if (check_net(sock_net(sk))) { 339 339 sock_gen_put(sk); 340 340 goto restart; 341 341 }

+1 -1

net/ipv4/tcp_metrics.c

··· 912 912 spin_lock_bh(&tcp_metrics_lock); 913 913 for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { 914 914 match = net ? net_eq(tm_net(tm), net) : 915 - !refcount_read(&tm_net(tm)->ns.count); 915 + !check_net(tm_net(tm)); 916 916 if (match) { 917 917 rcu_assign_pointer(*pp, tm->tcpm_next); 918 918 kfree_rcu(tm, rcu_head);

+15 -2

tools/include/uapi/linux/nsfs.h

··· 16 16 #define NS_GET_NSTYPE _IO(NSIO, 0x3) 17 17 /* Get owner UID (in the caller's user namespace) for a user namespace */ 18 18 #define NS_GET_OWNER_UID _IO(NSIO, 0x4) 19 - /* Get the id for a mount namespace */ 20 - #define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) 21 19 /* Translate pid from target pid namespace into the caller's pid namespace. */ 22 20 #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) 23 21 /* Return thread-group leader id of pid in the callers pid namespace. */ ··· 39 41 #define NS_MNT_GET_NEXT _IOR(NSIO, 11, struct mnt_ns_info) 40 42 /* Get previous namespace. */ 41 43 #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) 44 + 45 + /* Retrieve namespace identifiers. */ 46 + #define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) 47 + #define NS_GET_ID _IOR(NSIO, 13, __u64) 48 + 49 + enum init_ns_ino { 50 + IPC_NS_INIT_INO = 0xEFFFFFFFU, 51 + UTS_NS_INIT_INO = 0xEFFFFFFEU, 52 + USER_NS_INIT_INO = 0xEFFFFFFDU, 53 + PID_NS_INIT_INO = 0xEFFFFFFCU, 54 + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, 55 + TIME_NS_INIT_INO = 0xEFFFFFFAU, 56 + NET_NS_INIT_INO = 0xEFFFFFF9U, 57 + MNT_NS_INIT_INO = 0xEFFFFFF8U, 58 + }; 42 59 43 60 #endif /* __LINUX_NSFS_H */

+3

tools/testing/selftests/namespaces/.gitignore

··· 1 + nsid_test 2 + file_handle_test 3 + init_ino_test

+7

tools/testing/selftests/namespaces/Makefile

··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) 3 + 4 + TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test 5 + 6 + include ../lib.mk 7 +

+7

tools/testing/selftests/namespaces/config

··· 1 + CONFIG_UTS_NS=y 2 + CONFIG_TIME_NS=y 3 + CONFIG_IPC_NS=y 4 + CONFIG_USER_NS=y 5 + CONFIG_PID_NS=y 6 + CONFIG_NET_NS=y 7 + CONFIG_CGROUPS=y

+1429

tools/testing/selftests/namespaces/file_handle_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <grp.h> 6 + #include <limits.h> 7 + #include <sched.h> 8 + #include <stdio.h> 9 + #include <stdlib.h> 10 + #include <string.h> 11 + #include <sys/mount.h> 12 + #include <sys/stat.h> 13 + #include <sys/types.h> 14 + #include <sys/wait.h> 15 + #include <unistd.h> 16 + #include <linux/unistd.h> 17 + #include "../kselftest_harness.h" 18 + 19 + #ifndef FD_NSFS_ROOT 20 + #define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ 21 + #endif 22 + 23 + TEST(nsfs_net_handle) 24 + { 25 + struct file_handle *handle; 26 + int mount_id; 27 + int ret; 28 + int fd; 29 + int ns_fd; 30 + struct stat st1, st2; 31 + 32 + /* Drop to unprivileged uid/gid */ 33 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 34 + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ 35 + 36 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 37 + ASSERT_NE(handle, NULL); 38 + 39 + /* Open a namespace file descriptor */ 40 + ns_fd = open("/proc/self/ns/net", O_RDONLY); 41 + ASSERT_GE(ns_fd, 0); 42 + 43 + /* Get handle for the namespace */ 44 + handle->handle_bytes = MAX_HANDLE_SZ; 45 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 46 + if (ret < 0 && errno == EOPNOTSUPP) { 47 + SKIP(free(handle); close(ns_fd); 48 + return, "nsfs doesn't support file handles"); 49 + } 50 + ASSERT_EQ(ret, 0); 51 + ASSERT_GT(handle->handle_bytes, 0); 52 + 53 + /* Try to open using FD_NSFS_ROOT as unprivileged user */ 54 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 55 + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 56 + SKIP(free(handle); close(ns_fd); 57 + return, 58 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 59 + } 60 + if (fd < 0 && errno == EPERM) { 61 + SKIP(free(handle); close(ns_fd); 62 + return, 63 + "Permission denied for unprivileged user (expected)"); 64 + } 65 + ASSERT_GE(fd, 0); 66 + 67 + /* Verify we 
opened the correct namespace */ 68 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 69 + ASSERT_EQ(fstat(fd, &st2), 0); 70 + ASSERT_EQ(st1.st_ino, st2.st_ino); 71 + ASSERT_EQ(st1.st_dev, st2.st_dev); 72 + 73 + close(fd); 74 + close(ns_fd); 75 + free(handle); 76 + } 77 + 78 + TEST(nsfs_uts_handle) 79 + { 80 + struct file_handle *handle; 81 + int mount_id; 82 + int ret; 83 + int fd; 84 + int ns_fd; 85 + struct stat st1, st2; 86 + 87 + /* Drop to unprivileged uid/gid */ 88 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 89 + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ 90 + 91 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 92 + ASSERT_NE(handle, NULL); 93 + 94 + /* Open UTS namespace file descriptor */ 95 + ns_fd = open("/proc/self/ns/uts", O_RDONLY); 96 + ASSERT_GE(ns_fd, 0); 97 + 98 + /* Get handle for the namespace */ 99 + handle->handle_bytes = MAX_HANDLE_SZ; 100 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 101 + if (ret < 0 && errno == EOPNOTSUPP) { 102 + SKIP(free(handle); close(ns_fd); 103 + return, "nsfs doesn't support file handles"); 104 + } 105 + ASSERT_EQ(ret, 0); 106 + ASSERT_GT(handle->handle_bytes, 0); 107 + 108 + /* Try to open using FD_NSFS_ROOT */ 109 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 110 + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 111 + SKIP(free(handle); close(ns_fd); 112 + return, 113 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 114 + } 115 + ASSERT_GE(fd, 0); 116 + 117 + /* Verify we opened the correct namespace */ 118 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 119 + ASSERT_EQ(fstat(fd, &st2), 0); 120 + ASSERT_EQ(st1.st_ino, st2.st_ino); 121 + ASSERT_EQ(st1.st_dev, st2.st_dev); 122 + 123 + close(fd); 124 + close(ns_fd); 125 + free(handle); 126 + } 127 + 128 + TEST(nsfs_ipc_handle) 129 + { 130 + struct file_handle *handle; 131 + int mount_id; 132 + int ret; 133 + int fd; 134 + int ns_fd; 135 + struct stat st1, st2; 136 + 137 + /* Drop to unprivileged uid/gid 
*/ 138 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 139 + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ 140 + 141 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 142 + ASSERT_NE(handle, NULL); 143 + 144 + /* Open IPC namespace file descriptor */ 145 + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); 146 + ASSERT_GE(ns_fd, 0); 147 + 148 + /* Get handle for the namespace */ 149 + handle->handle_bytes = MAX_HANDLE_SZ; 150 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 151 + if (ret < 0 && errno == EOPNOTSUPP) { 152 + SKIP(free(handle); close(ns_fd); 153 + return, "nsfs doesn't support file handles"); 154 + } 155 + ASSERT_EQ(ret, 0); 156 + ASSERT_GT(handle->handle_bytes, 0); 157 + 158 + /* Try to open using FD_NSFS_ROOT */ 159 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 160 + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 161 + SKIP(free(handle); close(ns_fd); 162 + return, 163 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 164 + } 165 + ASSERT_GE(fd, 0); 166 + 167 + /* Verify we opened the correct namespace */ 168 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 169 + ASSERT_EQ(fstat(fd, &st2), 0); 170 + ASSERT_EQ(st1.st_ino, st2.st_ino); 171 + ASSERT_EQ(st1.st_dev, st2.st_dev); 172 + 173 + close(fd); 174 + close(ns_fd); 175 + free(handle); 176 + } 177 + 178 + TEST(nsfs_pid_handle) 179 + { 180 + struct file_handle *handle; 181 + int mount_id; 182 + int ret; 183 + int fd; 184 + int ns_fd; 185 + struct stat st1, st2; 186 + 187 + /* Drop to unprivileged uid/gid */ 188 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 189 + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ 190 + 191 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 192 + ASSERT_NE(handle, NULL); 193 + 194 + /* Open PID namespace file descriptor */ 195 + ns_fd = open("/proc/self/ns/pid", O_RDONLY); 196 + ASSERT_GE(ns_fd, 0); 197 + 198 + /* Get handle for the namespace */ 199 + handle->handle_bytes = 
MAX_HANDLE_SZ; 200 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 201 + if (ret < 0 && errno == EOPNOTSUPP) { 202 + SKIP(free(handle); close(ns_fd); 203 + return, "nsfs doesn't support file handles"); 204 + } 205 + ASSERT_EQ(ret, 0); 206 + ASSERT_GT(handle->handle_bytes, 0); 207 + 208 + /* Try to open using FD_NSFS_ROOT */ 209 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 210 + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 211 + SKIP(free(handle); close(ns_fd); 212 + return, 213 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 214 + } 215 + ASSERT_GE(fd, 0); 216 + 217 + /* Verify we opened the correct namespace */ 218 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 219 + ASSERT_EQ(fstat(fd, &st2), 0); 220 + ASSERT_EQ(st1.st_ino, st2.st_ino); 221 + ASSERT_EQ(st1.st_dev, st2.st_dev); 222 + 223 + close(fd); 224 + close(ns_fd); 225 + free(handle); 226 + } 227 + 228 + TEST(nsfs_mnt_handle) 229 + { 230 + struct file_handle *handle; 231 + int mount_id; 232 + int ret; 233 + int fd; 234 + int ns_fd; 235 + struct stat st1, st2; 236 + 237 + /* Drop to unprivileged uid/gid */ 238 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 239 + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ 240 + 241 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 242 + ASSERT_NE(handle, NULL); 243 + 244 + /* Open mount namespace file descriptor */ 245 + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); 246 + ASSERT_GE(ns_fd, 0); 247 + 248 + /* Get handle for the namespace */ 249 + handle->handle_bytes = MAX_HANDLE_SZ; 250 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 251 + if (ret < 0 && errno == EOPNOTSUPP) { 252 + SKIP(free(handle); close(ns_fd); 253 + return, "nsfs doesn't support file handles"); 254 + } 255 + ASSERT_EQ(ret, 0); 256 + ASSERT_GT(handle->handle_bytes, 0); 257 + 258 + /* Try to open using FD_NSFS_ROOT */ 259 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 260 + if (fd < 0 && (errno == 
EINVAL || errno == EOPNOTSUPP)) { 261 + SKIP(free(handle); close(ns_fd); 262 + return, 263 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 264 + } 265 + ASSERT_GE(fd, 0); 266 + 267 + /* Verify we opened the correct namespace */ 268 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 269 + ASSERT_EQ(fstat(fd, &st2), 0); 270 + ASSERT_EQ(st1.st_ino, st2.st_ino); 271 + ASSERT_EQ(st1.st_dev, st2.st_dev); 272 + 273 + close(fd); 274 + close(ns_fd); 275 + free(handle); 276 + } 277 + 278 + TEST(nsfs_user_handle) 279 + { 280 + struct file_handle *handle; 281 + int mount_id; 282 + int ret; 283 + int fd; 284 + int ns_fd; 285 + struct stat st1, st2; 286 + 287 + /* Drop to unprivileged uid/gid */ 288 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 289 + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ 290 + 291 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 292 + ASSERT_NE(handle, NULL); 293 + 294 + /* Open user namespace file descriptor */ 295 + ns_fd = open("/proc/self/ns/user", O_RDONLY); 296 + ASSERT_GE(ns_fd, 0); 297 + 298 + /* Get handle for the namespace */ 299 + handle->handle_bytes = MAX_HANDLE_SZ; 300 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 301 + if (ret < 0 && errno == EOPNOTSUPP) { 302 + SKIP(free(handle); close(ns_fd); 303 + return, "nsfs doesn't support file handles"); 304 + } 305 + ASSERT_EQ(ret, 0); 306 + ASSERT_GT(handle->handle_bytes, 0); 307 + 308 + /* Try to open using FD_NSFS_ROOT */ 309 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 310 + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 311 + SKIP(free(handle); close(ns_fd); 312 + return, 313 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 314 + } 315 + ASSERT_GE(fd, 0); 316 + 317 + /* Verify we opened the correct namespace */ 318 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 319 + ASSERT_EQ(fstat(fd, &st2), 0); 320 + ASSERT_EQ(st1.st_ino, st2.st_ino); 321 + ASSERT_EQ(st1.st_dev, st2.st_dev); 322 + 323 + close(fd); 324 + close(ns_fd); 
325 + free(handle); 326 + } 327 + 328 + TEST(nsfs_cgroup_handle) 329 + { 330 + struct file_handle *handle; 331 + int mount_id; 332 + int ret; 333 + int fd; 334 + int ns_fd; 335 + struct stat st1, st2; 336 + 337 + /* Drop to unprivileged uid/gid */ 338 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 339 + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ 340 + 341 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 342 + ASSERT_NE(handle, NULL); 343 + 344 + /* Open cgroup namespace file descriptor */ 345 + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); 346 + if (ns_fd < 0) { 347 + SKIP(free(handle); return, "cgroup namespace not available"); 348 + } 349 + 350 + /* Get handle for the namespace */ 351 + handle->handle_bytes = MAX_HANDLE_SZ; 352 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 353 + if (ret < 0 && errno == EOPNOTSUPP) { 354 + SKIP(free(handle); close(ns_fd); 355 + return, "nsfs doesn't support file handles"); 356 + } 357 + ASSERT_EQ(ret, 0); 358 + ASSERT_GT(handle->handle_bytes, 0); 359 + 360 + /* Try to open using FD_NSFS_ROOT */ 361 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 362 + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 363 + SKIP(free(handle); close(ns_fd); 364 + return, 365 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 366 + } 367 + ASSERT_GE(fd, 0); 368 + 369 + /* Verify we opened the correct namespace */ 370 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 371 + ASSERT_EQ(fstat(fd, &st2), 0); 372 + ASSERT_EQ(st1.st_ino, st2.st_ino); 373 + ASSERT_EQ(st1.st_dev, st2.st_dev); 374 + 375 + close(fd); 376 + close(ns_fd); 377 + free(handle); 378 + } 379 + 380 + TEST(nsfs_time_handle) 381 + { 382 + struct file_handle *handle; 383 + int mount_id; 384 + int ret; 385 + int fd; 386 + int ns_fd; 387 + struct stat st1, st2; 388 + 389 + /* Drop to unprivileged uid/gid */ 390 + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ 391 + ASSERT_EQ(setresuid(65534, 65534, 65534), 
0); /* nobody */ 392 + 393 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 394 + ASSERT_NE(handle, NULL); 395 + 396 + /* Open time namespace file descriptor */ 397 + ns_fd = open("/proc/self/ns/time", O_RDONLY); 398 + if (ns_fd < 0) { 399 + SKIP(free(handle); return, "time namespace not available"); 400 + } 401 + 402 + /* Get handle for the namespace */ 403 + handle->handle_bytes = MAX_HANDLE_SZ; 404 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 405 + if (ret < 0 && errno == EOPNOTSUPP) { 406 + SKIP(free(handle); close(ns_fd); 407 + return, "nsfs doesn't support file handles"); 408 + } 409 + ASSERT_EQ(ret, 0); 410 + ASSERT_GT(handle->handle_bytes, 0); 411 + 412 + /* Try to open using FD_NSFS_ROOT */ 413 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 414 + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 415 + SKIP(free(handle); close(ns_fd); 416 + return, 417 + "open_by_handle_at with FD_NSFS_ROOT not supported"); 418 + } 419 + ASSERT_GE(fd, 0); 420 + 421 + /* Verify we opened the correct namespace */ 422 + ASSERT_EQ(fstat(ns_fd, &st1), 0); 423 + ASSERT_EQ(fstat(fd, &st2), 0); 424 + ASSERT_EQ(st1.st_ino, st2.st_ino); 425 + ASSERT_EQ(st1.st_dev, st2.st_dev); 426 + 427 + close(fd); 428 + close(ns_fd); 429 + free(handle); 430 + } 431 + 432 + TEST(nsfs_user_net_namespace_isolation) 433 + { 434 + struct file_handle *handle; 435 + int mount_id; 436 + int ret; 437 + int fd; 438 + int ns_fd; 439 + pid_t pid; 440 + int status; 441 + int pipefd[2]; 442 + char result; 443 + 444 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 445 + ASSERT_NE(handle, NULL); 446 + 447 + /* Create pipe for communication */ 448 + ASSERT_EQ(pipe(pipefd), 0); 449 + 450 + /* Get handle for current network namespace */ 451 + ns_fd = open("/proc/self/ns/net", O_RDONLY); 452 + ASSERT_GE(ns_fd, 0); 453 + 454 + handle->handle_bytes = MAX_HANDLE_SZ; 455 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 456 + if (ret < 0 && errno == 
EOPNOTSUPP) { 457 + SKIP(free(handle); close(ns_fd); close(pipefd[0]); 458 + close(pipefd[1]); 459 + return, "nsfs doesn't support file handles"); 460 + } 461 + ASSERT_EQ(ret, 0); 462 + close(ns_fd); 463 + 464 + pid = fork(); 465 + ASSERT_GE(pid, 0); 466 + 467 + if (pid == 0) { 468 + /* Child process */ 469 + close(pipefd[0]); 470 + 471 + /* First create new user namespace to drop privileges */ 472 + ret = unshare(CLONE_NEWUSER); 473 + if (ret < 0) { 474 + write(pipefd[1], "U", 475 + 1); /* Unable to create user namespace */ 476 + close(pipefd[1]); 477 + exit(0); 478 + } 479 + 480 + /* Write uid/gid mappings to maintain some capabilities */ 481 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 482 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 483 + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 484 + 485 + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { 486 + write(pipefd[1], "M", 1); /* Unable to set mappings */ 487 + close(pipefd[1]); 488 + exit(0); 489 + } 490 + 491 + /* Disable setgroups to allow gid mapping */ 492 + write(setgroups_fd, "deny", 4); 493 + close(setgroups_fd); 494 + 495 + /* Map current uid/gid to root in the new namespace */ 496 + char mapping[64]; 497 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 498 + write(uid_map_fd, mapping, strlen(mapping)); 499 + close(uid_map_fd); 500 + 501 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 502 + write(gid_map_fd, mapping, strlen(mapping)); 503 + close(gid_map_fd); 504 + 505 + /* Now create new network namespace */ 506 + ret = unshare(CLONE_NEWNET); 507 + if (ret < 0) { 508 + write(pipefd[1], "N", 509 + 1); /* Unable to create network namespace */ 510 + close(pipefd[1]); 511 + exit(0); 512 + } 513 + 514 + /* Try to open parent's network namespace handle from new user+net namespace */ 515 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 516 + 517 + if (fd >= 0) { 518 + /* Should NOT succeed - we're in a different user namespace */ 519 + 
write(pipefd[1], "S", 1); /* Unexpected success */ 520 + close(fd); 521 + } else if (errno == ESTALE) { 522 + /* Expected: Stale file handle */ 523 + write(pipefd[1], "P", 1); 524 + } else { 525 + /* Other error */ 526 + write(pipefd[1], "F", 1); 527 + } 528 + 529 + close(pipefd[1]); 530 + exit(0); 531 + } 532 + 533 + /* Parent process */ 534 + close(pipefd[1]); 535 + ASSERT_EQ(read(pipefd[0], &result, 1), 1); 536 + 537 + waitpid(pid, &status, 0); 538 + ASSERT_TRUE(WIFEXITED(status)); 539 + ASSERT_EQ(WEXITSTATUS(status), 0); 540 + 541 + if (result == 'U') { 542 + SKIP(free(handle); close(pipefd[0]); 543 + return, "Cannot create new user namespace"); 544 + } 545 + if (result == 'M') { 546 + SKIP(free(handle); close(pipefd[0]); 547 + return, "Cannot set uid/gid mappings"); 548 + } 549 + if (result == 'N') { 550 + SKIP(free(handle); close(pipefd[0]); 551 + return, "Cannot create new network namespace"); 552 + } 553 + 554 + /* Should fail with permission denied since we're in a different user namespace */ 555 + ASSERT_EQ(result, 'P'); 556 + 557 + close(pipefd[0]); 558 + free(handle); 559 + } 560 + 561 + TEST(nsfs_user_uts_namespace_isolation) 562 + { 563 + struct file_handle *handle; 564 + int mount_id; 565 + int ret; 566 + int fd; 567 + int ns_fd; 568 + pid_t pid; 569 + int status; 570 + int pipefd[2]; 571 + char result; 572 + 573 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 574 + ASSERT_NE(handle, NULL); 575 + 576 + /* Create pipe for communication */ 577 + ASSERT_EQ(pipe(pipefd), 0); 578 + 579 + /* Get handle for current UTS namespace */ 580 + ns_fd = open("/proc/self/ns/uts", O_RDONLY); 581 + ASSERT_GE(ns_fd, 0); 582 + 583 + handle->handle_bytes = MAX_HANDLE_SZ; 584 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 585 + if (ret < 0 && errno == EOPNOTSUPP) { 586 + SKIP(free(handle); close(ns_fd); close(pipefd[0]); 587 + close(pipefd[1]); 588 + return, "nsfs doesn't support file handles"); 589 + } 590 + ASSERT_EQ(ret, 0); 591 + 
close(ns_fd); 592 + 593 + pid = fork(); 594 + ASSERT_GE(pid, 0); 595 + 596 + if (pid == 0) { 597 + /* Child process */ 598 + close(pipefd[0]); 599 + 600 + /* First create new user namespace to drop privileges */ 601 + ret = unshare(CLONE_NEWUSER); 602 + if (ret < 0) { 603 + write(pipefd[1], "U", 604 + 1); /* Unable to create user namespace */ 605 + close(pipefd[1]); 606 + exit(0); 607 + } 608 + 609 + /* Write uid/gid mappings to maintain some capabilities */ 610 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 611 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 612 + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 613 + 614 + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { 615 + write(pipefd[1], "M", 1); /* Unable to set mappings */ 616 + close(pipefd[1]); 617 + exit(0); 618 + } 619 + 620 + /* Disable setgroups to allow gid mapping */ 621 + write(setgroups_fd, "deny", 4); 622 + close(setgroups_fd); 623 + 624 + /* Map current uid/gid to root in the new namespace */ 625 + char mapping[64]; 626 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 627 + write(uid_map_fd, mapping, strlen(mapping)); 628 + close(uid_map_fd); 629 + 630 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 631 + write(gid_map_fd, mapping, strlen(mapping)); 632 + close(gid_map_fd); 633 + 634 + /* Now create new UTS namespace */ 635 + ret = unshare(CLONE_NEWUTS); 636 + if (ret < 0) { 637 + write(pipefd[1], "N", 638 + 1); /* Unable to create UTS namespace */ 639 + close(pipefd[1]); 640 + exit(0); 641 + } 642 + 643 + /* Try to open parent's UTS namespace handle from new user+uts namespace */ 644 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 645 + 646 + if (fd >= 0) { 647 + /* Should NOT succeed - we're in a different user namespace */ 648 + write(pipefd[1], "S", 1); /* Unexpected success */ 649 + close(fd); 650 + } else if (errno == ESTALE) { 651 + /* Expected: Stale file handle */ 652 + write(pipefd[1], "P", 1); 653 + } else { 654 + 
/* Other error */ 655 + write(pipefd[1], "F", 1); 656 + } 657 + 658 + close(pipefd[1]); 659 + exit(0); 660 + } 661 + 662 + /* Parent process */ 663 + close(pipefd[1]); 664 + ASSERT_EQ(read(pipefd[0], &result, 1), 1); 665 + 666 + waitpid(pid, &status, 0); 667 + ASSERT_TRUE(WIFEXITED(status)); 668 + ASSERT_EQ(WEXITSTATUS(status), 0); 669 + 670 + if (result == 'U') { 671 + SKIP(free(handle); close(pipefd[0]); 672 + return, "Cannot create new user namespace"); 673 + } 674 + if (result == 'M') { 675 + SKIP(free(handle); close(pipefd[0]); 676 + return, "Cannot set uid/gid mappings"); 677 + } 678 + if (result == 'N') { 679 + SKIP(free(handle); close(pipefd[0]); 680 + return, "Cannot create new UTS namespace"); 681 + } 682 + 683 + /* Should fail with ESTALE since we're in a different user namespace */ 684 + ASSERT_EQ(result, 'P'); 685 + 686 + close(pipefd[0]); 687 + free(handle); 688 + } 689 + 690 + TEST(nsfs_user_ipc_namespace_isolation) 691 + { 692 + struct file_handle *handle; 693 + int mount_id; 694 + int ret; 695 + int fd; 696 + int ns_fd; 697 + pid_t pid; 698 + int status; 699 + int pipefd[2]; 700 + char result; 701 + 702 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 703 + ASSERT_NE(handle, NULL); 704 + 705 + /* Create pipe for communication */ 706 + ASSERT_EQ(pipe(pipefd), 0); 707 + 708 + /* Get handle for current IPC namespace */ 709 + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); 710 + ASSERT_GE(ns_fd, 0); 711 + 712 + handle->handle_bytes = MAX_HANDLE_SZ; 713 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 714 + if (ret < 0 && errno == EOPNOTSUPP) { 715 + SKIP(free(handle); close(ns_fd); close(pipefd[0]); 716 + close(pipefd[1]); 717 + return, "nsfs doesn't support file handles"); 718 + } 719 + ASSERT_EQ(ret, 0); 720 + close(ns_fd); 721 + 722 + pid = fork(); 723 + ASSERT_GE(pid, 0); 724 + 725 + if (pid == 0) { 726 + /* Child process */ 727 + close(pipefd[0]); 728 + 729 + /* First create new user namespace to drop privileges */ 730 + ret 
= unshare(CLONE_NEWUSER); 731 + if (ret < 0) { 732 + write(pipefd[1], "U", 733 + 1); /* Unable to create user namespace */ 734 + close(pipefd[1]); 735 + exit(0); 736 + } 737 + 738 + /* Write uid/gid mappings to maintain some capabilities */ 739 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 740 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 741 + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 742 + 743 + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { 744 + write(pipefd[1], "M", 1); /* Unable to set mappings */ 745 + close(pipefd[1]); 746 + exit(0); 747 + } 748 + 749 + /* Disable setgroups to allow gid mapping */ 750 + write(setgroups_fd, "deny", 4); 751 + close(setgroups_fd); 752 + 753 + /* Map current uid/gid to root in the new namespace */ 754 + char mapping[64]; 755 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 756 + write(uid_map_fd, mapping, strlen(mapping)); 757 + close(uid_map_fd); 758 + 759 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 760 + write(gid_map_fd, mapping, strlen(mapping)); 761 + close(gid_map_fd); 762 + 763 + /* Now create new IPC namespace */ 764 + ret = unshare(CLONE_NEWIPC); 765 + if (ret < 0) { 766 + write(pipefd[1], "N", 767 + 1); /* Unable to create IPC namespace */ 768 + close(pipefd[1]); 769 + exit(0); 770 + } 771 + 772 + /* Try to open parent's IPC namespace handle from new user+ipc namespace */ 773 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 774 + 775 + if (fd >= 0) { 776 + /* Should NOT succeed - we're in a different user namespace */ 777 + write(pipefd[1], "S", 1); /* Unexpected success */ 778 + close(fd); 779 + } else if (errno == ESTALE) { 780 + /* Expected: Stale file handle */ 781 + write(pipefd[1], "P", 1); 782 + } else { 783 + /* Other error */ 784 + write(pipefd[1], "F", 1); 785 + } 786 + 787 + close(pipefd[1]); 788 + exit(0); 789 + } 790 + 791 + /* Parent process */ 792 + close(pipefd[1]); 793 + ASSERT_EQ(read(pipefd[0], &result, 1), 1); 794 + 
795 + waitpid(pid, &status, 0); 796 + ASSERT_TRUE(WIFEXITED(status)); 797 + ASSERT_EQ(WEXITSTATUS(status), 0); 798 + 799 + if (result == 'U') { 800 + SKIP(free(handle); close(pipefd[0]); 801 + return, "Cannot create new user namespace"); 802 + } 803 + if (result == 'M') { 804 + SKIP(free(handle); close(pipefd[0]); 805 + return, "Cannot set uid/gid mappings"); 806 + } 807 + if (result == 'N') { 808 + SKIP(free(handle); close(pipefd[0]); 809 + return, "Cannot create new IPC namespace"); 810 + } 811 + 812 + /* Should fail with ESTALE since we're in a different user namespace */ 813 + ASSERT_EQ(result, 'P'); 814 + 815 + close(pipefd[0]); 816 + free(handle); 817 + } 818 + 819 + TEST(nsfs_user_mnt_namespace_isolation) 820 + { 821 + struct file_handle *handle; 822 + int mount_id; 823 + int ret; 824 + int fd; 825 + int ns_fd; 826 + pid_t pid; 827 + int status; 828 + int pipefd[2]; 829 + char result; 830 + 831 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 832 + ASSERT_NE(handle, NULL); 833 + 834 + /* Create pipe for communication */ 835 + ASSERT_EQ(pipe(pipefd), 0); 836 + 837 + /* Get handle for current mount namespace */ 838 + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); 839 + ASSERT_GE(ns_fd, 0); 840 + 841 + handle->handle_bytes = MAX_HANDLE_SZ; 842 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 843 + if (ret < 0 && errno == EOPNOTSUPP) { 844 + SKIP(free(handle); close(ns_fd); close(pipefd[0]); 845 + close(pipefd[1]); 846 + return, "nsfs doesn't support file handles"); 847 + } 848 + ASSERT_EQ(ret, 0); 849 + close(ns_fd); 850 + 851 + pid = fork(); 852 + ASSERT_GE(pid, 0); 853 + 854 + if (pid == 0) { 855 + /* Child process */ 856 + close(pipefd[0]); 857 + 858 + /* First create new user namespace to drop privileges */ 859 + ret = unshare(CLONE_NEWUSER); 860 + if (ret < 0) { 861 + write(pipefd[1], "U", 862 + 1); /* Unable to create user namespace */ 863 + close(pipefd[1]); 864 + exit(0); 865 + } 866 + 867 + /* Write uid/gid mappings to maintain 
some capabilities */ 868 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 869 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 870 + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 871 + 872 + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { 873 + write(pipefd[1], "M", 1); /* Unable to set mappings */ 874 + close(pipefd[1]); 875 + exit(0); 876 + } 877 + 878 + /* Disable setgroups to allow gid mapping */ 879 + write(setgroups_fd, "deny", 4); 880 + close(setgroups_fd); 881 + 882 + /* Map current uid/gid to root in the new namespace */ 883 + char mapping[64]; 884 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 885 + write(uid_map_fd, mapping, strlen(mapping)); 886 + close(uid_map_fd); 887 + 888 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 889 + write(gid_map_fd, mapping, strlen(mapping)); 890 + close(gid_map_fd); 891 + 892 + /* Now create new mount namespace */ 893 + ret = unshare(CLONE_NEWNS); 894 + if (ret < 0) { 895 + write(pipefd[1], "N", 896 + 1); /* Unable to create mount namespace */ 897 + close(pipefd[1]); 898 + exit(0); 899 + } 900 + 901 + /* Try to open parent's mount namespace handle from new user+mnt namespace */ 902 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 903 + 904 + if (fd >= 0) { 905 + /* Should NOT succeed - we're in a different user namespace */ 906 + write(pipefd[1], "S", 1); /* Unexpected success */ 907 + close(fd); 908 + } else if (errno == ESTALE) { 909 + /* Expected: Stale file handle */ 910 + write(pipefd[1], "P", 1); 911 + } else { 912 + /* Other error */ 913 + write(pipefd[1], "F", 1); 914 + } 915 + 916 + close(pipefd[1]); 917 + exit(0); 918 + } 919 + 920 + /* Parent process */ 921 + close(pipefd[1]); 922 + ASSERT_EQ(read(pipefd[0], &result, 1), 1); 923 + 924 + waitpid(pid, &status, 0); 925 + ASSERT_TRUE(WIFEXITED(status)); 926 + ASSERT_EQ(WEXITSTATUS(status), 0); 927 + 928 + if (result == 'U') { 929 + SKIP(free(handle); close(pipefd[0]); 930 + return, "Cannot 
create new user namespace"); 931 + } 932 + if (result == 'M') { 933 + SKIP(free(handle); close(pipefd[0]); 934 + return, "Cannot set uid/gid mappings"); 935 + } 936 + if (result == 'N') { 937 + SKIP(free(handle); close(pipefd[0]); 938 + return, "Cannot create new mount namespace"); 939 + } 940 + 941 + /* Should fail with ESTALE since we're in a different user namespace */ 942 + ASSERT_EQ(result, 'P'); 943 + 944 + close(pipefd[0]); 945 + free(handle); 946 + } 947 + 948 + TEST(nsfs_user_cgroup_namespace_isolation) 949 + { 950 + struct file_handle *handle; 951 + int mount_id; 952 + int ret; 953 + int fd; 954 + int ns_fd; 955 + pid_t pid; 956 + int status; 957 + int pipefd[2]; 958 + char result; 959 + 960 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 961 + ASSERT_NE(handle, NULL); 962 + 963 + /* Create pipe for communication */ 964 + ASSERT_EQ(pipe(pipefd), 0); 965 + 966 + /* Get handle for current cgroup namespace */ 967 + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); 968 + if (ns_fd < 0) { 969 + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); 970 + return, "cgroup namespace not available"); 971 + } 972 + 973 + handle->handle_bytes = MAX_HANDLE_SZ; 974 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 975 + if (ret < 0 && errno == EOPNOTSUPP) { 976 + SKIP(free(handle); close(ns_fd); close(pipefd[0]); 977 + close(pipefd[1]); 978 + return, "nsfs doesn't support file handles"); 979 + } 980 + ASSERT_EQ(ret, 0); 981 + close(ns_fd); 982 + 983 + pid = fork(); 984 + ASSERT_GE(pid, 0); 985 + 986 + if (pid == 0) { 987 + /* Child process */ 988 + close(pipefd[0]); 989 + 990 + /* First create new user namespace to drop privileges */ 991 + ret = unshare(CLONE_NEWUSER); 992 + if (ret < 0) { 993 + write(pipefd[1], "U", 994 + 1); /* Unable to create user namespace */ 995 + close(pipefd[1]); 996 + exit(0); 997 + } 998 + 999 + /* Write uid/gid mappings to maintain some capabilities */ 1000 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 
1001 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 1002 + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 1003 + 1004 + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { 1005 + write(pipefd[1], "M", 1); /* Unable to set mappings */ 1006 + close(pipefd[1]); 1007 + exit(0); 1008 + } 1009 + 1010 + /* Disable setgroups to allow gid mapping */ 1011 + write(setgroups_fd, "deny", 4); 1012 + close(setgroups_fd); 1013 + 1014 + /* Map current uid/gid to root in the new namespace */ 1015 + char mapping[64]; 1016 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 1017 + write(uid_map_fd, mapping, strlen(mapping)); 1018 + close(uid_map_fd); 1019 + 1020 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 1021 + write(gid_map_fd, mapping, strlen(mapping)); 1022 + close(gid_map_fd); 1023 + 1024 + /* Now create new cgroup namespace */ 1025 + ret = unshare(CLONE_NEWCGROUP); 1026 + if (ret < 0) { 1027 + write(pipefd[1], "N", 1028 + 1); /* Unable to create cgroup namespace */ 1029 + close(pipefd[1]); 1030 + exit(0); 1031 + } 1032 + 1033 + /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */ 1034 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1035 + 1036 + if (fd >= 0) { 1037 + /* Should NOT succeed - we're in a different user namespace */ 1038 + write(pipefd[1], "S", 1); /* Unexpected success */ 1039 + close(fd); 1040 + } else if (errno == ESTALE) { 1041 + /* Expected: Stale file handle */ 1042 + write(pipefd[1], "P", 1); 1043 + } else { 1044 + /* Other error */ 1045 + write(pipefd[1], "F", 1); 1046 + } 1047 + 1048 + close(pipefd[1]); 1049 + exit(0); 1050 + } 1051 + 1052 + /* Parent process */ 1053 + close(pipefd[1]); 1054 + ASSERT_EQ(read(pipefd[0], &result, 1), 1); 1055 + 1056 + waitpid(pid, &status, 0); 1057 + ASSERT_TRUE(WIFEXITED(status)); 1058 + ASSERT_EQ(WEXITSTATUS(status), 0); 1059 + 1060 + if (result == 'U') { 1061 + SKIP(free(handle); close(pipefd[0]); 1062 + return, "Cannot create new user 
namespace"); 1063 + } 1064 + if (result == 'M') { 1065 + SKIP(free(handle); close(pipefd[0]); 1066 + return, "Cannot set uid/gid mappings"); 1067 + } 1068 + if (result == 'N') { 1069 + SKIP(free(handle); close(pipefd[0]); 1070 + return, "Cannot create new cgroup namespace"); 1071 + } 1072 + 1073 + /* Should fail with ESTALE since we're in a different user namespace */ 1074 + ASSERT_EQ(result, 'P'); 1075 + 1076 + close(pipefd[0]); 1077 + free(handle); 1078 + } 1079 + 1080 + TEST(nsfs_user_pid_namespace_isolation) 1081 + { 1082 + struct file_handle *handle; 1083 + int mount_id; 1084 + int ret; 1085 + int fd; 1086 + int ns_fd; 1087 + pid_t pid; 1088 + int status; 1089 + int pipefd[2]; 1090 + char result; 1091 + 1092 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 1093 + ASSERT_NE(handle, NULL); 1094 + 1095 + /* Create pipe for communication */ 1096 + ASSERT_EQ(pipe(pipefd), 0); 1097 + 1098 + /* Get handle for current PID namespace */ 1099 + ns_fd = open("/proc/self/ns/pid", O_RDONLY); 1100 + ASSERT_GE(ns_fd, 0); 1101 + 1102 + handle->handle_bytes = MAX_HANDLE_SZ; 1103 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 1104 + if (ret < 0 && errno == EOPNOTSUPP) { 1105 + SKIP(free(handle); close(ns_fd); close(pipefd[0]); 1106 + close(pipefd[1]); 1107 + return, "nsfs doesn't support file handles"); 1108 + } 1109 + ASSERT_EQ(ret, 0); 1110 + close(ns_fd); 1111 + 1112 + pid = fork(); 1113 + ASSERT_GE(pid, 0); 1114 + 1115 + if (pid == 0) { 1116 + /* Child process */ 1117 + close(pipefd[0]); 1118 + 1119 + /* First create new user namespace to drop privileges */ 1120 + ret = unshare(CLONE_NEWUSER); 1121 + if (ret < 0) { 1122 + write(pipefd[1], "U", 1123 + 1); /* Unable to create user namespace */ 1124 + close(pipefd[1]); 1125 + exit(0); 1126 + } 1127 + 1128 + /* Write uid/gid mappings to maintain some capabilities */ 1129 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 1130 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 1131 + int 
setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 1132 + 1133 + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { 1134 + write(pipefd[1], "M", 1); /* Unable to set mappings */ 1135 + close(pipefd[1]); 1136 + exit(0); 1137 + } 1138 + 1139 + /* Disable setgroups to allow gid mapping */ 1140 + write(setgroups_fd, "deny", 4); 1141 + close(setgroups_fd); 1142 + 1143 + /* Map current uid/gid to root in the new namespace */ 1144 + char mapping[64]; 1145 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 1146 + write(uid_map_fd, mapping, strlen(mapping)); 1147 + close(uid_map_fd); 1148 + 1149 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 1150 + write(gid_map_fd, mapping, strlen(mapping)); 1151 + close(gid_map_fd); 1152 + 1153 + /* Now create new PID namespace - requires fork to take effect */ 1154 + ret = unshare(CLONE_NEWPID); 1155 + if (ret < 0) { 1156 + write(pipefd[1], "N", 1157 + 1); /* Unable to create PID namespace */ 1158 + close(pipefd[1]); 1159 + exit(0); 1160 + } 1161 + 1162 + /* Fork again for PID namespace to take effect */ 1163 + pid_t child_pid = fork(); 1164 + if (child_pid < 0) { 1165 + write(pipefd[1], "N", 1166 + 1); /* Unable to fork in PID namespace */ 1167 + close(pipefd[1]); 1168 + exit(0); 1169 + } 1170 + 1171 + if (child_pid == 0) { 1172 + /* Grandchild in new PID namespace */ 1173 + /* Try to open parent's PID namespace handle from new user+pid namespace */ 1174 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1175 + 1176 + if (fd >= 0) { 1177 + /* Should NOT succeed - we're in a different user namespace */ 1178 + write(pipefd[1], "S", 1179 + 1); /* Unexpected success */ 1180 + close(fd); 1181 + } else if (errno == ESTALE) { 1182 + /* Expected: Stale file handle */ 1183 + write(pipefd[1], "P", 1); 1184 + } else { 1185 + /* Other error */ 1186 + write(pipefd[1], "F", 1); 1187 + } 1188 + 1189 + close(pipefd[1]); 1190 + exit(0); 1191 + } 1192 + 1193 + /* Wait for grandchild */ 1194 + waitpid(child_pid, NULL, 
0); 1195 + exit(0); 1196 + } 1197 + 1198 + /* Parent process */ 1199 + close(pipefd[1]); 1200 + ASSERT_EQ(read(pipefd[0], &result, 1), 1); 1201 + 1202 + waitpid(pid, &status, 0); 1203 + ASSERT_TRUE(WIFEXITED(status)); 1204 + ASSERT_EQ(WEXITSTATUS(status), 0); 1205 + 1206 + if (result == 'U') { 1207 + SKIP(free(handle); close(pipefd[0]); 1208 + return, "Cannot create new user namespace"); 1209 + } 1210 + if (result == 'M') { 1211 + SKIP(free(handle); close(pipefd[0]); 1212 + return, "Cannot set uid/gid mappings"); 1213 + } 1214 + if (result == 'N') { 1215 + SKIP(free(handle); close(pipefd[0]); 1216 + return, "Cannot create new PID namespace"); 1217 + } 1218 + 1219 + /* Should fail with ESTALE since we're in a different user namespace */ 1220 + ASSERT_EQ(result, 'P'); 1221 + 1222 + close(pipefd[0]); 1223 + free(handle); 1224 + } 1225 + 1226 + TEST(nsfs_user_time_namespace_isolation) 1227 + { 1228 + struct file_handle *handle; 1229 + int mount_id; 1230 + int ret; 1231 + int fd; 1232 + int ns_fd; 1233 + pid_t pid; 1234 + int status; 1235 + int pipefd[2]; 1236 + char result; 1237 + 1238 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 1239 + ASSERT_NE(handle, NULL); 1240 + 1241 + /* Create pipe for communication */ 1242 + ASSERT_EQ(pipe(pipefd), 0); 1243 + 1244 + /* Get handle for current time namespace */ 1245 + ns_fd = open("/proc/self/ns/time", O_RDONLY); 1246 + if (ns_fd < 0) { 1247 + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); 1248 + return, "time namespace not available"); 1249 + } 1250 + 1251 + handle->handle_bytes = MAX_HANDLE_SZ; 1252 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 1253 + if (ret < 0 && errno == EOPNOTSUPP) { 1254 + SKIP(free(handle); close(ns_fd); close(pipefd[0]); 1255 + close(pipefd[1]); 1256 + return, "nsfs doesn't support file handles"); 1257 + } 1258 + ASSERT_EQ(ret, 0); 1259 + close(ns_fd); 1260 + 1261 + pid = fork(); 1262 + ASSERT_GE(pid, 0); 1263 + 1264 + if (pid == 0) { 1265 + /* Child process */ 
1266 + close(pipefd[0]); 1267 + 1268 + /* First create new user namespace to drop privileges */ 1269 + ret = unshare(CLONE_NEWUSER); 1270 + if (ret < 0) { 1271 + write(pipefd[1], "U", 1272 + 1); /* Unable to create user namespace */ 1273 + close(pipefd[1]); 1274 + exit(0); 1275 + } 1276 + 1277 + /* Write uid/gid mappings to maintain some capabilities */ 1278 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 1279 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 1280 + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 1281 + 1282 + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { 1283 + write(pipefd[1], "M", 1); /* Unable to set mappings */ 1284 + close(pipefd[1]); 1285 + exit(0); 1286 + } 1287 + 1288 + /* Disable setgroups to allow gid mapping */ 1289 + write(setgroups_fd, "deny", 4); 1290 + close(setgroups_fd); 1291 + 1292 + /* Map current uid/gid to root in the new namespace */ 1293 + char mapping[64]; 1294 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 1295 + write(uid_map_fd, mapping, strlen(mapping)); 1296 + close(uid_map_fd); 1297 + 1298 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 1299 + write(gid_map_fd, mapping, strlen(mapping)); 1300 + close(gid_map_fd); 1301 + 1302 + /* Now create new time namespace - requires fork to take effect */ 1303 + ret = unshare(CLONE_NEWTIME); 1304 + if (ret < 0) { 1305 + write(pipefd[1], "N", 1306 + 1); /* Unable to create time namespace */ 1307 + close(pipefd[1]); 1308 + exit(0); 1309 + } 1310 + 1311 + /* Fork again for time namespace to take effect */ 1312 + pid_t child_pid = fork(); 1313 + if (child_pid < 0) { 1314 + write(pipefd[1], "N", 1315 + 1); /* Unable to fork in time namespace */ 1316 + close(pipefd[1]); 1317 + exit(0); 1318 + } 1319 + 1320 + if (child_pid == 0) { 1321 + /* Grandchild in new time namespace */ 1322 + /* Try to open parent's time namespace handle from new user+time namespace */ 1323 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1324 
+ 1325 + if (fd >= 0) { 1326 + /* Should NOT succeed - we're in a different user namespace */ 1327 + write(pipefd[1], "S", 1328 + 1); /* Unexpected success */ 1329 + close(fd); 1330 + } else if (errno == ESTALE) { 1331 + /* Expected: Stale file handle */ 1332 + write(pipefd[1], "P", 1); 1333 + } else { 1334 + /* Other error */ 1335 + write(pipefd[1], "F", 1); 1336 + } 1337 + 1338 + close(pipefd[1]); 1339 + exit(0); 1340 + } 1341 + 1342 + /* Wait for grandchild */ 1343 + waitpid(child_pid, NULL, 0); 1344 + exit(0); 1345 + } 1346 + 1347 + /* Parent process */ 1348 + close(pipefd[1]); 1349 + ASSERT_EQ(read(pipefd[0], &result, 1), 1); 1350 + 1351 + waitpid(pid, &status, 0); 1352 + ASSERT_TRUE(WIFEXITED(status)); 1353 + ASSERT_EQ(WEXITSTATUS(status), 0); 1354 + 1355 + if (result == 'U') { 1356 + SKIP(free(handle); close(pipefd[0]); 1357 + return, "Cannot create new user namespace"); 1358 + } 1359 + if (result == 'M') { 1360 + SKIP(free(handle); close(pipefd[0]); 1361 + return, "Cannot set uid/gid mappings"); 1362 + } 1363 + if (result == 'N') { 1364 + SKIP(free(handle); close(pipefd[0]); 1365 + return, "Cannot create new time namespace"); 1366 + } 1367 + 1368 + /* Should fail with ESTALE since we're in a different user namespace */ 1369 + ASSERT_EQ(result, 'P'); 1370 + 1371 + close(pipefd[0]); 1372 + free(handle); 1373 + } 1374 + 1375 + TEST(nsfs_open_flags) 1376 + { 1377 + struct file_handle *handle; 1378 + int mount_id; 1379 + int ret; 1380 + int fd; 1381 + int ns_fd; 1382 + 1383 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 1384 + ASSERT_NE(handle, NULL); 1385 + 1386 + /* Open a namespace file descriptor */ 1387 + ns_fd = open("/proc/self/ns/net", O_RDONLY); 1388 + ASSERT_GE(ns_fd, 0); 1389 + 1390 + /* Get handle for the namespace */ 1391 + handle->handle_bytes = MAX_HANDLE_SZ; 1392 + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); 1393 + if (ret < 0 && errno == EOPNOTSUPP) { 1394 + SKIP(free(handle); close(ns_fd); 1395 + return, "nsfs 
doesn't support file handles"); 1396 + } 1397 + ASSERT_EQ(ret, 0); 1398 + ASSERT_GT(handle->handle_bytes, 0); 1399 + 1400 + /* Test invalid flags that should fail */ 1401 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY); 1402 + ASSERT_LT(fd, 0); 1403 + ASSERT_EQ(errno, EPERM); 1404 + 1405 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR); 1406 + ASSERT_LT(fd, 0); 1407 + ASSERT_EQ(errno, EPERM); 1408 + 1409 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC); 1410 + ASSERT_LT(fd, 0); 1411 + ASSERT_EQ(errno, EPERM); 1412 + 1413 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT); 1414 + ASSERT_LT(fd, 0); 1415 + ASSERT_EQ(errno, EINVAL); 1416 + 1417 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE); 1418 + ASSERT_LT(fd, 0); 1419 + ASSERT_EQ(errno, EINVAL); 1420 + 1421 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECTORY); 1422 + ASSERT_LT(fd, 0); 1423 + ASSERT_EQ(errno, ENOTDIR); 1424 + 1425 + close(ns_fd); 1426 + free(handle); 1427 + } 1428 + 1429 + TEST_HARNESS_MAIN

+61

tools/testing/selftests/namespaces/init_ino_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + // Copyright (c) 2025 Christian Brauner <brauner@kernel.org> 3 + 4 + #define _GNU_SOURCE 5 + #include <fcntl.h> 6 + #include <stdio.h> 7 + #include <stdlib.h> 8 + #include <sys/stat.h> 9 + #include <unistd.h> 10 + #include <errno.h> 11 + #include <string.h> 12 + #include <linux/nsfs.h> 13 + 14 + #include "../kselftest_harness.h" 15 + 16 + struct ns_info { 17 + const char *name; 18 + const char *proc_path; 19 + unsigned int expected_ino; 20 + }; 21 + 22 + static struct ns_info namespaces[] = { 23 + { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO }, 24 + { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO }, 25 + { "user", "/proc/1/ns/user", USER_NS_INIT_INO }, 26 + { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO }, 27 + { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO }, 28 + { "time", "/proc/1/ns/time", TIME_NS_INIT_INO }, 29 + { "net", "/proc/1/ns/net", NET_NS_INIT_INO }, 30 + { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO }, 31 + }; 32 + 33 + TEST(init_namespace_inodes) 34 + { 35 + struct stat st; 36 + 37 + for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { 38 + int ret = stat(namespaces[i].proc_path, &st); 39 + 40 + /* Some namespaces might not be available (e.g., time namespace on older kernels) */ 41 + if (ret < 0) { 42 + if (errno == ENOENT) { 43 + ksft_test_result_skip("%s namespace not available\n", 44 + namespaces[i].name); 45 + continue; 46 + } 47 + ASSERT_GE(ret, 0) 48 + TH_LOG("Failed to stat %s: %s", 49 + namespaces[i].proc_path, strerror(errno)); 50 + } 51 + 52 + ASSERT_EQ(st.st_ino, namespaces[i].expected_ino) 53 + TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x", 54 + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); 55 + 56 + ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n", 57 + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); 58 + } 59 + } 60 + 61 + TEST_HARNESS_MAIN

+986

tools/testing/selftests/namespaces/nsid_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <assert.h> 4 + #include <fcntl.h> 5 + #include <inttypes.h> 6 + #include <libgen.h> 7 + #include <limits.h> 8 + #include <pthread.h> 9 + #include <string.h> 10 + #include <sys/mount.h> 11 + #include <poll.h> 12 + #include <sys/epoll.h> 13 + #include <sys/resource.h> 14 + #include <sys/stat.h> 15 + #include <sys/socket.h> 16 + #include <sys/un.h> 17 + #include <unistd.h> 18 + #include <linux/fs.h> 19 + #include <linux/limits.h> 20 + #include <linux/nsfs.h> 21 + #include "../kselftest_harness.h" 22 + 23 + TEST(nsid_mntns_basic) 24 + { 25 + __u64 mnt_ns_id = 0; 26 + int fd_mntns; 27 + int ret; 28 + 29 + /* Open the current mount namespace */ 30 + fd_mntns = open("/proc/self/ns/mnt", O_RDONLY); 31 + ASSERT_GE(fd_mntns, 0); 32 + 33 + /* Get the mount namespace ID */ 34 + ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id); 35 + ASSERT_EQ(ret, 0); 36 + ASSERT_NE(mnt_ns_id, 0); 37 + 38 + /* Verify we can get the same ID again */ 39 + __u64 mnt_ns_id2 = 0; 40 + ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2); 41 + ASSERT_EQ(ret, 0); 42 + ASSERT_EQ(mnt_ns_id, mnt_ns_id2); 43 + 44 + close(fd_mntns); 45 + } 46 + 47 + TEST(nsid_mntns_separate) 48 + { 49 + __u64 parent_mnt_ns_id = 0; 50 + __u64 child_mnt_ns_id = 0; 51 + int fd_parent_mntns, fd_child_mntns; 52 + int ret; 53 + pid_t pid; 54 + int pipefd[2]; 55 + 56 + /* Get parent's mount namespace ID */ 57 + fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY); 58 + ASSERT_GE(fd_parent_mntns, 0); 59 + ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id); 60 + ASSERT_EQ(ret, 0); 61 + ASSERT_NE(parent_mnt_ns_id, 0); 62 + 63 + /* Create a pipe for synchronization */ 64 + ASSERT_EQ(pipe(pipefd), 0); 65 + 66 + pid = fork(); 67 + ASSERT_GE(pid, 0); 68 + 69 + if (pid == 0) { 70 + /* Child process */ 71 + close(pipefd[0]); 72 + 73 + /* Create new mount namespace */ 74 + ret = unshare(CLONE_NEWNS); 75 + if (ret != 0) { 76 + /* Skip test if we don't have permission */ 77 + 
if (errno == EPERM || errno == EACCES) { 78 + write(pipefd[1], "S", 1); /* Signal skip */ 79 + _exit(0); 80 + } 81 + _exit(1); 82 + } 83 + 84 + /* Signal success */ 85 + write(pipefd[1], "Y", 1); 86 + close(pipefd[1]); 87 + 88 + /* Keep namespace alive */ 89 + pause(); 90 + _exit(0); 91 + } 92 + 93 + /* Parent process */ 94 + close(pipefd[1]); 95 + 96 + char buf; 97 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 98 + close(pipefd[0]); 99 + 100 + if (buf == 'S') { 101 + /* Child couldn't create namespace, skip test */ 102 + kill(pid, SIGTERM); 103 + waitpid(pid, NULL, 0); 104 + close(fd_parent_mntns); 105 + SKIP(return, "No permission to create mount namespace"); 106 + } 107 + 108 + ASSERT_EQ(buf, 'Y'); 109 + 110 + /* Open child's mount namespace */ 111 + char path[256]; 112 + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid); 113 + fd_child_mntns = open(path, O_RDONLY); 114 + ASSERT_GE(fd_child_mntns, 0); 115 + 116 + /* Get child's mount namespace ID */ 117 + ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id); 118 + ASSERT_EQ(ret, 0); 119 + ASSERT_NE(child_mnt_ns_id, 0); 120 + 121 + /* Parent and child should have different mount namespace IDs */ 122 + ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id); 123 + 124 + close(fd_parent_mntns); 125 + close(fd_child_mntns); 126 + 127 + /* Clean up child process */ 128 + kill(pid, SIGTERM); 129 + waitpid(pid, NULL, 0); 130 + } 131 + 132 + TEST(nsid_cgroupns_basic) 133 + { 134 + __u64 cgroup_ns_id = 0; 135 + int fd_cgroupns; 136 + int ret; 137 + 138 + /* Open the current cgroup namespace */ 139 + fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); 140 + ASSERT_GE(fd_cgroupns, 0); 141 + 142 + /* Get the cgroup namespace ID */ 143 + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id); 144 + ASSERT_EQ(ret, 0); 145 + ASSERT_NE(cgroup_ns_id, 0); 146 + 147 + /* Verify we can get the same ID again */ 148 + __u64 cgroup_ns_id2 = 0; 149 + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2); 150 + ASSERT_EQ(ret, 0); 151 + 
ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2); 152 + 153 + close(fd_cgroupns); 154 + } 155 + 156 + TEST(nsid_cgroupns_separate) 157 + { 158 + __u64 parent_cgroup_ns_id = 0; 159 + __u64 child_cgroup_ns_id = 0; 160 + int fd_parent_cgroupns, fd_child_cgroupns; 161 + int ret; 162 + pid_t pid; 163 + int pipefd[2]; 164 + 165 + /* Get parent's cgroup namespace ID */ 166 + fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); 167 + ASSERT_GE(fd_parent_cgroupns, 0); 168 + ret = ioctl(fd_parent_cgroupns, NS_GET_ID, &parent_cgroup_ns_id); 169 + ASSERT_EQ(ret, 0); 170 + ASSERT_NE(parent_cgroup_ns_id, 0); 171 + 172 + /* Create a pipe for synchronization */ 173 + ASSERT_EQ(pipe(pipefd), 0); 174 + 175 + pid = fork(); 176 + ASSERT_GE(pid, 0); 177 + 178 + if (pid == 0) { 179 + /* Child process */ 180 + close(pipefd[0]); 181 + 182 + /* Create new cgroup namespace */ 183 + ret = unshare(CLONE_NEWCGROUP); 184 + if (ret != 0) { 185 + /* Skip test if we don't have permission */ 186 + if (errno == EPERM || errno == EACCES) { 187 + write(pipefd[1], "S", 1); /* Signal skip */ 188 + _exit(0); 189 + } 190 + _exit(1); 191 + } 192 + 193 + /* Signal success */ 194 + write(pipefd[1], "Y", 1); 195 + close(pipefd[1]); 196 + 197 + /* Keep namespace alive */ 198 + pause(); 199 + _exit(0); 200 + } 201 + 202 + /* Parent process */ 203 + close(pipefd[1]); 204 + 205 + char buf; 206 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 207 + close(pipefd[0]); 208 + 209 + if (buf == 'S') { 210 + /* Child couldn't create namespace, skip test */ 211 + kill(pid, SIGTERM); 212 + waitpid(pid, NULL, 0); 213 + close(fd_parent_cgroupns); 214 + SKIP(return, "No permission to create cgroup namespace"); 215 + } 216 + 217 + ASSERT_EQ(buf, 'Y'); 218 + 219 + /* Open child's cgroup namespace */ 220 + char path[256]; 221 + snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid); 222 + fd_child_cgroupns = open(path, O_RDONLY); 223 + ASSERT_GE(fd_child_cgroupns, 0); 224 + 225 + /* Get child's cgroup namespace ID */ 226 + ret = 
ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id); 227 + ASSERT_EQ(ret, 0); 228 + ASSERT_NE(child_cgroup_ns_id, 0); 229 + 230 + /* Parent and child should have different cgroup namespace IDs */ 231 + ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id); 232 + 233 + close(fd_parent_cgroupns); 234 + close(fd_child_cgroupns); 235 + 236 + /* Clean up child process */ 237 + kill(pid, SIGTERM); 238 + waitpid(pid, NULL, 0); 239 + } 240 + 241 + TEST(nsid_ipcns_basic) 242 + { 243 + __u64 ipc_ns_id = 0; 244 + int fd_ipcns; 245 + int ret; 246 + 247 + /* Open the current IPC namespace */ 248 + fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY); 249 + ASSERT_GE(fd_ipcns, 0); 250 + 251 + /* Get the IPC namespace ID */ 252 + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id); 253 + ASSERT_EQ(ret, 0); 254 + ASSERT_NE(ipc_ns_id, 0); 255 + 256 + /* Verify we can get the same ID again */ 257 + __u64 ipc_ns_id2 = 0; 258 + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2); 259 + ASSERT_EQ(ret, 0); 260 + ASSERT_EQ(ipc_ns_id, ipc_ns_id2); 261 + 262 + close(fd_ipcns); 263 + } 264 + 265 + TEST(nsid_ipcns_separate) 266 + { 267 + __u64 parent_ipc_ns_id = 0; 268 + __u64 child_ipc_ns_id = 0; 269 + int fd_parent_ipcns, fd_child_ipcns; 270 + int ret; 271 + pid_t pid; 272 + int pipefd[2]; 273 + 274 + /* Get parent's IPC namespace ID */ 275 + fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY); 276 + ASSERT_GE(fd_parent_ipcns, 0); 277 + ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id); 278 + ASSERT_EQ(ret, 0); 279 + ASSERT_NE(parent_ipc_ns_id, 0); 280 + 281 + /* Create a pipe for synchronization */ 282 + ASSERT_EQ(pipe(pipefd), 0); 283 + 284 + pid = fork(); 285 + ASSERT_GE(pid, 0); 286 + 287 + if (pid == 0) { 288 + /* Child process */ 289 + close(pipefd[0]); 290 + 291 + /* Create new IPC namespace */ 292 + ret = unshare(CLONE_NEWIPC); 293 + if (ret != 0) { 294 + /* Skip test if we don't have permission */ 295 + if (errno == EPERM || errno == EACCES) { 296 + write(pipefd[1], "S", 1); /* Signal 
skip */ 297 + _exit(0); 298 + } 299 + _exit(1); 300 + } 301 + 302 + /* Signal success */ 303 + write(pipefd[1], "Y", 1); 304 + close(pipefd[1]); 305 + 306 + /* Keep namespace alive */ 307 + pause(); 308 + _exit(0); 309 + } 310 + 311 + /* Parent process */ 312 + close(pipefd[1]); 313 + 314 + char buf; 315 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 316 + close(pipefd[0]); 317 + 318 + if (buf == 'S') { 319 + /* Child couldn't create namespace, skip test */ 320 + kill(pid, SIGTERM); 321 + waitpid(pid, NULL, 0); 322 + close(fd_parent_ipcns); 323 + SKIP(return, "No permission to create IPC namespace"); 324 + } 325 + 326 + ASSERT_EQ(buf, 'Y'); 327 + 328 + /* Open child's IPC namespace */ 329 + char path[256]; 330 + snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid); 331 + fd_child_ipcns = open(path, O_RDONLY); 332 + ASSERT_GE(fd_child_ipcns, 0); 333 + 334 + /* Get child's IPC namespace ID */ 335 + ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id); 336 + ASSERT_EQ(ret, 0); 337 + ASSERT_NE(child_ipc_ns_id, 0); 338 + 339 + /* Parent and child should have different IPC namespace IDs */ 340 + ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id); 341 + 342 + close(fd_parent_ipcns); 343 + close(fd_child_ipcns); 344 + 345 + /* Clean up child process */ 346 + kill(pid, SIGTERM); 347 + waitpid(pid, NULL, 0); 348 + } 349 + 350 + TEST(nsid_utsns_basic) 351 + { 352 + __u64 uts_ns_id = 0; 353 + int fd_utsns; 354 + int ret; 355 + 356 + /* Open the current UTS namespace */ 357 + fd_utsns = open("/proc/self/ns/uts", O_RDONLY); 358 + ASSERT_GE(fd_utsns, 0); 359 + 360 + /* Get the UTS namespace ID */ 361 + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id); 362 + ASSERT_EQ(ret, 0); 363 + ASSERT_NE(uts_ns_id, 0); 364 + 365 + /* Verify we can get the same ID again */ 366 + __u64 uts_ns_id2 = 0; 367 + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2); 368 + ASSERT_EQ(ret, 0); 369 + ASSERT_EQ(uts_ns_id, uts_ns_id2); 370 + 371 + close(fd_utsns); 372 + } 373 + 374 + TEST(nsid_utsns_separate) 375 + { 376 + 
__u64 parent_uts_ns_id = 0; 377 + __u64 child_uts_ns_id = 0; 378 + int fd_parent_utsns, fd_child_utsns; 379 + int ret; 380 + pid_t pid; 381 + int pipefd[2]; 382 + 383 + /* Get parent's UTS namespace ID */ 384 + fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY); 385 + ASSERT_GE(fd_parent_utsns, 0); 386 + ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id); 387 + ASSERT_EQ(ret, 0); 388 + ASSERT_NE(parent_uts_ns_id, 0); 389 + 390 + /* Create a pipe for synchronization */ 391 + ASSERT_EQ(pipe(pipefd), 0); 392 + 393 + pid = fork(); 394 + ASSERT_GE(pid, 0); 395 + 396 + if (pid == 0) { 397 + /* Child process */ 398 + close(pipefd[0]); 399 + 400 + /* Create new UTS namespace */ 401 + ret = unshare(CLONE_NEWUTS); 402 + if (ret != 0) { 403 + /* Skip test if we don't have permission */ 404 + if (errno == EPERM || errno == EACCES) { 405 + write(pipefd[1], "S", 1); /* Signal skip */ 406 + _exit(0); 407 + } 408 + _exit(1); 409 + } 410 + 411 + /* Signal success */ 412 + write(pipefd[1], "Y", 1); 413 + close(pipefd[1]); 414 + 415 + /* Keep namespace alive */ 416 + pause(); 417 + _exit(0); 418 + } 419 + 420 + /* Parent process */ 421 + close(pipefd[1]); 422 + 423 + char buf; 424 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 425 + close(pipefd[0]); 426 + 427 + if (buf == 'S') { 428 + /* Child couldn't create namespace, skip test */ 429 + kill(pid, SIGTERM); 430 + waitpid(pid, NULL, 0); 431 + close(fd_parent_utsns); 432 + SKIP(return, "No permission to create UTS namespace"); 433 + } 434 + 435 + ASSERT_EQ(buf, 'Y'); 436 + 437 + /* Open child's UTS namespace */ 438 + char path[256]; 439 + snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid); 440 + fd_child_utsns = open(path, O_RDONLY); 441 + ASSERT_GE(fd_child_utsns, 0); 442 + 443 + /* Get child's UTS namespace ID */ 444 + ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id); 445 + ASSERT_EQ(ret, 0); 446 + ASSERT_NE(child_uts_ns_id, 0); 447 + 448 + /* Parent and child should have different UTS namespace IDs */ 449 + 
ASSERT_NE(parent_uts_ns_id, child_uts_ns_id); 450 + 451 + close(fd_parent_utsns); 452 + close(fd_child_utsns); 453 + 454 + /* Clean up child process */ 455 + kill(pid, SIGTERM); 456 + waitpid(pid, NULL, 0); 457 + } 458 + 459 + TEST(nsid_userns_basic) 460 + { 461 + __u64 user_ns_id = 0; 462 + int fd_userns; 463 + int ret; 464 + 465 + /* Open the current user namespace */ 466 + fd_userns = open("/proc/self/ns/user", O_RDONLY); 467 + ASSERT_GE(fd_userns, 0); 468 + 469 + /* Get the user namespace ID */ 470 + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id); 471 + ASSERT_EQ(ret, 0); 472 + ASSERT_NE(user_ns_id, 0); 473 + 474 + /* Verify we can get the same ID again */ 475 + __u64 user_ns_id2 = 0; 476 + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2); 477 + ASSERT_EQ(ret, 0); 478 + ASSERT_EQ(user_ns_id, user_ns_id2); 479 + 480 + close(fd_userns); 481 + } 482 + 483 + TEST(nsid_userns_separate) 484 + { 485 + __u64 parent_user_ns_id = 0; 486 + __u64 child_user_ns_id = 0; 487 + int fd_parent_userns, fd_child_userns; 488 + int ret; 489 + pid_t pid; 490 + int pipefd[2]; 491 + 492 + /* Get parent's user namespace ID */ 493 + fd_parent_userns = open("/proc/self/ns/user", O_RDONLY); 494 + ASSERT_GE(fd_parent_userns, 0); 495 + ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id); 496 + ASSERT_EQ(ret, 0); 497 + ASSERT_NE(parent_user_ns_id, 0); 498 + 499 + /* Create a pipe for synchronization */ 500 + ASSERT_EQ(pipe(pipefd), 0); 501 + 502 + pid = fork(); 503 + ASSERT_GE(pid, 0); 504 + 505 + if (pid == 0) { 506 + /* Child process */ 507 + close(pipefd[0]); 508 + 509 + /* Create new user namespace */ 510 + ret = unshare(CLONE_NEWUSER); 511 + if (ret != 0) { 512 + /* Skip test if we don't have permission */ 513 + if (errno == EPERM || errno == EACCES) { 514 + write(pipefd[1], "S", 1); /* Signal skip */ 515 + _exit(0); 516 + } 517 + _exit(1); 518 + } 519 + 520 + /* Signal success */ 521 + write(pipefd[1], "Y", 1); 522 + close(pipefd[1]); 523 + 524 + /* Keep namespace alive */ 525 + 
pause(); 526 + _exit(0); 527 + } 528 + 529 + /* Parent process */ 530 + close(pipefd[1]); 531 + 532 + char buf; 533 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 534 + close(pipefd[0]); 535 + 536 + if (buf == 'S') { 537 + /* Child couldn't create namespace, skip test */ 538 + kill(pid, SIGTERM); 539 + waitpid(pid, NULL, 0); 540 + close(fd_parent_userns); 541 + SKIP(return, "No permission to create user namespace"); 542 + } 543 + 544 + ASSERT_EQ(buf, 'Y'); 545 + 546 + /* Open child's user namespace */ 547 + char path[256]; 548 + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); 549 + fd_child_userns = open(path, O_RDONLY); 550 + ASSERT_GE(fd_child_userns, 0); 551 + 552 + /* Get child's user namespace ID */ 553 + ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id); 554 + ASSERT_EQ(ret, 0); 555 + ASSERT_NE(child_user_ns_id, 0); 556 + 557 + /* Parent and child should have different user namespace IDs */ 558 + ASSERT_NE(parent_user_ns_id, child_user_ns_id); 559 + 560 + close(fd_parent_userns); 561 + close(fd_child_userns); 562 + 563 + /* Clean up child process */ 564 + kill(pid, SIGTERM); 565 + waitpid(pid, NULL, 0); 566 + } 567 + 568 + TEST(nsid_timens_basic) 569 + { 570 + __u64 time_ns_id = 0; 571 + int fd_timens; 572 + int ret; 573 + 574 + /* Open the current time namespace */ 575 + fd_timens = open("/proc/self/ns/time", O_RDONLY); 576 + if (fd_timens < 0) { 577 + SKIP(return, "Time namespaces not supported"); 578 + } 579 + 580 + /* Get the time namespace ID */ 581 + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id); 582 + ASSERT_EQ(ret, 0); 583 + ASSERT_NE(time_ns_id, 0); 584 + 585 + /* Verify we can get the same ID again */ 586 + __u64 time_ns_id2 = 0; 587 + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2); 588 + ASSERT_EQ(ret, 0); 589 + ASSERT_EQ(time_ns_id, time_ns_id2); 590 + 591 + close(fd_timens); 592 + } 593 + 594 + TEST(nsid_timens_separate) 595 + { 596 + __u64 parent_time_ns_id = 0; 597 + __u64 child_time_ns_id = 0; 598 + int fd_parent_timens, 
fd_child_timens; 599 + int ret; 600 + pid_t pid; 601 + int pipefd[2]; 602 + 603 + /* Open the current time namespace */ 604 + fd_parent_timens = open("/proc/self/ns/time", O_RDONLY); 605 + if (fd_parent_timens < 0) { 606 + SKIP(return, "Time namespaces not supported"); 607 + } 608 + 609 + /* Get parent's time namespace ID */ 610 + ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id); 611 + ASSERT_EQ(ret, 0); 612 + ASSERT_NE(parent_time_ns_id, 0); 613 + 614 + /* Create a pipe for synchronization */ 615 + ASSERT_EQ(pipe(pipefd), 0); 616 + 617 + pid = fork(); 618 + ASSERT_GE(pid, 0); 619 + 620 + if (pid == 0) { 621 + /* Child process */ 622 + close(pipefd[0]); 623 + 624 + /* Create new time namespace */ 625 + ret = unshare(CLONE_NEWTIME); 626 + if (ret != 0) { 627 + /* Skip test if we don't have permission */ 628 + if (errno == EPERM || errno == EACCES || errno == EINVAL) { 629 + write(pipefd[1], "S", 1); /* Signal skip */ 630 + _exit(0); 631 + } 632 + _exit(1); 633 + } 634 + 635 + /* Fork a grandchild to actually enter the new namespace */ 636 + pid_t grandchild = fork(); 637 + if (grandchild == 0) { 638 + /* Grandchild is in the new namespace */ 639 + write(pipefd[1], "Y", 1); 640 + close(pipefd[1]); 641 + pause(); 642 + _exit(0); 643 + } else if (grandchild > 0) { 644 + /* Child writes grandchild PID and waits */ 645 + write(pipefd[1], "Y", 1); 646 + write(pipefd[1], &grandchild, sizeof(grandchild)); 647 + close(pipefd[1]); 648 + pause(); /* Keep the parent alive to maintain the grandchild */ 649 + _exit(0); 650 + } else { 651 + _exit(1); 652 + } 653 + } 654 + 655 + /* Parent process */ 656 + close(pipefd[1]); 657 + 658 + char buf; 659 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 660 + 661 + if (buf == 'S') { 662 + /* Child couldn't create namespace, skip test */ 663 + kill(pid, SIGTERM); 664 + waitpid(pid, NULL, 0); 665 + close(fd_parent_timens); 666 + close(pipefd[0]); 667 + SKIP(return, "Cannot create time namespace"); 668 + } 669 + 670 + ASSERT_EQ(buf, 
'Y'); 671 + 672 + pid_t grandchild_pid; 673 + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); 674 + close(pipefd[0]); 675 + 676 + /* Open grandchild's time namespace */ 677 + char path[256]; 678 + snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid); 679 + fd_child_timens = open(path, O_RDONLY); 680 + ASSERT_GE(fd_child_timens, 0); 681 + 682 + /* Get child's time namespace ID */ 683 + ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id); 684 + ASSERT_EQ(ret, 0); 685 + ASSERT_NE(child_time_ns_id, 0); 686 + 687 + /* Parent and child should have different time namespace IDs */ 688 + ASSERT_NE(parent_time_ns_id, child_time_ns_id); 689 + 690 + close(fd_parent_timens); 691 + close(fd_child_timens); 692 + 693 + /* Clean up child process */ 694 + kill(pid, SIGTERM); 695 + waitpid(pid, NULL, 0); 696 + } 697 + 698 + TEST(nsid_pidns_basic) 699 + { 700 + __u64 pid_ns_id = 0; 701 + int fd_pidns; 702 + int ret; 703 + 704 + /* Open the current PID namespace */ 705 + fd_pidns = open("/proc/self/ns/pid", O_RDONLY); 706 + ASSERT_GE(fd_pidns, 0); 707 + 708 + /* Get the PID namespace ID */ 709 + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id); 710 + ASSERT_EQ(ret, 0); 711 + ASSERT_NE(pid_ns_id, 0); 712 + 713 + /* Verify we can get the same ID again */ 714 + __u64 pid_ns_id2 = 0; 715 + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2); 716 + ASSERT_EQ(ret, 0); 717 + ASSERT_EQ(pid_ns_id, pid_ns_id2); 718 + 719 + close(fd_pidns); 720 + } 721 + 722 + TEST(nsid_pidns_separate) 723 + { 724 + __u64 parent_pid_ns_id = 0; 725 + __u64 child_pid_ns_id = 0; 726 + int fd_parent_pidns, fd_child_pidns; 727 + int ret; 728 + pid_t pid; 729 + int pipefd[2]; 730 + 731 + /* Get parent's PID namespace ID */ 732 + fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY); 733 + ASSERT_GE(fd_parent_pidns, 0); 734 + ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id); 735 + ASSERT_EQ(ret, 0); 736 + ASSERT_NE(parent_pid_ns_id, 0); 737 + 738 + /* 
Create a pipe for synchronization */ 739 + ASSERT_EQ(pipe(pipefd), 0); 740 + 741 + pid = fork(); 742 + ASSERT_GE(pid, 0); 743 + 744 + if (pid == 0) { 745 + /* Child process */ 746 + close(pipefd[0]); 747 + 748 + /* Create new PID namespace */ 749 + ret = unshare(CLONE_NEWPID); 750 + if (ret != 0) { 751 + /* Skip test if we don't have permission */ 752 + if (errno == EPERM || errno == EACCES) { 753 + write(pipefd[1], "S", 1); /* Signal skip */ 754 + _exit(0); 755 + } 756 + _exit(1); 757 + } 758 + 759 + /* Fork a grandchild to actually enter the new namespace */ 760 + pid_t grandchild = fork(); 761 + if (grandchild == 0) { 762 + /* Grandchild is in the new namespace */ 763 + write(pipefd[1], "Y", 1); 764 + close(pipefd[1]); 765 + pause(); 766 + _exit(0); 767 + } else if (grandchild > 0) { 768 + /* Child writes grandchild PID and waits */ 769 + write(pipefd[1], "Y", 1); 770 + write(pipefd[1], &grandchild, sizeof(grandchild)); 771 + close(pipefd[1]); 772 + pause(); /* Keep the parent alive to maintain the grandchild */ 773 + _exit(0); 774 + } else { 775 + _exit(1); 776 + } 777 + } 778 + 779 + /* Parent process */ 780 + close(pipefd[1]); 781 + 782 + char buf; 783 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 784 + 785 + if (buf == 'S') { 786 + /* Child couldn't create namespace, skip test */ 787 + kill(pid, SIGTERM); 788 + waitpid(pid, NULL, 0); 789 + close(fd_parent_pidns); 790 + close(pipefd[0]); 791 + SKIP(return, "No permission to create PID namespace"); 792 + } 793 + 794 + ASSERT_EQ(buf, 'Y'); 795 + 796 + pid_t grandchild_pid; 797 + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); 798 + close(pipefd[0]); 799 + 800 + /* Open grandchild's PID namespace */ 801 + char path[256]; 802 + snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid); 803 + fd_child_pidns = open(path, O_RDONLY); 804 + ASSERT_GE(fd_child_pidns, 0); 805 + 806 + /* Get child's PID namespace ID */ 807 + ret = ioctl(fd_child_pidns, NS_GET_ID, 
&child_pid_ns_id); 808 + ASSERT_EQ(ret, 0); 809 + ASSERT_NE(child_pid_ns_id, 0); 810 + 811 + /* Parent and child should have different PID namespace IDs */ 812 + ASSERT_NE(parent_pid_ns_id, child_pid_ns_id); 813 + 814 + close(fd_parent_pidns); 815 + close(fd_child_pidns); 816 + 817 + /* Clean up child process */ 818 + kill(pid, SIGTERM); 819 + waitpid(pid, NULL, 0); 820 + } 821 + 822 + TEST(nsid_netns_basic) 823 + { 824 + __u64 net_ns_id = 0; 825 + __u64 netns_cookie = 0; 826 + int fd_netns; 827 + int sock; 828 + socklen_t optlen; 829 + int ret; 830 + 831 + /* Open the current network namespace */ 832 + fd_netns = open("/proc/self/ns/net", O_RDONLY); 833 + ASSERT_GE(fd_netns, 0); 834 + 835 + /* Get the network namespace ID via ioctl */ 836 + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id); 837 + ASSERT_EQ(ret, 0); 838 + ASSERT_NE(net_ns_id, 0); 839 + 840 + /* Create a socket to get the SO_NETNS_COOKIE */ 841 + sock = socket(AF_UNIX, SOCK_STREAM, 0); 842 + ASSERT_GE(sock, 0); 843 + 844 + /* Get the network namespace cookie via socket option */ 845 + optlen = sizeof(netns_cookie); 846 + ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); 847 + ASSERT_EQ(ret, 0); 848 + ASSERT_EQ(optlen, sizeof(netns_cookie)); 849 + 850 + /* The namespace ID and cookie should be identical */ 851 + ASSERT_EQ(net_ns_id, netns_cookie); 852 + 853 + /* Verify we can get the same ID again */ 854 + __u64 net_ns_id2 = 0; 855 + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2); 856 + ASSERT_EQ(ret, 0); 857 + ASSERT_EQ(net_ns_id, net_ns_id2); 858 + 859 + close(sock); 860 + close(fd_netns); 861 + } 862 + 863 + TEST(nsid_netns_separate) 864 + { 865 + __u64 parent_net_ns_id = 0; 866 + __u64 parent_netns_cookie = 0; 867 + __u64 child_net_ns_id = 0; 868 + __u64 child_netns_cookie = 0; 869 + int fd_parent_netns, fd_child_netns; 870 + int parent_sock, child_sock; 871 + socklen_t optlen; 872 + int ret; 873 + pid_t pid; 874 + int pipefd[2]; 875 + 876 + /* Get parent's network namespace 
ID */ 877 + fd_parent_netns = open("/proc/self/ns/net", O_RDONLY); 878 + ASSERT_GE(fd_parent_netns, 0); 879 + ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id); 880 + ASSERT_EQ(ret, 0); 881 + ASSERT_NE(parent_net_ns_id, 0); 882 + 883 + /* Get parent's network namespace cookie */ 884 + parent_sock = socket(AF_UNIX, SOCK_STREAM, 0); 885 + ASSERT_GE(parent_sock, 0); 886 + optlen = sizeof(parent_netns_cookie); 887 + ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen); 888 + ASSERT_EQ(ret, 0); 889 + 890 + /* Verify parent's ID and cookie match */ 891 + ASSERT_EQ(parent_net_ns_id, parent_netns_cookie); 892 + 893 + /* Create a pipe for synchronization */ 894 + ASSERT_EQ(pipe(pipefd), 0); 895 + 896 + pid = fork(); 897 + ASSERT_GE(pid, 0); 898 + 899 + if (pid == 0) { 900 + /* Child process */ 901 + close(pipefd[0]); 902 + 903 + /* Create new network namespace */ 904 + ret = unshare(CLONE_NEWNET); 905 + if (ret != 0) { 906 + /* Skip test if we don't have permission */ 907 + if (errno == EPERM || errno == EACCES) { 908 + write(pipefd[1], "S", 1); /* Signal skip */ 909 + _exit(0); 910 + } 911 + _exit(1); 912 + } 913 + 914 + /* Signal success */ 915 + write(pipefd[1], "Y", 1); 916 + close(pipefd[1]); 917 + 918 + /* Keep namespace alive */ 919 + pause(); 920 + _exit(0); 921 + } 922 + 923 + /* Parent process */ 924 + close(pipefd[1]); 925 + 926 + char buf; 927 + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); 928 + close(pipefd[0]); 929 + 930 + if (buf == 'S') { 931 + /* Child couldn't create namespace, skip test */ 932 + kill(pid, SIGTERM); 933 + waitpid(pid, NULL, 0); 934 + close(fd_parent_netns); 935 + close(parent_sock); 936 + SKIP(return, "No permission to create network namespace"); 937 + } 938 + 939 + ASSERT_EQ(buf, 'Y'); 940 + 941 + /* Open child's network namespace */ 942 + char path[256]; 943 + snprintf(path, sizeof(path), "/proc/%d/ns/net", pid); 944 + fd_child_netns = open(path, O_RDONLY); 945 + ASSERT_GE(fd_child_netns, 0); 946 
+ 947 + /* Get child's network namespace ID */ 948 + ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id); 949 + ASSERT_EQ(ret, 0); 950 + ASSERT_NE(child_net_ns_id, 0); 951 + 952 + /* Create socket in child's namespace to get cookie */ 953 + ret = setns(fd_child_netns, CLONE_NEWNET); 954 + if (ret == 0) { 955 + child_sock = socket(AF_UNIX, SOCK_STREAM, 0); 956 + ASSERT_GE(child_sock, 0); 957 + 958 + optlen = sizeof(child_netns_cookie); 959 + ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen); 960 + ASSERT_EQ(ret, 0); 961 + 962 + /* Verify child's ID and cookie match */ 963 + ASSERT_EQ(child_net_ns_id, child_netns_cookie); 964 + 965 + close(child_sock); 966 + 967 + /* Return to parent namespace */ 968 + setns(fd_parent_netns, CLONE_NEWNET); 969 + } 970 + 971 + /* Parent and child should have different network namespace IDs */ 972 + ASSERT_NE(parent_net_ns_id, child_net_ns_id); 973 + if (child_netns_cookie != 0) { 974 + ASSERT_NE(parent_netns_cookie, child_netns_cookie); 975 + } 976 + 977 + close(fd_parent_netns); 978 + close(fd_child_netns); 979 + close(parent_sock); 980 + 981 + /* Clean up child process */ 982 + kill(pid, SIGTERM); 983 + waitpid(pid, NULL, 0); 984 + } 985 + 986 + TEST_HARNESS_MAIN