Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cgroup: remove the ns_cgroup

The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier and
leads to some problems:

* cgroup creation is out-of-control
* cgroup names can conflict when pids wrap around and are reused
* it is not possible to have a single process handling a lot of
namespaces without falling into an exponential creation time
* we may want to create a namespace without creating a cgroup

The ns_cgroup was replaced by a compatibility flag 'clone_children',
where a newly created cgroup will copy the parent cgroup values.
The userspace has to manually create a cgroup and add a task to
the 'tasks' file.

This patch removes the ns_cgroup as suggested in the following thread:

https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html

The 'cgroup_clone' function is removed because it is no longer used.

This is a userspace-visible change. Commit 45531757b45c ("cgroup: notify
ns_cgroup deprecated") (merged into 2.6.27) caused the kernel to emit a
printk warning to users that the feature is planned for removal. Since that
time we have heard from XXX users who were affected by this.

Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jamal Hadi Salim <hadi@cyberus.ca>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Acked-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Daniel Lezcano and committed by
Linus Torvalds
a77aea92 d846687d

+4 -287
+1 -1
Documentation/cgroups/cgroups.txt
··· 651 651 void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) 652 652 (cgroup_mutex held by caller) 653 653 654 - Called at the end of cgroup_clone() to do any parameter 654 + Called during cgroup_create() to do any parameter 655 655 initialization which might be required before a task could attach. For 656 656 example in cpusets, no task may attach before 'cpus' and 'mems' are set 657 657 up.
-1
arch/mips/configs/bcm47xx_defconfig
··· 16 16 CONFIG_AUDIT=y 17 17 CONFIG_TINY_RCU=y 18 18 CONFIG_CGROUPS=y 19 - CONFIG_CGROUP_NS=y 20 19 CONFIG_CGROUP_CPUACCT=y 21 20 CONFIG_RELAY=y 22 21 CONFIG_BLK_DEV_INITRD=y
-1
arch/mn10300/configs/asb2364_defconfig
··· 8 8 CONFIG_TASK_IO_ACCOUNTING=y 9 9 CONFIG_LOG_BUF_SHIFT=14 10 10 CONFIG_CGROUPS=y 11 - CONFIG_CGROUP_NS=y 12 11 CONFIG_CGROUP_FREEZER=y 13 12 CONFIG_CGROUP_DEVICE=y 14 13 CONFIG_CGROUP_CPUACCT=y
-1
arch/powerpc/configs/ppc6xx_defconfig
··· 10 10 CONFIG_TASK_IO_ACCOUNTING=y 11 11 CONFIG_AUDIT=y 12 12 CONFIG_CGROUPS=y 13 - CONFIG_CGROUP_NS=y 14 13 CONFIG_CGROUP_DEVICE=y 15 14 CONFIG_CGROUP_CPUACCT=y 16 15 CONFIG_RESOURCE_COUNTERS=y
-1
arch/powerpc/configs/pseries_defconfig
··· 15 15 CONFIG_IKCONFIG=y 16 16 CONFIG_IKCONFIG_PROC=y 17 17 CONFIG_CGROUPS=y 18 - CONFIG_CGROUP_NS=y 19 18 CONFIG_CGROUP_FREEZER=y 20 19 CONFIG_CGROUP_DEVICE=y 21 20 CONFIG_CPUSETS=y
-1
arch/sh/configs/apsh4ad0a_defconfig
··· 7 7 CONFIG_IKCONFIG_PROC=y 8 8 CONFIG_LOG_BUF_SHIFT=14 9 9 CONFIG_CGROUPS=y 10 - CONFIG_CGROUP_NS=y 11 10 CONFIG_CGROUP_FREEZER=y 12 11 CONFIG_CGROUP_DEVICE=y 13 12 CONFIG_CGROUP_CPUACCT=y
-1
arch/sh/configs/sdk7786_defconfig
··· 12 12 CONFIG_IKCONFIG_PROC=y 13 13 CONFIG_CGROUPS=y 14 14 CONFIG_CGROUP_DEBUG=y 15 - CONFIG_CGROUP_NS=y 16 15 CONFIG_CGROUP_FREEZER=y 17 16 CONFIG_CGROUP_DEVICE=y 18 17 CONFIG_CPUSETS=y
-1
arch/sh/configs/se7206_defconfig
··· 8 8 CONFIG_LOG_BUF_SHIFT=14 9 9 CONFIG_CGROUPS=y 10 10 CONFIG_CGROUP_DEBUG=y 11 - CONFIG_CGROUP_NS=y 12 11 CONFIG_CGROUP_DEVICE=y 13 12 CONFIG_CGROUP_CPUACCT=y 14 13 CONFIG_RESOURCE_COUNTERS=y
-1
arch/sh/configs/shx3_defconfig
··· 9 9 CONFIG_IKCONFIG_PROC=y 10 10 CONFIG_LOG_BUF_SHIFT=14 11 11 CONFIG_CGROUPS=y 12 - CONFIG_CGROUP_NS=y 13 12 CONFIG_CGROUP_FREEZER=y 14 13 CONFIG_CGROUP_DEVICE=y 15 14 CONFIG_CGROUP_CPUACCT=y
-1
arch/sh/configs/urquell_defconfig
··· 9 9 CONFIG_LOG_BUF_SHIFT=14 10 10 CONFIG_CGROUPS=y 11 11 CONFIG_CGROUP_DEBUG=y 12 - CONFIG_CGROUP_NS=y 13 12 CONFIG_CGROUP_FREEZER=y 14 13 CONFIG_CGROUP_DEVICE=y 15 14 CONFIG_CPUSETS=y
-1
arch/x86/configs/i386_defconfig
··· 10 10 CONFIG_AUDIT=y 11 11 CONFIG_LOG_BUF_SHIFT=18 12 12 CONFIG_CGROUPS=y 13 - CONFIG_CGROUP_NS=y 14 13 CONFIG_CGROUP_FREEZER=y 15 14 CONFIG_CPUSETS=y 16 15 CONFIG_CGROUP_CPUACCT=y
-1
arch/x86/configs/x86_64_defconfig
··· 11 11 CONFIG_AUDIT=y 12 12 CONFIG_LOG_BUF_SHIFT=18 13 13 CONFIG_CGROUPS=y 14 - CONFIG_CGROUP_NS=y 15 14 CONFIG_CGROUP_FREEZER=y 16 15 CONFIG_CPUSETS=y 17 16 CONFIG_CGROUP_CPUACCT=y
-3
include/linux/cgroup.h
··· 555 555 return task_subsys_state(task, subsys_id)->cgroup; 556 556 } 557 557 558 - int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss, 559 - char *nodename); 560 - 561 558 /* A cgroup_iter should be treated as an opaque object */ 562 559 struct cgroup_iter { 563 560 struct list_head *cg_link;
-6
include/linux/cgroup_subsys.h
··· 19 19 20 20 /* */ 21 21 22 - #ifdef CONFIG_CGROUP_NS 23 - SUBSYS(ns) 24 - #endif 25 - 26 - /* */ 27 - 28 22 #ifdef CONFIG_CGROUP_SCHED 29 23 SUBSYS(cpu_cgroup) 30 24 #endif
-9
include/linux/nsproxy.h
··· 81 81 atomic_inc(&ns->count); 82 82 } 83 83 84 - #ifdef CONFIG_CGROUP_NS 85 - int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid); 86 - #else 87 - static inline int ns_cgroup_clone(struct task_struct *tsk, struct pid *pid) 88 - { 89 - return 0; 90 - } 91 - #endif 92 - 93 84 #endif
-8
init/Kconfig
··· 589 589 590 590 Say N if unsure. 591 591 592 - config CGROUP_NS 593 - bool "Namespace cgroup subsystem" 594 - help 595 - Provides a simple namespace cgroup subsystem to 596 - provide hierarchical naming of sets of namespaces, 597 - for instance virtual servers and checkpoint/restart 598 - jobs. 599 - 600 592 config CGROUP_FREEZER 601 593 bool "Freezer cgroup subsystem" 602 594 help
-1
kernel/Makefile
··· 61 61 obj-$(CONFIG_CGROUPS) += cgroup.o 62 62 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 63 63 obj-$(CONFIG_CPUSETS) += cpuset.o 64 - obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 65 64 obj-$(CONFIG_UTS_NS) += utsname.o 66 65 obj-$(CONFIG_USER_NS) += user_namespace.o 67 66 obj-$(CONFIG_PID_NS) += pid_namespace.o
-116
kernel/cgroup.c
··· 4630 4630 } 4631 4631 4632 4632 /** 4633 - * cgroup_clone - clone the cgroup the given subsystem is attached to 4634 - * @tsk: the task to be moved 4635 - * @subsys: the given subsystem 4636 - * @nodename: the name for the new cgroup 4637 - * 4638 - * Duplicate the current cgroup in the hierarchy that the given 4639 - * subsystem is attached to, and move this task into the new 4640 - * child. 4641 - */ 4642 - int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, 4643 - char *nodename) 4644 - { 4645 - struct dentry *dentry; 4646 - int ret = 0; 4647 - struct cgroup *parent, *child; 4648 - struct inode *inode; 4649 - struct css_set *cg; 4650 - struct cgroupfs_root *root; 4651 - struct cgroup_subsys *ss; 4652 - 4653 - /* We shouldn't be called by an unregistered subsystem */ 4654 - BUG_ON(!subsys->active); 4655 - 4656 - /* First figure out what hierarchy and cgroup we're dealing 4657 - * with, and pin them so we can drop cgroup_mutex */ 4658 - mutex_lock(&cgroup_mutex); 4659 - again: 4660 - root = subsys->root; 4661 - if (root == &rootnode) { 4662 - mutex_unlock(&cgroup_mutex); 4663 - return 0; 4664 - } 4665 - 4666 - /* Pin the hierarchy */ 4667 - if (!atomic_inc_not_zero(&root->sb->s_active)) { 4668 - /* We race with the final deactivate_super() */ 4669 - mutex_unlock(&cgroup_mutex); 4670 - return 0; 4671 - } 4672 - 4673 - /* Keep the cgroup alive */ 4674 - task_lock(tsk); 4675 - parent = task_cgroup(tsk, subsys->subsys_id); 4676 - cg = tsk->cgroups; 4677 - get_css_set(cg); 4678 - task_unlock(tsk); 4679 - 4680 - mutex_unlock(&cgroup_mutex); 4681 - 4682 - /* Now do the VFS work to create a cgroup */ 4683 - inode = parent->dentry->d_inode; 4684 - 4685 - /* Hold the parent directory mutex across this operation to 4686 - * stop anyone else deleting the new cgroup */ 4687 - mutex_lock(&inode->i_mutex); 4688 - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); 4689 - if (IS_ERR(dentry)) { 4690 - printk(KERN_INFO 4691 - "cgroup: 
Couldn't allocate dentry for %s: %ld\n", nodename, 4692 - PTR_ERR(dentry)); 4693 - ret = PTR_ERR(dentry); 4694 - goto out_release; 4695 - } 4696 - 4697 - /* Create the cgroup directory, which also creates the cgroup */ 4698 - ret = vfs_mkdir(inode, dentry, 0755); 4699 - child = __d_cgrp(dentry); 4700 - dput(dentry); 4701 - if (ret) { 4702 - printk(KERN_INFO 4703 - "Failed to create cgroup %s: %d\n", nodename, 4704 - ret); 4705 - goto out_release; 4706 - } 4707 - 4708 - /* The cgroup now exists. Retake cgroup_mutex and check 4709 - * that we're still in the same state that we thought we 4710 - * were. */ 4711 - mutex_lock(&cgroup_mutex); 4712 - if ((root != subsys->root) || 4713 - (parent != task_cgroup(tsk, subsys->subsys_id))) { 4714 - /* Aargh, we raced ... */ 4715 - mutex_unlock(&inode->i_mutex); 4716 - put_css_set(cg); 4717 - 4718 - deactivate_super(root->sb); 4719 - /* The cgroup is still accessible in the VFS, but 4720 - * we're not going to try to rmdir() it at this 4721 - * point. */ 4722 - printk(KERN_INFO 4723 - "Race in cgroup_clone() - leaking cgroup %s\n", 4724 - nodename); 4725 - goto again; 4726 - } 4727 - 4728 - /* do any required auto-setup */ 4729 - for_each_subsys(root, ss) { 4730 - if (ss->post_clone) 4731 - ss->post_clone(ss, child); 4732 - } 4733 - 4734 - /* All seems fine. Finish by moving the task into the new cgroup */ 4735 - ret = cgroup_attach_task(child, tsk); 4736 - mutex_unlock(&cgroup_mutex); 4737 - 4738 - out_release: 4739 - mutex_unlock(&inode->i_mutex); 4740 - 4741 - mutex_lock(&cgroup_mutex); 4742 - put_css_set(cg); 4743 - mutex_unlock(&cgroup_mutex); 4744 - deactivate_super(root->sb); 4745 - return ret; 4746 - } 4747 - 4748 - /** 4749 4633 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp 4750 4634 * @cgrp: the cgroup in question 4751 4635 * @task: the task in question
+3 -4
kernel/cpuset.c
··· 1802 1802 } 1803 1803 1804 1804 /* 1805 - * post_clone() is called at the end of cgroup_clone(). 1806 - * 'cgroup' was just created automatically as a result of 1807 - * a cgroup_clone(), and the current task is about to 1808 - * be moved into 'cgroup'. 1805 + * post_clone() is called during cgroup_create() when the 1806 + * clone_children mount argument was specified. The cgroup 1807 + * can not yet have any tasks. 1809 1808 * 1810 1809 * Currently we refuse to set up the cgroup - thereby 1811 1810 * refusing the task to be entered, and as a result refusing
-6
kernel/fork.c
··· 1229 1229 if (clone_flags & CLONE_THREAD) 1230 1230 p->tgid = current->tgid; 1231 1231 1232 - if (current->nsproxy != p->nsproxy) { 1233 - retval = ns_cgroup_clone(p, pid); 1234 - if (retval) 1235 - goto bad_fork_free_pid; 1236 - } 1237 - 1238 1232 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1239 1233 /* 1240 1234 * Clear TID on mm_release()?
-118
kernel/ns_cgroup.c
··· 1 - /* 2 - * ns_cgroup.c - namespace cgroup subsystem 3 - * 4 - * Copyright 2006, 2007 IBM Corp 5 - */ 6 - 7 - #include <linux/module.h> 8 - #include <linux/cgroup.h> 9 - #include <linux/fs.h> 10 - #include <linux/proc_fs.h> 11 - #include <linux/slab.h> 12 - #include <linux/nsproxy.h> 13 - 14 - struct ns_cgroup { 15 - struct cgroup_subsys_state css; 16 - }; 17 - 18 - struct cgroup_subsys ns_subsys; 19 - 20 - static inline struct ns_cgroup *cgroup_to_ns( 21 - struct cgroup *cgroup) 22 - { 23 - return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), 24 - struct ns_cgroup, css); 25 - } 26 - 27 - int ns_cgroup_clone(struct task_struct *task, struct pid *pid) 28 - { 29 - char name[PROC_NUMBUF]; 30 - 31 - snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); 32 - return cgroup_clone(task, &ns_subsys, name); 33 - } 34 - 35 - /* 36 - * Rules: 37 - * 1. you can only enter a cgroup which is a descendant of your current 38 - * cgroup 39 - * 2. you can only place another process into a cgroup if 40 - * a. you have CAP_SYS_ADMIN 41 - * b. your cgroup is an ancestor of task's destination cgroup 42 - * (hence either you are in the same cgroup as task, or in an 43 - * ancestor cgroup thereof) 44 - */ 45 - static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, 46 - struct task_struct *task, bool threadgroup) 47 - { 48 - if (current != task) { 49 - if (!capable(CAP_SYS_ADMIN)) 50 - return -EPERM; 51 - 52 - if (!cgroup_is_descendant(new_cgroup, current)) 53 - return -EPERM; 54 - } 55 - 56 - if (!cgroup_is_descendant(new_cgroup, task)) 57 - return -EPERM; 58 - 59 - if (threadgroup) { 60 - struct task_struct *c; 61 - rcu_read_lock(); 62 - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { 63 - if (!cgroup_is_descendant(new_cgroup, c)) { 64 - rcu_read_unlock(); 65 - return -EPERM; 66 - } 67 - } 68 - rcu_read_unlock(); 69 - } 70 - 71 - return 0; 72 - } 73 - 74 - /* 75 - * Rules: you can only create a cgroup if 76 - * 1. 
you are capable(CAP_SYS_ADMIN) 77 - * 2. the target cgroup is a descendant of your own cgroup 78 - */ 79 - static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, 80 - struct cgroup *cgroup) 81 - { 82 - struct ns_cgroup *ns_cgroup; 83 - 84 - if (!capable(CAP_SYS_ADMIN)) 85 - return ERR_PTR(-EPERM); 86 - if (!cgroup_is_descendant(cgroup, current)) 87 - return ERR_PTR(-EPERM); 88 - if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { 89 - printk("ns_cgroup can't be created with parent " 90 - "'clone_children' set.\n"); 91 - return ERR_PTR(-EINVAL); 92 - } 93 - 94 - printk_once("ns_cgroup deprecated: consider using the " 95 - "'clone_children' flag without the ns_cgroup.\n"); 96 - 97 - ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 98 - if (!ns_cgroup) 99 - return ERR_PTR(-ENOMEM); 100 - return &ns_cgroup->css; 101 - } 102 - 103 - static void ns_destroy(struct cgroup_subsys *ss, 104 - struct cgroup *cgroup) 105 - { 106 - struct ns_cgroup *ns_cgroup; 107 - 108 - ns_cgroup = cgroup_to_ns(cgroup); 109 - kfree(ns_cgroup); 110 - } 111 - 112 - struct cgroup_subsys ns_subsys = { 113 - .name = "ns", 114 - .can_attach = ns_can_attach, 115 - .create = ns_create, 116 - .destroy = ns_destroy, 117 - .subsys_id = ns_subsys_id, 118 - };
-4
kernel/nsproxy.c
··· 201 201 goto out; 202 202 } 203 203 204 - err = ns_cgroup_clone(current, task_pid(current)); 205 - if (err) 206 - put_nsproxy(*new_nsp); 207 - 208 204 out: 209 205 return err; 210 206 }