Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cgroups: make procs file writable

Make procs file writable to move all threads by tgid at once.

Add functionality that enables users to move all threads in a threadgroup
at once to a cgroup by writing the tgid to the 'cgroup.procs' file. The
current implementation makes use of a per-threadgroup rwsem that's taken
for reading in the fork() path to prevent newly forking threads within the
threadgroup from "escaping" while the move is in progress.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Ben Blum and committed by Linus Torvalds
74a1166d f780bdb7

+401 -47
+8 -1
Documentation/cgroups/cgroups.txt
··· 236 236 - cgroup.procs: list of tgids in the cgroup. This list is not 237 237 guaranteed to be sorted or free of duplicate tgids, and userspace 238 238 should sort/uniquify the list if this property is required. 239 - This is a read-only file, for now. 239 + Writing a thread group id into this file moves all threads in that 240 + group into this cgroup. 240 241 - notify_on_release flag: run the release agent on exit? 241 242 - release_agent: the path to use for release notifications (this file 242 243 exists in the top cgroup only) ··· 430 429 You can attach the current shell task by echoing 0: 431 430 432 431 # echo 0 > tasks 432 + 433 + You can use the cgroup.procs file instead of the tasks file to move all 434 + threads in a threadgroup at once. Echoing the pid of any task in a 435 + threadgroup to cgroup.procs causes all tasks in that threadgroup to be 436 + attached to the cgroup. Writing 0 to cgroup.procs moves all tasks 437 + in the writing task's threadgroup. 433 438 434 439 Note: Since every task is always a member of exactly one cgroup in each 435 440 mounted hierarchy, to remove a task from its current cgroup you must
+393 -46
kernel/cgroup.c
··· 1735 1735 } 1736 1736 EXPORT_SYMBOL_GPL(cgroup_path); 1737 1737 1738 + /* 1739 + * cgroup_task_migrate - move a task from one cgroup to another. 1740 + * 1741 + * 'guarantee' is set if the caller promises that a new css_set for the task 1742 + * will already exist. If not set, this function might sleep, and can fail with 1743 + * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1744 + */ 1745 + static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1746 + struct task_struct *tsk, bool guarantee) 1747 + { 1748 + struct css_set *oldcg; 1749 + struct css_set *newcg; 1750 + 1751 + /* 1752 + * get old css_set. we need to take task_lock and refcount it, because 1753 + * an exiting task can change its css_set to init_css_set and drop its 1754 + * old one without taking cgroup_mutex. 1755 + */ 1756 + task_lock(tsk); 1757 + oldcg = tsk->cgroups; 1758 + get_css_set(oldcg); 1759 + task_unlock(tsk); 1760 + 1761 + /* locate or allocate a new css_set for this task. */ 1762 + if (guarantee) { 1763 + /* we know the css_set we want already exists. */ 1764 + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 1765 + read_lock(&css_set_lock); 1766 + newcg = find_existing_css_set(oldcg, cgrp, template); 1767 + BUG_ON(!newcg); 1768 + get_css_set(newcg); 1769 + read_unlock(&css_set_lock); 1770 + } else { 1771 + might_sleep(); 1772 + /* find_css_set will give us newcg already referenced. */ 1773 + newcg = find_css_set(oldcg, cgrp); 1774 + if (!newcg) { 1775 + put_css_set(oldcg); 1776 + return -ENOMEM; 1777 + } 1778 + } 1779 + put_css_set(oldcg); 1780 + 1781 + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. 
*/ 1782 + task_lock(tsk); 1783 + if (tsk->flags & PF_EXITING) { 1784 + task_unlock(tsk); 1785 + put_css_set(newcg); 1786 + return -ESRCH; 1787 + } 1788 + rcu_assign_pointer(tsk->cgroups, newcg); 1789 + task_unlock(tsk); 1790 + 1791 + /* Update the css_set linked lists if we're using them */ 1792 + write_lock(&css_set_lock); 1793 + if (!list_empty(&tsk->cg_list)) 1794 + list_move(&tsk->cg_list, &newcg->tasks); 1795 + write_unlock(&css_set_lock); 1796 + 1797 + /* 1798 + * We just gained a reference on oldcg by taking it from the task. As 1799 + * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1800 + * it here; it will be freed under RCU. 1801 + */ 1802 + put_css_set(oldcg); 1803 + 1804 + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1805 + return 0; 1806 + } 1807 + 1738 1808 /** 1739 1809 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1740 1810 * @cgrp: the cgroup the task is attaching to ··· 1815 1745 */ 1816 1746 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1817 1747 { 1818 - int retval = 0; 1748 + int retval; 1819 1749 struct cgroup_subsys *ss, *failed_ss = NULL; 1820 1750 struct cgroup *oldcgrp; 1821 - struct css_set *cg; 1822 - struct css_set *newcg; 1823 1751 struct cgroupfs_root *root = cgrp->root; 1824 1752 1825 1753 /* Nothing to do if the task is already in that cgroup */ ··· 1848 1780 } 1849 1781 } 1850 1782 1851 - task_lock(tsk); 1852 - cg = tsk->cgroups; 1853 - get_css_set(cg); 1854 - task_unlock(tsk); 1855 - /* 1856 - * Locate or allocate a new css_set for this task, 1857 - * based on its final set of cgroups 1858 - */ 1859 - newcg = find_css_set(cg, cgrp); 1860 - put_css_set(cg); 1861 - if (!newcg) { 1862 - retval = -ENOMEM; 1783 + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1784 + if (retval) 1863 1785 goto out; 1864 - } 1865 - 1866 - task_lock(tsk); 1867 - if (tsk->flags & PF_EXITING) { 1868 - task_unlock(tsk); 1869 - put_css_set(newcg); 1870 - retval = -ESRCH; 1871 - goto out; 1872 
- } 1873 - rcu_assign_pointer(tsk->cgroups, newcg); 1874 - task_unlock(tsk); 1875 - 1876 - /* Update the css_set linked lists if we're using them */ 1877 - write_lock(&css_set_lock); 1878 - if (!list_empty(&tsk->cg_list)) 1879 - list_move(&tsk->cg_list, &newcg->tasks); 1880 - write_unlock(&css_set_lock); 1881 1786 1882 1787 for_each_subsys(root, ss) { 1883 1788 if (ss->pre_attach) ··· 1860 1819 if (ss->attach) 1861 1820 ss->attach(ss, cgrp, oldcgrp, tsk); 1862 1821 } 1863 - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1822 + 1864 1823 synchronize_rcu(); 1865 - put_css_set(cg); 1866 1824 1867 1825 /* 1868 1826 * wake up rmdir() waiter. the rmdir should fail since the cgroup ··· 1911 1871 EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1912 1872 1913 1873 /* 1914 - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1915 - * held. May take task_lock of task 1874 + * cgroup_attach_proc works in two stages, the first of which prefetches all 1875 + * new css_sets needed (to make sure we have enough memory before committing 1876 + * to the move) and stores them in a list of entries of the following type. 1877 + * TODO: possible optimization: use css_set->rcu_head for chaining instead 1916 1878 */ 1917 - static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 1879 + struct cg_list_entry { 1880 + struct css_set *cg; 1881 + struct list_head links; 1882 + }; 1883 + 1884 + static bool css_set_check_fetched(struct cgroup *cgrp, 1885 + struct task_struct *tsk, struct css_set *cg, 1886 + struct list_head *newcg_list) 1887 + { 1888 + struct css_set *newcg; 1889 + struct cg_list_entry *cg_entry; 1890 + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 1891 + 1892 + read_lock(&css_set_lock); 1893 + newcg = find_existing_css_set(cg, cgrp, template); 1894 + if (newcg) 1895 + get_css_set(newcg); 1896 + read_unlock(&css_set_lock); 1897 + 1898 + /* doesn't exist at all? 
*/ 1899 + if (!newcg) 1900 + return false; 1901 + /* see if it's already in the list */ 1902 + list_for_each_entry(cg_entry, newcg_list, links) { 1903 + if (cg_entry->cg == newcg) { 1904 + put_css_set(newcg); 1905 + return true; 1906 + } 1907 + } 1908 + 1909 + /* not found */ 1910 + put_css_set(newcg); 1911 + return false; 1912 + } 1913 + 1914 + /* 1915 + * Find the new css_set and store it in the list in preparation for moving the 1916 + * given task to the given cgroup. Returns 0 or -ENOMEM. 1917 + */ 1918 + static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, 1919 + struct list_head *newcg_list) 1920 + { 1921 + struct css_set *newcg; 1922 + struct cg_list_entry *cg_entry; 1923 + 1924 + /* ensure a new css_set will exist for this thread */ 1925 + newcg = find_css_set(cg, cgrp); 1926 + if (!newcg) 1927 + return -ENOMEM; 1928 + /* add it to the list */ 1929 + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); 1930 + if (!cg_entry) { 1931 + put_css_set(newcg); 1932 + return -ENOMEM; 1933 + } 1934 + cg_entry->cg = newcg; 1935 + list_add(&cg_entry->links, newcg_list); 1936 + return 0; 1937 + } 1938 + 1939 + /** 1940 + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 1941 + * @cgrp: the cgroup to attach to 1942 + * @leader: the threadgroup leader task_struct of the group to be attached 1943 + * 1944 + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 1945 + * take task_lock of each thread in leader's threadgroup individually in turn. 
1946 + */ 1947 + int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1948 + { 1949 + int retval, i, group_size; 1950 + struct cgroup_subsys *ss, *failed_ss = NULL; 1951 + bool cancel_failed_ss = false; 1952 + /* guaranteed to be initialized later, but the compiler needs this */ 1953 + struct cgroup *oldcgrp = NULL; 1954 + struct css_set *oldcg; 1955 + struct cgroupfs_root *root = cgrp->root; 1956 + /* threadgroup list cursor and array */ 1957 + struct task_struct *tsk; 1958 + struct task_struct **group; 1959 + /* 1960 + * we need to make sure we have css_sets for all the tasks we're 1961 + * going to move -before- we actually start moving them, so that in 1962 + * case we get an ENOMEM we can bail out before making any changes. 1963 + */ 1964 + struct list_head newcg_list; 1965 + struct cg_list_entry *cg_entry, *temp_nobe; 1966 + 1967 + /* 1968 + * step 0: in order to do expensive, possibly blocking operations for 1969 + * every thread, we cannot iterate the thread group list, since it needs 1970 + * rcu or tasklist locked. instead, build an array of all threads in the 1971 + * group - threadgroup_fork_lock prevents new threads from appearing, 1972 + * and if threads exit, this will just be an over-estimate. 1973 + */ 1974 + group_size = get_nr_threads(leader); 1975 + group = kmalloc(group_size * sizeof(*group), GFP_KERNEL); 1976 + if (!group) 1977 + return -ENOMEM; 1978 + 1979 + /* prevent changes to the threadgroup list while we take a snapshot. */ 1980 + rcu_read_lock(); 1981 + if (!thread_group_leader(leader)) { 1982 + /* 1983 + * a race with de_thread from another thread's exec() may strip 1984 + * us of our leadership, making while_each_thread unsafe to use 1985 + * on this task. if this happens, there is no choice but to 1986 + * throw this task away and try again (from cgroup_procs_write); 1987 + * this is "double-double-toil-and-trouble-check locking". 
1988 + */ 1989 + rcu_read_unlock(); 1990 + retval = -EAGAIN; 1991 + goto out_free_group_list; 1992 + } 1993 + /* take a reference on each task in the group to go in the array. */ 1994 + tsk = leader; 1995 + i = 0; 1996 + do { 1997 + /* as per above, nr_threads may decrease, but not increase. */ 1998 + BUG_ON(i >= group_size); 1999 + get_task_struct(tsk); 2000 + group[i] = tsk; 2001 + i++; 2002 + } while_each_thread(leader, tsk); 2003 + /* remember the number of threads in the array for later. */ 2004 + group_size = i; 2005 + rcu_read_unlock(); 2006 + 2007 + /* 2008 + * step 1: check that we can legitimately attach to the cgroup. 2009 + */ 2010 + for_each_subsys(root, ss) { 2011 + if (ss->can_attach) { 2012 + retval = ss->can_attach(ss, cgrp, leader); 2013 + if (retval) { 2014 + failed_ss = ss; 2015 + goto out_cancel_attach; 2016 + } 2017 + } 2018 + /* a callback to be run on every thread in the threadgroup. */ 2019 + if (ss->can_attach_task) { 2020 + /* run on each task in the threadgroup. */ 2021 + for (i = 0; i < group_size; i++) { 2022 + retval = ss->can_attach_task(cgrp, group[i]); 2023 + if (retval) { 2024 + failed_ss = ss; 2025 + cancel_failed_ss = true; 2026 + goto out_cancel_attach; 2027 + } 2028 + } 2029 + } 2030 + } 2031 + 2032 + /* 2033 + * step 2: make sure css_sets exist for all threads to be migrated. 2034 + * we use find_css_set, which allocates a new one if necessary. 
2035 + */ 2036 + INIT_LIST_HEAD(&newcg_list); 2037 + for (i = 0; i < group_size; i++) { 2038 + tsk = group[i]; 2039 + /* nothing to do if this task is already in the cgroup */ 2040 + oldcgrp = task_cgroup_from_root(tsk, root); 2041 + if (cgrp == oldcgrp) 2042 + continue; 2043 + /* get old css_set pointer */ 2044 + task_lock(tsk); 2045 + if (tsk->flags & PF_EXITING) { 2046 + /* ignore this task if it's going away */ 2047 + task_unlock(tsk); 2048 + continue; 2049 + } 2050 + oldcg = tsk->cgroups; 2051 + get_css_set(oldcg); 2052 + task_unlock(tsk); 2053 + /* see if the new one for us is already in the list? */ 2054 + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { 2055 + /* was already there, nothing to do. */ 2056 + put_css_set(oldcg); 2057 + } else { 2058 + /* we don't already have it. get new one. */ 2059 + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2060 + put_css_set(oldcg); 2061 + if (retval) 2062 + goto out_list_teardown; 2063 + } 2064 + } 2065 + 2066 + /* 2067 + * step 3: now that we're guaranteed success wrt the css_sets, proceed 2068 + * to move all tasks to the new cgroup, calling ss->attach_task for each 2069 + * one along the way. there are no failure cases after here, so this is 2070 + * the commit point. 2071 + */ 2072 + for_each_subsys(root, ss) { 2073 + if (ss->pre_attach) 2074 + ss->pre_attach(cgrp); 2075 + } 2076 + for (i = 0; i < group_size; i++) { 2077 + tsk = group[i]; 2078 + /* leave current thread as it is if it's already there */ 2079 + oldcgrp = task_cgroup_from_root(tsk, root); 2080 + if (cgrp == oldcgrp) 2081 + continue; 2082 + /* attach each task to each subsystem */ 2083 + for_each_subsys(root, ss) { 2084 + if (ss->attach_task) 2085 + ss->attach_task(cgrp, tsk); 2086 + } 2087 + /* if the thread is PF_EXITING, it can just get skipped. */ 2088 + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); 2089 + BUG_ON(retval != 0 && retval != -ESRCH); 2090 + } 2091 + /* nothing is sensitive to fork() after this point. 
*/ 2092 + 2093 + /* 2094 + * step 4: do expensive, non-thread-specific subsystem callbacks. 2095 + * TODO: if ever a subsystem needs to know the oldcgrp for each task 2096 + * being moved, this call will need to be reworked to communicate that. 2097 + */ 2098 + for_each_subsys(root, ss) { 2099 + if (ss->attach) 2100 + ss->attach(ss, cgrp, oldcgrp, leader); 2101 + } 2102 + 2103 + /* 2104 + * step 5: success! and cleanup 2105 + */ 2106 + synchronize_rcu(); 2107 + cgroup_wakeup_rmdir_waiter(cgrp); 2108 + retval = 0; 2109 + out_list_teardown: 2110 + /* clean up the list of prefetched css_sets. */ 2111 + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { 2112 + list_del(&cg_entry->links); 2113 + put_css_set(cg_entry->cg); 2114 + kfree(cg_entry); 2115 + } 2116 + out_cancel_attach: 2117 + /* same deal as in cgroup_attach_task */ 2118 + if (retval) { 2119 + for_each_subsys(root, ss) { 2120 + if (ss == failed_ss) { 2121 + if (cancel_failed_ss && ss->cancel_attach) 2122 + ss->cancel_attach(ss, cgrp, leader); 2123 + break; 2124 + } 2125 + if (ss->cancel_attach) 2126 + ss->cancel_attach(ss, cgrp, leader); 2127 + } 2128 + } 2129 + /* clean up the array of referenced threads in the group. */ 2130 + for (i = 0; i < group_size; i++) 2131 + put_task_struct(group[i]); 2132 + out_free_group_list: 2133 + kfree(group); 2134 + return retval; 2135 + } 2136 + 2137 + /* 2138 + * Find the task_struct of the task to attach by vpid and pass it along to the 2139 + * function to attach either it or all tasks in its threadgroup. Will take 2140 + * cgroup_mutex; may take task_lock of task. 
2141 + */ 2142 + static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 1918 2143 { 1919 2144 struct task_struct *tsk; 1920 2145 const struct cred *cred = current_cred(), *tcred; 1921 2146 int ret; 1922 2147 2148 + if (!cgroup_lock_live_group(cgrp)) 2149 + return -ENODEV; 2150 + 1923 2151 if (pid) { 1924 2152 rcu_read_lock(); 1925 2153 tsk = find_task_by_vpid(pid); 1926 - if (!tsk || tsk->flags & PF_EXITING) { 2154 + if (!tsk) { 1927 2155 rcu_read_unlock(); 2156 + cgroup_unlock(); 2157 + return -ESRCH; 2158 + } 2159 + if (threadgroup) { 2160 + /* 2161 + * RCU protects this access, since tsk was found in the 2162 + * tid map. a race with de_thread may cause group_leader 2163 + * to stop being the leader, but cgroup_attach_proc will 2164 + * detect it later. 2165 + */ 2166 + tsk = tsk->group_leader; 2167 + } else if (tsk->flags & PF_EXITING) { 2168 + /* optimization for the single-task-only case */ 2169 + rcu_read_unlock(); 2170 + cgroup_unlock(); 1928 2171 return -ESRCH; 1929 2172 } 1930 2173 2174 + /* 2175 + * even if we're attaching all tasks in the thread group, we 2176 + * only need to check permissions on one of them. 
2177 + */ 1931 2178 tcred = __task_cred(tsk); 1932 2179 if (cred->euid && 1933 2180 cred->euid != tcred->uid && 1934 2181 cred->euid != tcred->suid) { 1935 2182 rcu_read_unlock(); 2183 + cgroup_unlock(); 1936 2184 return -EACCES; 1937 2185 } 1938 2186 get_task_struct(tsk); 1939 2187 rcu_read_unlock(); 1940 2188 } else { 1941 - tsk = current; 2189 + if (threadgroup) 2190 + tsk = current->group_leader; 2191 + else 2192 + tsk = current; 1942 2193 get_task_struct(tsk); 1943 2194 } 1944 2195 1945 - ret = cgroup_attach_task(cgrp, tsk); 2196 + if (threadgroup) { 2197 + threadgroup_fork_write_lock(tsk); 2198 + ret = cgroup_attach_proc(cgrp, tsk); 2199 + threadgroup_fork_write_unlock(tsk); 2200 + } else { 2201 + ret = cgroup_attach_task(cgrp, tsk); 2202 + } 1946 2203 put_task_struct(tsk); 2204 + cgroup_unlock(); 1947 2205 return ret; 1948 2206 } 1949 2207 1950 2208 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 1951 2209 { 2210 + return attach_task_by_pid(cgrp, pid, false); 2211 + } 2212 + 2213 + static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2214 + { 1952 2215 int ret; 1953 - if (!cgroup_lock_live_group(cgrp)) 1954 - return -ENODEV; 1955 - ret = attach_task_by_pid(cgrp, pid); 1956 - cgroup_unlock(); 2216 + do { 2217 + /* 2218 + * attach_proc fails with -EAGAIN if threadgroup leadership 2219 + * changes in the middle of the operation, in which case we need 2220 + * to find the task_struct for the new leader and start over. 
2221 + */ 2222 + ret = attach_task_by_pid(cgrp, tgid, true); 2223 + } while (ret == -EAGAIN); 1957 2224 return ret; 1958 2225 } 1959 2226 ··· 3617 3270 { 3618 3271 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3619 3272 .open = cgroup_procs_open, 3620 - /* .write_u64 = cgroup_procs_write, TODO */ 3273 + .write_u64 = cgroup_procs_write, 3621 3274 .release = cgroup_pidlist_release, 3622 - .mode = S_IRUGO, 3275 + .mode = S_IRUGO | S_IWUSR, 3623 3276 }, 3624 3277 { 3625 3278 .name = "notify_on_release",