Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cgroup/pids: Separate semantics of pids.events related to pids.max

Currently, when pids.max limit is breached in the hierarchy, the event
is counted and reported in the cgroup where the forking task resides.

This decouples the limit from the notification it causes, making it
hard to detect when the actual limit took effect.

Redefine the pids.events:max as: the number of times the limit of the
cgroup was hit.

(The implementation also distinguishes a "forkfail" event, but this is
currently not exposed, as it would fit better into pids.stat. The
"forkfail" count differs from pids.events:max only when pids.max is
configured on non-leaf cgroups.)

Since it changes semantics of the original "max" event, introduce this
change only in the v2 API of the controller and add a cgroup2 mount
option to revert to the legacy behavior.

Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

authored by

Michal Koutný and committed by
Tejun Heo
73e75e6f 0ac38002

+64 -18
+2 -1
Documentation/admin-guide/cgroup-v1/pids.rst
··· 36 36 37 37 The pids.events file contains event counters: 38 38 39 - - max: Number of times fork failed because limit was hit. 39 + - max: Number of times fork failed in the cgroup because limit was hit in 40 + self or ancestors. 40 41 41 42 Example 42 43 -------
+9 -4
Documentation/admin-guide/cgroup-v2.rst
··· 239 239 will not be tracked by the memory controller (even if cgroup 240 240 v2 is remounted later on). 241 241 242 + pids_localevents 243 + Represent fork failures inside cgroup's pids.events:max (v1 behavior), 244 + not its limit being hit (v2 behavior). 245 + 242 246 243 247 Organizing Processes and Threads 244 248 -------------------------------- ··· 2209 2205 descendants has ever reached. 2210 2206 2211 2207 pids.events 2212 - A read-only flat-keyed file which exists on non-root cgroups. The 2213 - following entries are defined. Unless specified otherwise, a value 2214 - change in this file generates a file modified event. 2208 + A read-only flat-keyed file which exists on non-root cgroups. Unless 2209 + specified otherwise, a value change in this file generates a file 2210 + modified event. The following entries are defined. 2215 2211 2216 2212 max 2217 - Number of times fork failed because limit was hit. 2213 + The number of times the cgroup's number of processes hit the 2214 + limit (see also pids_localevents). 2218 2215 2219 2216 Organisational operations are not blocked by cgroup policies, so it is 2220 2217 possible to have pids.current > pids.max. This can be done by either
+6 -1
include/linux/cgroup-defs.h
··· 119 119 /* 120 120 * Enable hugetlb accounting for the memory controller. 121 121 */ 122 - CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), 122 + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), 123 + 124 + /* 125 + * Enable legacy local pids.events. 126 + */ 127 + CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20), 123 128 }; 124 129 125 130 /* cftype->flags */
+14 -1
kernel/cgroup/cgroup.c
··· 1922 1922 Opt_memory_localevents, 1923 1923 Opt_memory_recursiveprot, 1924 1924 Opt_memory_hugetlb_accounting, 1925 + Opt_pids_localevents, 1925 1926 nr__cgroup2_params 1926 1927 }; 1927 1928 ··· 1932 1931 fsparam_flag("memory_localevents", Opt_memory_localevents), 1933 1932 fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), 1934 1933 fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), 1934 + fsparam_flag("pids_localevents", Opt_pids_localevents), 1935 1935 {} 1936 1936 }; 1937 1937 ··· 1961 1959 return 0; 1962 1960 case Opt_memory_hugetlb_accounting: 1963 1961 ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; 1962 + return 0; 1963 + case Opt_pids_localevents: 1964 + ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; 1964 1965 return 0; 1965 1966 } 1966 1967 return -EINVAL; ··· 1994 1989 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; 1995 1990 else 1996 1991 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; 1992 + 1993 + if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) 1994 + cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; 1995 + else 1996 + cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; 1997 1997 } 1998 1998 } 1999 1999 ··· 2014 2004 seq_puts(seq, ",memory_recursiveprot"); 2015 2005 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) 2016 2006 seq_puts(seq, ",memory_hugetlb_accounting"); 2007 + if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) 2008 + seq_puts(seq, ",pids_localevents"); 2017 2009 return 0; 2018 2010 } 2019 2011 ··· 7074 7062 "favordynmods\n" 7075 7063 "memory_localevents\n" 7076 7064 "memory_recursiveprot\n" 7077 - "memory_hugetlb_accounting\n"); 7065 + "memory_hugetlb_accounting\n" 7066 + "pids_localevents\n"); 7078 7067 } 7079 7068 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); 7080 7069
+33 -11
kernel/cgroup/pids.c
··· 38 38 #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) 39 39 #define PIDS_MAX_STR "max" 40 40 41 + enum pidcg_event { 42 + /* Fork failed in subtree because this pids_cgroup limit was hit. */ 43 + PIDCG_MAX, 44 + /* Fork failed in this pids_cgroup because ancestor limit was hit. */ 45 + PIDCG_FORKFAIL, 46 + NR_PIDCG_EVENTS, 47 + }; 48 + 41 49 struct pids_cgroup { 42 50 struct cgroup_subsys_state css; 43 51 ··· 60 52 /* Handle for "pids.events" */ 61 53 struct cgroup_file events_file; 62 54 63 - /* Number of times fork failed because limit was hit. */ 64 - atomic64_t events_limit; 55 + atomic64_t events[NR_PIDCG_EVENTS]; 65 56 }; 66 57 67 58 static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) ··· 155 148 * pids_try_charge - hierarchically try to charge the pid count 156 149 * @pids: the pid cgroup state 157 150 * @num: the number of pids to charge 151 + * @fail: storage of pid cgroup causing the fail 158 152 * 159 153 * This function follows the set limit. It will fail if the charge would cause 160 154 * the new value to exceed the hierarchical limit. Returns 0 if the charge 161 155 * succeeded, otherwise -EAGAIN. 162 156 */ 163 - static int pids_try_charge(struct pids_cgroup *pids, int num) 157 + static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail) 164 158 { 165 159 struct pids_cgroup *p, *q; 166 160 ··· 174 166 * p->limit is %PIDS_MAX then we know that this test will never 175 167 * fail. 176 168 */ 177 - if (new > limit) 169 + if (new > limit) { 170 + *fail = p; 178 171 goto revert; 179 - 172 + } 180 173 /* 181 174 * Not technically accurate if we go over limit somewhere up 182 175 * the hierarchy, but that's tolerable for the watermark. 
··· 245 236 static int pids_can_fork(struct task_struct *task, struct css_set *cset) 246 237 { 247 238 struct cgroup_subsys_state *css; 248 - struct pids_cgroup *pids; 239 + struct pids_cgroup *pids, *pids_over_limit; 249 240 int err; 250 241 251 242 if (cset) ··· 253 244 else 254 245 css = task_css_check(current, pids_cgrp_id, true); 255 246 pids = css_pids(css); 256 - err = pids_try_charge(pids, 1); 247 + err = pids_try_charge(pids, 1, &pids_over_limit); 257 248 if (err) { 258 - /* Only log the first time events_limit is incremented. */ 259 - if (atomic64_inc_return(&pids->events_limit) == 1) { 249 + /* compatibility on v1 where events were notified in leaves. */ 250 + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys)) 251 + pids_over_limit = pids; 252 + 253 + /* Only log the first time limit is hit. */ 254 + if (atomic64_inc_return(&pids->events[PIDCG_FORKFAIL]) == 1) { 260 255 pr_info("cgroup: fork rejected by pids controller in "); 261 - pr_cont_cgroup_path(css->cgroup); 256 + pr_cont_cgroup_path(pids->css.cgroup); 262 257 pr_cont("\n"); 263 258 } 259 + atomic64_inc(&pids_over_limit->events[PIDCG_MAX]); 260 + 264 261 cgroup_file_notify(&pids->events_file); 262 + if (pids_over_limit != pids) 263 + cgroup_file_notify(&pids_over_limit->events_file); 265 264 } 266 265 return err; 267 266 } ··· 357 340 static int pids_events_show(struct seq_file *sf, void *v) 358 341 { 359 342 struct pids_cgroup *pids = css_pids(seq_css(sf)); 343 + enum pidcg_event pe = PIDCG_MAX; 360 344 361 - seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); 345 + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || 346 + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) 347 + pe = PIDCG_FORKFAIL; 348 + 349 + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events[pe])); 362 350 return 0; 363 351 } 364 352