at v3.6 20 kB view raw
1#ifndef _LINUX_CGROUP_H 2#define _LINUX_CGROUP_H 3/* 4 * cgroup interface 5 * 6 * Copyright (C) 2003 BULL SA 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 8 * 9 */ 10 11#include <linux/sched.h> 12#include <linux/cpumask.h> 13#include <linux/nodemask.h> 14#include <linux/rcupdate.h> 15#include <linux/cgroupstats.h> 16#include <linux/prio_heap.h> 17#include <linux/rwsem.h> 18#include <linux/idr.h> 19#include <linux/workqueue.h> 20 21#ifdef CONFIG_CGROUPS 22 23struct cgroupfs_root; 24struct cgroup_subsys; 25struct inode; 26struct cgroup; 27struct css_id; 28 29extern int cgroup_init_early(void); 30extern int cgroup_init(void); 31extern void cgroup_lock(void); 32extern int cgroup_lock_is_held(void); 33extern bool cgroup_lock_live_group(struct cgroup *cgrp); 34extern void cgroup_unlock(void); 35extern void cgroup_fork(struct task_struct *p); 36extern void cgroup_fork_callbacks(struct task_struct *p); 37extern void cgroup_post_fork(struct task_struct *p); 38extern void cgroup_exit(struct task_struct *p, int run_callbacks); 39extern int cgroupstats_build(struct cgroupstats *stats, 40 struct dentry *dentry); 41extern int cgroup_load_subsys(struct cgroup_subsys *ss); 42extern void cgroup_unload_subsys(struct cgroup_subsys *ss); 43 44extern const struct file_operations proc_cgroup_operations; 45 46/* Define the enumeration of all builtin cgroup subsystems */ 47#define SUBSYS(_x) _x ## _subsys_id, 48enum cgroup_subsys_id { 49#include <linux/cgroup_subsys.h> 50 CGROUP_BUILTIN_SUBSYS_COUNT 51}; 52#undef SUBSYS 53/* 54 * This define indicates the maximum number of subsystems that can be loaded 55 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep 56 * track of all of them. 57 */ 58#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long)) 59 60/* Per-subsystem/per-cgroup state maintained by the system. */ 61struct cgroup_subsys_state { 62 /* 63 * The cgroup that this subsystem is attached to. Useful 64 * for subsystems that want to know about the cgroup 65 * hierarchy structure 66 */ 67 struct cgroup *cgroup; 68 69 /* 70 * State maintained by the cgroup system to allow subsystems 71 * to be "busy". Should be accessed via css_get(), 72 * css_tryget() and and css_put(). 73 */ 74 75 atomic_t refcnt; 76 77 unsigned long flags; 78 /* ID for this css, if possible */ 79 struct css_id __rcu *id; 80 81 /* Used to put @cgroup->dentry on the last css_put() */ 82 struct work_struct dput_work; 83}; 84 85/* bits in struct cgroup_subsys_state flags field */ 86enum { 87 CSS_ROOT, /* This CSS is the root of the subsystem */ 88 CSS_REMOVED, /* This CSS is dead */ 89 CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */ 90}; 91 92/* Caller must verify that the css is not for root cgroup */ 93static inline void __css_get(struct cgroup_subsys_state *css, int count) 94{ 95 atomic_add(count, &css->refcnt); 96} 97 98/* 99 * Call css_get() to hold a reference on the css; it can be used 100 * for a reference obtained via: 101 * - an existing ref-counted reference to the css 102 * - task->cgroups for a locked task 103 */ 104 105static inline void css_get(struct cgroup_subsys_state *css) 106{ 107 /* We don't need to reference count the root state */ 108 if (!test_bit(CSS_ROOT, &css->flags)) 109 __css_get(css, 1); 110} 111 112static inline bool css_is_removed(struct cgroup_subsys_state *css) 113{ 114 return test_bit(CSS_REMOVED, &css->flags); 115} 116 117/* 118 * Call css_tryget() to take a reference on a css if your existing 119 * (known-valid) reference isn't already ref-counted. Returns false if 120 * the css has been destroyed. 121 */ 122 123extern bool __css_tryget(struct cgroup_subsys_state *css); 124static inline bool css_tryget(struct cgroup_subsys_state *css) 125{ 126 if (test_bit(CSS_ROOT, &css->flags)) 127 return true; 128 return __css_tryget(css); 129} 130 131/* 132 * css_put() should be called to release a reference taken by 133 * css_get() or css_tryget() 134 */ 135 136extern void __css_put(struct cgroup_subsys_state *css); 137static inline void css_put(struct cgroup_subsys_state *css) 138{ 139 if (!test_bit(CSS_ROOT, &css->flags)) 140 __css_put(css); 141} 142 143/* bits in struct cgroup flags field */ 144enum { 145 /* Control Group is dead */ 146 CGRP_REMOVED, 147 /* 148 * Control Group has previously had a child cgroup or a task, 149 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 150 */ 151 CGRP_RELEASABLE, 152 /* Control Group requires release notifications to userspace */ 153 CGRP_NOTIFY_ON_RELEASE, 154 /* 155 * A thread in rmdir() is wating for this cgroup. 156 */ 157 CGRP_WAIT_ON_RMDIR, 158 /* 159 * Clone cgroup values when creating a new child cgroup 160 */ 161 CGRP_CLONE_CHILDREN, 162}; 163 164struct cgroup { 165 unsigned long flags; /* "unsigned long" so bitops work */ 166 167 /* 168 * count users of this cgroup. >0 means busy, but doesn't 169 * necessarily indicate the number of tasks in the cgroup 170 */ 171 atomic_t count; 172 173 /* 174 * We link our 'sibling' struct into our parent's 'children'. 175 * Our children link their 'sibling' into our 'children'. 176 */ 177 struct list_head sibling; /* my parent's children */ 178 struct list_head children; /* my children */ 179 struct list_head files; /* my files */ 180 181 struct cgroup *parent; /* my parent */ 182 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ 183 184 /* Private pointers for each registered subsystem */ 185 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 186 187 struct cgroupfs_root *root; 188 struct cgroup *top_cgroup; 189 190 /* 191 * List of cg_cgroup_links pointing at css_sets with 192 * tasks in this cgroup. Protected by css_set_lock 193 */ 194 struct list_head css_sets; 195 196 struct list_head allcg_node; /* cgroupfs_root->allcg_list */ 197 struct list_head cft_q_node; /* used during cftype add/rm */ 198 199 /* 200 * Linked list running through all cgroups that can 201 * potentially be reaped by the release agent. Protected by 202 * release_list_lock 203 */ 204 struct list_head release_list; 205 206 /* 207 * list of pidlists, up to two for each namespace (one for procs, one 208 * for tasks); created on demand. 209 */ 210 struct list_head pidlists; 211 struct mutex pidlist_mutex; 212 213 /* For RCU-protected deletion */ 214 struct rcu_head rcu_head; 215 216 /* List of events which userspace want to receive */ 217 struct list_head event_list; 218 spinlock_t event_list_lock; 219}; 220 221/* 222 * A css_set is a structure holding pointers to a set of 223 * cgroup_subsys_state objects. This saves space in the task struct 224 * object and speeds up fork()/exit(), since a single inc/dec and a 225 * list_add()/del() can bump the reference count on the entire cgroup 226 * set for a task. 227 */ 228 229struct css_set { 230 231 /* Reference count */ 232 atomic_t refcount; 233 234 /* 235 * List running through all cgroup groups in the same hash 236 * slot. Protected by css_set_lock 237 */ 238 struct hlist_node hlist; 239 240 /* 241 * List running through all tasks using this cgroup 242 * group. Protected by css_set_lock 243 */ 244 struct list_head tasks; 245 246 /* 247 * List of cg_cgroup_link objects on link chains from 248 * cgroups referenced from this css_set. Protected by 249 * css_set_lock 250 */ 251 struct list_head cg_links; 252 253 /* 254 * Set of subsystem states, one for each subsystem. This array 255 * is immutable after creation apart from the init_css_set 256 * during subsystem registration (at boot time) and modular subsystem 257 * loading/unloading. 258 */ 259 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 260 261 /* For RCU-protected deletion */ 262 struct rcu_head rcu_head; 263}; 264 265/* 266 * cgroup_map_cb is an abstract callback API for reporting map-valued 267 * control files 268 */ 269 270struct cgroup_map_cb { 271 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); 272 void *state; 273}; 274 275/* 276 * struct cftype: handler definitions for cgroup control files 277 * 278 * When reading/writing to a file: 279 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 280 * - the 'cftype' of the file is file->f_dentry->d_fsdata 281 */ 282 283/* cftype->flags */ 284#define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */ 285#define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create onp root cg */ 286 287#define MAX_CFTYPE_NAME 64 288 289struct cftype { 290 /* 291 * By convention, the name should begin with the name of the 292 * subsystem, followed by a period. Zero length string indicates 293 * end of cftype array. 294 */ 295 char name[MAX_CFTYPE_NAME]; 296 int private; 297 /* 298 * If not 0, file mode is set to this value, otherwise it will 299 * be figured out automatically 300 */ 301 umode_t mode; 302 303 /* 304 * If non-zero, defines the maximum length of string that can 305 * be passed to write_string; defaults to 64 306 */ 307 size_t max_write_len; 308 309 /* CFTYPE_* flags */ 310 unsigned int flags; 311 312 int (*open)(struct inode *inode, struct file *file); 313 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 314 struct file *file, 315 char __user *buf, size_t nbytes, loff_t *ppos); 316 /* 317 * read_u64() is a shortcut for the common case of returning a 318 * single integer. Use it in place of read() 319 */ 320 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); 321 /* 322 * read_s64() is a signed version of read_u64() 323 */ 324 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); 325 /* 326 * read_map() is used for defining a map of key/value 327 * pairs. It should call cb->fill(cb, key, value) for each 328 * entry. The key/value pairs (and their ordering) should not 329 * change between reboots. 330 */ 331 int (*read_map)(struct cgroup *cont, struct cftype *cft, 332 struct cgroup_map_cb *cb); 333 /* 334 * read_seq_string() is used for outputting a simple sequence 335 * using seqfile. 336 */ 337 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, 338 struct seq_file *m); 339 340 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, 341 struct file *file, 342 const char __user *buf, size_t nbytes, loff_t *ppos); 343 344 /* 345 * write_u64() is a shortcut for the common case of accepting 346 * a single integer (as parsed by simple_strtoull) from 347 * userspace. Use in place of write(); return 0 or error. 348 */ 349 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); 350 /* 351 * write_s64() is a signed version of write_u64() 352 */ 353 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); 354 355 /* 356 * write_string() is passed a nul-terminated kernelspace 357 * buffer of maximum length determined by max_write_len. 358 * Returns 0 or -ve error code. 359 */ 360 int (*write_string)(struct cgroup *cgrp, struct cftype *cft, 361 const char *buffer); 362 /* 363 * trigger() callback can be used to get some kick from the 364 * userspace, when the actual string written is not important 365 * at all. The private field can be used to determine the 366 * kick type for multiplexing. 367 */ 368 int (*trigger)(struct cgroup *cgrp, unsigned int event); 369 370 int (*release)(struct inode *inode, struct file *file); 371 372 /* 373 * register_event() callback will be used to add new userspace 374 * waiter for changes related to the cftype. Implement it if 375 * you want to provide this functionality. Use eventfd_signal() 376 * on eventfd to send notification to userspace. 377 */ 378 int (*register_event)(struct cgroup *cgrp, struct cftype *cft, 379 struct eventfd_ctx *eventfd, const char *args); 380 /* 381 * unregister_event() callback will be called when userspace 382 * closes the eventfd or on cgroup removing. 383 * This callback must be implemented, if you want provide 384 * notification functionality. 385 */ 386 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, 387 struct eventfd_ctx *eventfd); 388}; 389 390/* 391 * cftype_sets describe cftypes belonging to a subsystem and are chained at 392 * cgroup_subsys->cftsets. Each cftset points to an array of cftypes 393 * terminated by zero length name. 394 */ 395struct cftype_set { 396 struct list_head node; /* chained at subsys->cftsets */ 397 const struct cftype *cfts; 398}; 399 400struct cgroup_scanner { 401 struct cgroup *cg; 402 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); 403 void (*process_task)(struct task_struct *p, 404 struct cgroup_scanner *scan); 405 struct ptr_heap *heap; 406 void *data; 407}; 408 409int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts); 410int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts); 411 412int cgroup_is_removed(const struct cgroup *cgrp); 413 414int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); 415 416int cgroup_task_count(const struct cgroup *cgrp); 417 418/* Return true if cgrp is a descendant of the task's cgroup */ 419int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 420 421/* 422 * When the subsys has to access css and may add permanent refcnt to css, 423 * it should take care of racy conditions with rmdir(). Following set of 424 * functions, is for stop/restart rmdir if necessary. 425 * Because these will call css_get/put, "css" should be alive css. 426 * 427 * cgroup_exclude_rmdir(); 428 * ...do some jobs which may access arbitrary empty cgroup 429 * cgroup_release_and_wakeup_rmdir(); 430 * 431 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, 432 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. 433 */ 434 435void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); 436void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); 437 438/* 439 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 440 * methods. 441 */ 442struct cgroup_taskset; 443struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 444struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 445struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset); 446int cgroup_taskset_size(struct cgroup_taskset *tset); 447 448/** 449 * cgroup_taskset_for_each - iterate cgroup_taskset 450 * @task: the loop cursor 451 * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all 452 * @tset: taskset to iterate 453 */ 454#define cgroup_taskset_for_each(task, skip_cgrp, tset) \ 455 for ((task) = cgroup_taskset_first((tset)); (task); \ 456 (task) = cgroup_taskset_next((tset))) \ 457 if (!(skip_cgrp) || \ 458 cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp)) 459 460/* 461 * Control Group subsystem type. 462 * See Documentation/cgroups/cgroups.txt for details 463 */ 464 465struct cgroup_subsys { 466 struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); 467 int (*pre_destroy)(struct cgroup *cgrp); 468 void (*destroy)(struct cgroup *cgrp); 469 int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 470 void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 471 void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 472 void (*fork)(struct task_struct *task); 473 void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, 474 struct task_struct *task); 475 void (*post_clone)(struct cgroup *cgrp); 476 void (*bind)(struct cgroup *root); 477 478 int subsys_id; 479 int active; 480 int disabled; 481 int early_init; 482 /* 483 * True if this subsys uses ID. ID is not available before cgroup_init() 484 * (not available in early_init time.) 485 */ 486 bool use_id; 487 488 /* 489 * If %true, cgroup removal will try to clear css refs by retrying 490 * ss->pre_destroy() until there's no css ref left. This behavior 491 * is strictly for backward compatibility and will be removed as 492 * soon as the current user (memcg) is updated. 493 * 494 * If %false, ss->pre_destroy() can't fail and cgroup removal won't 495 * wait for css refs to drop to zero before proceeding. 496 */ 497 bool __DEPRECATED_clear_css_refs; 498 499#define MAX_CGROUP_TYPE_NAMELEN 32 500 const char *name; 501 502 /* 503 * Link to parent, and list entry in parent's children. 504 * Protected by cgroup_lock() 505 */ 506 struct cgroupfs_root *root; 507 struct list_head sibling; 508 /* used when use_id == true */ 509 struct idr idr; 510 spinlock_t id_lock; 511 512 /* list of cftype_sets */ 513 struct list_head cftsets; 514 515 /* base cftypes, automatically [de]registered with subsys itself */ 516 struct cftype *base_cftypes; 517 struct cftype_set base_cftset; 518 519 /* should be defined only by modular subsystems */ 520 struct module *module; 521}; 522 523#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 524#include <linux/cgroup_subsys.h> 525#undef SUBSYS 526 527static inline struct cgroup_subsys_state *cgroup_subsys_state( 528 struct cgroup *cgrp, int subsys_id) 529{ 530 return cgrp->subsys[subsys_id]; 531} 532 533/* 534 * function to get the cgroup_subsys_state which allows for extra 535 * rcu_dereference_check() conditions, such as locks used during the 536 * cgroup_subsys::attach() methods. 537 */ 538#define task_subsys_state_check(task, subsys_id, __c) \ 539 rcu_dereference_check(task->cgroups->subsys[subsys_id], \ 540 lockdep_is_held(&task->alloc_lock) || \ 541 cgroup_lock_is_held() || (__c)) 542 543static inline struct cgroup_subsys_state * 544task_subsys_state(struct task_struct *task, int subsys_id) 545{ 546 return task_subsys_state_check(task, subsys_id, false); 547} 548 549static inline struct cgroup* task_cgroup(struct task_struct *task, 550 int subsys_id) 551{ 552 return task_subsys_state(task, subsys_id)->cgroup; 553} 554 555/* A cgroup_iter should be treated as an opaque object */ 556struct cgroup_iter { 557 struct list_head *cg_link; 558 struct list_head *task; 559}; 560 561/* 562 * To iterate across the tasks in a cgroup: 563 * 564 * 1) call cgroup_iter_start to initialize an iterator 565 * 566 * 2) call cgroup_iter_next() to retrieve member tasks until it 567 * returns NULL or until you want to end the iteration 568 * 569 * 3) call cgroup_iter_end() to destroy the iterator. 570 * 571 * Or, call cgroup_scan_tasks() to iterate through every task in a 572 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling 573 * the test_task() callback, but not while calling the process_task() 574 * callback. 575 */ 576void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); 577struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 578 struct cgroup_iter *it); 579void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); 580int cgroup_scan_tasks(struct cgroup_scanner *scan); 581int cgroup_attach_task(struct cgroup *, struct task_struct *); 582int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 583 584/* 585 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works 586 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. 587 * CSS ID is assigned at cgroup allocation (create) automatically 588 * and removed when subsys calls free_css_id() function. This is because 589 * the lifetime of cgroup_subsys_state is subsys's matter. 590 * 591 * Looking up and scanning function should be called under rcu_read_lock(). 592 * Taking cgroup_mutex is not necessary for following calls. 593 * But the css returned by this routine can be "not populated yet" or "being 594 * destroyed". The caller should check css and cgroup's status. 595 */ 596 597/* 598 * Typically Called at ->destroy(), or somewhere the subsys frees 599 * cgroup_subsys_state. 600 */ 601void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); 602 603/* Find a cgroup_subsys_state which has given ID */ 604 605struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); 606 607/* 608 * Get a cgroup whose id is greater than or equal to id under tree of root. 609 * Returning a cgroup_subsys_state or NULL. 610 */ 611struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, 612 struct cgroup_subsys_state *root, int *foundid); 613 614/* Returns true if root is ancestor of cg */ 615bool css_is_ancestor(struct cgroup_subsys_state *cg, 616 const struct cgroup_subsys_state *root); 617 618/* Get id and depth of css */ 619unsigned short css_id(struct cgroup_subsys_state *css); 620unsigned short css_depth(struct cgroup_subsys_state *css); 621struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); 622 623#else /* !CONFIG_CGROUPS */ 624 625static inline int cgroup_init_early(void) { return 0; } 626static inline int cgroup_init(void) { return 0; } 627static inline void cgroup_fork(struct task_struct *p) {} 628static inline void cgroup_fork_callbacks(struct task_struct *p) {} 629static inline void cgroup_post_fork(struct task_struct *p) {} 630static inline void cgroup_exit(struct task_struct *p, int callbacks) {} 631 632static inline void cgroup_lock(void) {} 633static inline void cgroup_unlock(void) {} 634static inline int cgroupstats_build(struct cgroupstats *stats, 635 struct dentry *dentry) 636{ 637 return -EINVAL; 638} 639 640/* No cgroups - nothing to do */ 641static inline int cgroup_attach_task_all(struct task_struct *from, 642 struct task_struct *t) 643{ 644 return 0; 645} 646 647#endif /* !CONFIG_CGROUPS */ 648 649#endif /* _LINUX_CGROUP_H */