/* at v2.6.38 (20 kB, raw view) */
1#ifndef _LINUX_CGROUP_H 2#define _LINUX_CGROUP_H 3/* 4 * cgroup interface 5 * 6 * Copyright (C) 2003 BULL SA 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 8 * 9 */ 10 11#include <linux/sched.h> 12#include <linux/cpumask.h> 13#include <linux/nodemask.h> 14#include <linux/rcupdate.h> 15#include <linux/cgroupstats.h> 16#include <linux/prio_heap.h> 17#include <linux/rwsem.h> 18#include <linux/idr.h> 19 20#ifdef CONFIG_CGROUPS 21 22struct cgroupfs_root; 23struct cgroup_subsys; 24struct inode; 25struct cgroup; 26struct css_id; 27 28extern int cgroup_init_early(void); 29extern int cgroup_init(void); 30extern void cgroup_lock(void); 31extern int cgroup_lock_is_held(void); 32extern bool cgroup_lock_live_group(struct cgroup *cgrp); 33extern void cgroup_unlock(void); 34extern void cgroup_fork(struct task_struct *p); 35extern void cgroup_fork_callbacks(struct task_struct *p); 36extern void cgroup_post_fork(struct task_struct *p); 37extern void cgroup_exit(struct task_struct *p, int run_callbacks); 38extern int cgroupstats_build(struct cgroupstats *stats, 39 struct dentry *dentry); 40extern int cgroup_load_subsys(struct cgroup_subsys *ss); 41extern void cgroup_unload_subsys(struct cgroup_subsys *ss); 42 43extern const struct file_operations proc_cgroup_operations; 44 45/* Define the enumeration of all builtin cgroup subsystems */ 46#define SUBSYS(_x) _x ## _subsys_id, 47enum cgroup_subsys_id { 48#include <linux/cgroup_subsys.h> 49 CGROUP_BUILTIN_SUBSYS_COUNT 50}; 51#undef SUBSYS 52/* 53 * This define indicates the maximum number of subsystems that can be loaded 54 * at once. We limit to this many since cgroupfs_root has subsys_bits to keep 55 * track of all of them. 56 */ 57#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long)) 58 59/* Per-subsystem/per-cgroup state maintained by the system. */ 60struct cgroup_subsys_state { 61 /* 62 * The cgroup that this subsystem is attached to. 
Useful 63 * for subsystems that want to know about the cgroup 64 * hierarchy structure 65 */ 66 struct cgroup *cgroup; 67 68 /* 69 * State maintained by the cgroup system to allow subsystems 70 * to be "busy". Should be accessed via css_get(), 71 * css_tryget() and and css_put(). 72 */ 73 74 atomic_t refcnt; 75 76 unsigned long flags; 77 /* ID for this css, if possible */ 78 struct css_id __rcu *id; 79}; 80 81/* bits in struct cgroup_subsys_state flags field */ 82enum { 83 CSS_ROOT, /* This CSS is the root of the subsystem */ 84 CSS_REMOVED, /* This CSS is dead */ 85}; 86 87/* Caller must verify that the css is not for root cgroup */ 88static inline void __css_get(struct cgroup_subsys_state *css, int count) 89{ 90 atomic_add(count, &css->refcnt); 91} 92 93/* 94 * Call css_get() to hold a reference on the css; it can be used 95 * for a reference obtained via: 96 * - an existing ref-counted reference to the css 97 * - task->cgroups for a locked task 98 */ 99 100static inline void css_get(struct cgroup_subsys_state *css) 101{ 102 /* We don't need to reference count the root state */ 103 if (!test_bit(CSS_ROOT, &css->flags)) 104 __css_get(css, 1); 105} 106 107static inline bool css_is_removed(struct cgroup_subsys_state *css) 108{ 109 return test_bit(CSS_REMOVED, &css->flags); 110} 111 112/* 113 * Call css_tryget() to take a reference on a css if your existing 114 * (known-valid) reference isn't already ref-counted. Returns false if 115 * the css has been destroyed. 
116 */ 117 118static inline bool css_tryget(struct cgroup_subsys_state *css) 119{ 120 if (test_bit(CSS_ROOT, &css->flags)) 121 return true; 122 while (!atomic_inc_not_zero(&css->refcnt)) { 123 if (test_bit(CSS_REMOVED, &css->flags)) 124 return false; 125 cpu_relax(); 126 } 127 return true; 128} 129 130/* 131 * css_put() should be called to release a reference taken by 132 * css_get() or css_tryget() 133 */ 134 135extern void __css_put(struct cgroup_subsys_state *css, int count); 136static inline void css_put(struct cgroup_subsys_state *css) 137{ 138 if (!test_bit(CSS_ROOT, &css->flags)) 139 __css_put(css, 1); 140} 141 142/* bits in struct cgroup flags field */ 143enum { 144 /* Control Group is dead */ 145 CGRP_REMOVED, 146 /* 147 * Control Group has previously had a child cgroup or a task, 148 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 149 */ 150 CGRP_RELEASABLE, 151 /* Control Group requires release notifications to userspace */ 152 CGRP_NOTIFY_ON_RELEASE, 153 /* 154 * A thread in rmdir() is wating for this cgroup. 155 */ 156 CGRP_WAIT_ON_RMDIR, 157 /* 158 * Clone cgroup values when creating a new child cgroup 159 */ 160 CGRP_CLONE_CHILDREN, 161}; 162 163/* which pidlist file are we talking about? */ 164enum cgroup_filetype { 165 CGROUP_FILE_PROCS, 166 CGROUP_FILE_TASKS, 167}; 168 169/* 170 * A pidlist is a list of pids that virtually represents the contents of one 171 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 172 * a pair (one each for procs, tasks) for each pid namespace that's relevant 173 * to the cgroup. 174 */ 175struct cgroup_pidlist { 176 /* 177 * used to find which pidlist is wanted. doesn't change as long as 178 * this particular list stays in the list. 
179 */ 180 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 181 /* array of xids */ 182 pid_t *list; 183 /* how many elements the above list has */ 184 int length; 185 /* how many files are using the current array */ 186 int use_count; 187 /* each of these stored in a list by its cgroup */ 188 struct list_head links; 189 /* pointer to the cgroup we belong to, for list removal purposes */ 190 struct cgroup *owner; 191 /* protects the other fields */ 192 struct rw_semaphore mutex; 193}; 194 195struct cgroup { 196 unsigned long flags; /* "unsigned long" so bitops work */ 197 198 /* 199 * count users of this cgroup. >0 means busy, but doesn't 200 * necessarily indicate the number of tasks in the cgroup 201 */ 202 atomic_t count; 203 204 /* 205 * We link our 'sibling' struct into our parent's 'children'. 206 * Our children link their 'sibling' into our 'children'. 207 */ 208 struct list_head sibling; /* my parent's children */ 209 struct list_head children; /* my children */ 210 211 struct cgroup *parent; /* my parent */ 212 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ 213 214 /* Private pointers for each registered subsystem */ 215 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 216 217 struct cgroupfs_root *root; 218 struct cgroup *top_cgroup; 219 220 /* 221 * List of cg_cgroup_links pointing at css_sets with 222 * tasks in this cgroup. Protected by css_set_lock 223 */ 224 struct list_head css_sets; 225 226 /* 227 * Linked list running through all cgroups that can 228 * potentially be reaped by the release agent. Protected by 229 * release_list_lock 230 */ 231 struct list_head release_list; 232 233 /* 234 * list of pidlists, up to two for each namespace (one for procs, one 235 * for tasks); created on demand. 
236 */ 237 struct list_head pidlists; 238 struct mutex pidlist_mutex; 239 240 /* For RCU-protected deletion */ 241 struct rcu_head rcu_head; 242 243 /* List of events which userspace want to recieve */ 244 struct list_head event_list; 245 spinlock_t event_list_lock; 246}; 247 248/* 249 * A css_set is a structure holding pointers to a set of 250 * cgroup_subsys_state objects. This saves space in the task struct 251 * object and speeds up fork()/exit(), since a single inc/dec and a 252 * list_add()/del() can bump the reference count on the entire cgroup 253 * set for a task. 254 */ 255 256struct css_set { 257 258 /* Reference count */ 259 atomic_t refcount; 260 261 /* 262 * List running through all cgroup groups in the same hash 263 * slot. Protected by css_set_lock 264 */ 265 struct hlist_node hlist; 266 267 /* 268 * List running through all tasks using this cgroup 269 * group. Protected by css_set_lock 270 */ 271 struct list_head tasks; 272 273 /* 274 * List of cg_cgroup_link objects on link chains from 275 * cgroups referenced from this css_set. Protected by 276 * css_set_lock 277 */ 278 struct list_head cg_links; 279 280 /* 281 * Set of subsystem states, one for each subsystem. This array 282 * is immutable after creation apart from the init_css_set 283 * during subsystem registration (at boot time) and modular subsystem 284 * loading/unloading. 
285 */ 286 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 287 288 /* For RCU-protected deletion */ 289 struct rcu_head rcu_head; 290}; 291 292/* 293 * cgroup_map_cb is an abstract callback API for reporting map-valued 294 * control files 295 */ 296 297struct cgroup_map_cb { 298 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); 299 void *state; 300}; 301 302/* 303 * struct cftype: handler definitions for cgroup control files 304 * 305 * When reading/writing to a file: 306 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 307 * - the 'cftype' of the file is file->f_dentry->d_fsdata 308 */ 309 310#define MAX_CFTYPE_NAME 64 311struct cftype { 312 /* 313 * By convention, the name should begin with the name of the 314 * subsystem, followed by a period 315 */ 316 char name[MAX_CFTYPE_NAME]; 317 int private; 318 /* 319 * If not 0, file mode is set to this value, otherwise it will 320 * be figured out automatically 321 */ 322 mode_t mode; 323 324 /* 325 * If non-zero, defines the maximum length of string that can 326 * be passed to write_string; defaults to 64 327 */ 328 size_t max_write_len; 329 330 int (*open)(struct inode *inode, struct file *file); 331 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 332 struct file *file, 333 char __user *buf, size_t nbytes, loff_t *ppos); 334 /* 335 * read_u64() is a shortcut for the common case of returning a 336 * single integer. Use it in place of read() 337 */ 338 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); 339 /* 340 * read_s64() is a signed version of read_u64() 341 */ 342 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); 343 /* 344 * read_map() is used for defining a map of key/value 345 * pairs. It should call cb->fill(cb, key, value) for each 346 * entry. The key/value pairs (and their ordering) should not 347 * change between reboots. 
348 */ 349 int (*read_map)(struct cgroup *cont, struct cftype *cft, 350 struct cgroup_map_cb *cb); 351 /* 352 * read_seq_string() is used for outputting a simple sequence 353 * using seqfile. 354 */ 355 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, 356 struct seq_file *m); 357 358 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, 359 struct file *file, 360 const char __user *buf, size_t nbytes, loff_t *ppos); 361 362 /* 363 * write_u64() is a shortcut for the common case of accepting 364 * a single integer (as parsed by simple_strtoull) from 365 * userspace. Use in place of write(); return 0 or error. 366 */ 367 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); 368 /* 369 * write_s64() is a signed version of write_u64() 370 */ 371 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); 372 373 /* 374 * write_string() is passed a nul-terminated kernelspace 375 * buffer of maximum length determined by max_write_len. 376 * Returns 0 or -ve error code. 377 */ 378 int (*write_string)(struct cgroup *cgrp, struct cftype *cft, 379 const char *buffer); 380 /* 381 * trigger() callback can be used to get some kick from the 382 * userspace, when the actual string written is not important 383 * at all. The private field can be used to determine the 384 * kick type for multiplexing. 385 */ 386 int (*trigger)(struct cgroup *cgrp, unsigned int event); 387 388 int (*release)(struct inode *inode, struct file *file); 389 390 /* 391 * register_event() callback will be used to add new userspace 392 * waiter for changes related to the cftype. Implement it if 393 * you want to provide this functionality. Use eventfd_signal() 394 * on eventfd to send notification to userspace. 395 */ 396 int (*register_event)(struct cgroup *cgrp, struct cftype *cft, 397 struct eventfd_ctx *eventfd, const char *args); 398 /* 399 * unregister_event() callback will be called when userspace 400 * closes the eventfd or on cgroup removing. 
401 * This callback must be implemented, if you want provide 402 * notification functionality. 403 */ 404 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, 405 struct eventfd_ctx *eventfd); 406}; 407 408struct cgroup_scanner { 409 struct cgroup *cg; 410 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); 411 void (*process_task)(struct task_struct *p, 412 struct cgroup_scanner *scan); 413 struct ptr_heap *heap; 414 void *data; 415}; 416 417/* 418 * Add a new file to the given cgroup directory. Should only be 419 * called by subsystems from within a populate() method 420 */ 421int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 422 const struct cftype *cft); 423 424/* 425 * Add a set of new files to the given cgroup directory. Should 426 * only be called by subsystems from within a populate() method 427 */ 428int cgroup_add_files(struct cgroup *cgrp, 429 struct cgroup_subsys *subsys, 430 const struct cftype cft[], 431 int count); 432 433int cgroup_is_removed(const struct cgroup *cgrp); 434 435int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); 436 437int cgroup_task_count(const struct cgroup *cgrp); 438 439/* Return true if cgrp is a descendant of the task's cgroup */ 440int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 441 442/* 443 * When the subsys has to access css and may add permanent refcnt to css, 444 * it should take care of racy conditions with rmdir(). Following set of 445 * functions, is for stop/restart rmdir if necessary. 446 * Because these will call css_get/put, "css" should be alive css. 447 * 448 * cgroup_exclude_rmdir(); 449 * ...do some jobs which may access arbitrary empty cgroup 450 * cgroup_release_and_wakeup_rmdir(); 451 * 452 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, 453 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. 
454 */ 455 456void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); 457void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); 458 459/* 460 * Control Group subsystem type. 461 * See Documentation/cgroups/cgroups.txt for details 462 */ 463 464struct cgroup_subsys { 465 struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, 466 struct cgroup *cgrp); 467 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 468 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 469 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 470 struct task_struct *tsk, bool threadgroup); 471 void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 472 struct task_struct *tsk, bool threadgroup); 473 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 474 struct cgroup *old_cgrp, struct task_struct *tsk, 475 bool threadgroup); 476 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); 477 void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); 478 int (*populate)(struct cgroup_subsys *ss, 479 struct cgroup *cgrp); 480 void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); 481 void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); 482 483 int subsys_id; 484 int active; 485 int disabled; 486 int early_init; 487 /* 488 * True if this subsys uses ID. ID is not available before cgroup_init() 489 * (not available in early_init time.) 490 */ 491 bool use_id; 492#define MAX_CGROUP_TYPE_NAMELEN 32 493 const char *name; 494 495 /* 496 * Protects sibling/children links of cgroups in this 497 * hierarchy, plus protects which hierarchy (or none) the 498 * subsystem is a part of (i.e. root/sibling). 
To avoid 499 * potential deadlocks, the following operations should not be 500 * undertaken while holding any hierarchy_mutex: 501 * 502 * - allocating memory 503 * - initiating hotplug events 504 */ 505 struct mutex hierarchy_mutex; 506 struct lock_class_key subsys_key; 507 508 /* 509 * Link to parent, and list entry in parent's children. 510 * Protected by this->hierarchy_mutex and cgroup_lock() 511 */ 512 struct cgroupfs_root *root; 513 struct list_head sibling; 514 /* used when use_id == true */ 515 struct idr idr; 516 spinlock_t id_lock; 517 518 /* should be defined only by modular subsystems */ 519 struct module *module; 520}; 521 522#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 523#include <linux/cgroup_subsys.h> 524#undef SUBSYS 525 526static inline struct cgroup_subsys_state *cgroup_subsys_state( 527 struct cgroup *cgrp, int subsys_id) 528{ 529 return cgrp->subsys[subsys_id]; 530} 531 532/* 533 * function to get the cgroup_subsys_state which allows for extra 534 * rcu_dereference_check() conditions, such as locks used during the 535 * cgroup_subsys::attach() methods. 
536 */ 537#define task_subsys_state_check(task, subsys_id, __c) \ 538 rcu_dereference_check(task->cgroups->subsys[subsys_id], \ 539 rcu_read_lock_held() || \ 540 lockdep_is_held(&task->alloc_lock) || \ 541 cgroup_lock_is_held() || (__c)) 542 543static inline struct cgroup_subsys_state * 544task_subsys_state(struct task_struct *task, int subsys_id) 545{ 546 return task_subsys_state_check(task, subsys_id, false); 547} 548 549static inline struct cgroup* task_cgroup(struct task_struct *task, 550 int subsys_id) 551{ 552 return task_subsys_state(task, subsys_id)->cgroup; 553} 554 555int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss, 556 char *nodename); 557 558/* A cgroup_iter should be treated as an opaque object */ 559struct cgroup_iter { 560 struct list_head *cg_link; 561 struct list_head *task; 562}; 563 564/* 565 * To iterate across the tasks in a cgroup: 566 * 567 * 1) call cgroup_iter_start to initialize an iterator 568 * 569 * 2) call cgroup_iter_next() to retrieve member tasks until it 570 * returns NULL or until you want to end the iteration 571 * 572 * 3) call cgroup_iter_end() to destroy the iterator. 573 * 574 * Or, call cgroup_scan_tasks() to iterate through every task in a 575 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling 576 * the test_task() callback, but not while calling the process_task() 577 * callback. 
578 */ 579void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); 580struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 581 struct cgroup_iter *it); 582void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); 583int cgroup_scan_tasks(struct cgroup_scanner *scan); 584int cgroup_attach_task(struct cgroup *, struct task_struct *); 585int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 586 587static inline int cgroup_attach_task_current_cg(struct task_struct *tsk) 588{ 589 return cgroup_attach_task_all(current, tsk); 590} 591 592/* 593 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works 594 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. 595 * CSS ID is assigned at cgroup allocation (create) automatically 596 * and removed when subsys calls free_css_id() function. This is because 597 * the lifetime of cgroup_subsys_state is subsys's matter. 598 * 599 * Looking up and scanning function should be called under rcu_read_lock(). 600 * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. 601 * But the css returned by this routine can be "not populated yet" or "being 602 * destroyed". The caller should check css and cgroup's status. 603 */ 604 605/* 606 * Typically Called at ->destroy(), or somewhere the subsys frees 607 * cgroup_subsys_state. 608 */ 609void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); 610 611/* Find a cgroup_subsys_state which has given ID */ 612 613struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); 614 615/* 616 * Get a cgroup whose id is greater than or equal to id under tree of root. 617 * Returning a cgroup_subsys_state or NULL. 
618 */ 619struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, 620 struct cgroup_subsys_state *root, int *foundid); 621 622/* Returns true if root is ancestor of cg */ 623bool css_is_ancestor(struct cgroup_subsys_state *cg, 624 const struct cgroup_subsys_state *root); 625 626/* Get id and depth of css */ 627unsigned short css_id(struct cgroup_subsys_state *css); 628unsigned short css_depth(struct cgroup_subsys_state *css); 629 630#else /* !CONFIG_CGROUPS */ 631 632static inline int cgroup_init_early(void) { return 0; } 633static inline int cgroup_init(void) { return 0; } 634static inline void cgroup_fork(struct task_struct *p) {} 635static inline void cgroup_fork_callbacks(struct task_struct *p) {} 636static inline void cgroup_post_fork(struct task_struct *p) {} 637static inline void cgroup_exit(struct task_struct *p, int callbacks) {} 638 639static inline void cgroup_lock(void) {} 640static inline void cgroup_unlock(void) {} 641static inline int cgroupstats_build(struct cgroupstats *stats, 642 struct dentry *dentry) 643{ 644 return -EINVAL; 645} 646 647/* No cgroups - nothing to do */ 648static inline int cgroup_attach_task_all(struct task_struct *from, 649 struct task_struct *t) 650{ 651 return 0; 652} 653static inline int cgroup_attach_task_current_cg(struct task_struct *t) 654{ 655 return 0; 656} 657 658#endif /* !CONFIG_CGROUPS */ 659 660#endif /* _LINUX_CGROUP_H */