Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Add cgroupstats

This patch is inspired by the discussion at
http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics
as suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. The
patch is on top of 2.6.21-mm1 with Paul's cgroups v9 patches (forward
ported)

This patch implements the per cgroup statistics infrastructure and re-uses
code from the taskstats interface. A new set of cgroup operations is
registered with commands and attributes. It should be very easy to
*extend* per cgroup statistics by adding members to the cgroupstats
structure.

The current model for cgroupstats is a pull model; a push model (to post
statistics on interesting events) should be very easy to add. Currently
user space requests statistics by passing the cgroup file
descriptor. Statistics about the state of all the tasks in the cgroup
are returned to user space.

TODO's/NOTE:

This patch provides an infrastructure for implementing cgroup statistics.
Based on the needs of each controller, we can incrementally add more statistics,
event-based support for notification of statistics, and accumulation of taskstats
into cgroup statistics in the future.

Sample output

# ./cgroupstats -C /cgroup/a
sleeping 2, blocked 0, running 1, stopped 0, uninterruptible 0

# ./cgroupstats -C /cgroup/
sleeping 154, blocked 0, running 0, stopped 0, uninterruptible 0

If the approach looks good, I'll enhance and post the corresponding user space
utility.

Feedback, comments, test results are always welcome!

[akpm@linux-foundation.org: build fix]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Balbir Singh and committed by
Linus Torvalds
846c7bb0 c2e2c7fa

+241
+27
Documentation/accounting/cgroupstats.txt
··· 1 + Control Groupstats is inspired by the discussion at 2 + http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as 3 + suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. 4 + 5 + The per cgroup statistics infrastructure re-uses code from the taskstats 6 + interface. A new set of cgroup operations is registered with commands 7 + and attributes specific to cgroups. It should be very easy to 8 + extend per cgroup statistics by adding members to the cgroupstats 9 + structure. 10 + 11 + The current model for cgroupstats is a pull model; a push model (to post 12 + statistics on interesting events) should be very easy to add. Currently 13 + user space requests statistics by passing the cgroup path. 14 + Statistics about the state of all the tasks in the cgroup are returned to 15 + user space. 16 + 17 + NOTE: We currently rely on delay accounting for extracting information 18 + about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this 19 + information will not be available. 20 + 21 + To extract cgroup statistics, a utility very similar to getdelays.c 22 + has been developed; the sample output of the utility is shown below. 23 + 24 + ~/balbir/cgroupstats # ./getdelays -C "/cgroup/a" 25 + sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0 26 + ~/balbir/cgroupstats # ./getdelays -C "/cgroup" 27 + sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
+1
include/linux/Kbuild
··· 47 47 header-y += coff.h 48 48 header-y += comstats.h 49 49 header-y += const.h 50 + header-y += cgroupstats.h 50 51 header-y += cycx_cfm.h 51 52 header-y += dlm_device.h 52 53 header-y += dlm_netlink.h
+8
include/linux/cgroup.h
··· 13 13 #include <linux/cpumask.h> 14 14 #include <linux/nodemask.h> 15 15 #include <linux/rcupdate.h> 16 + #include <linux/cgroupstats.h> 16 17 17 18 #ifdef CONFIG_CGROUPS 18 19 ··· 30 29 extern void cgroup_fork_callbacks(struct task_struct *p); 31 30 extern void cgroup_post_fork(struct task_struct *p); 32 31 extern void cgroup_exit(struct task_struct *p, int run_callbacks); 32 + extern int cgroupstats_build(struct cgroupstats *stats, 33 + struct dentry *dentry); 33 34 34 35 extern struct file_operations proc_cgroup_operations; 35 36 ··· 316 313 317 314 static inline void cgroup_lock(void) {} 318 315 static inline void cgroup_unlock(void) {} 316 + static inline int cgroupstats_build(struct cgroupstats *stats, 317 + struct dentry *dentry) 318 + { 319 + return -EINVAL; 320 + } 319 321 320 322 #endif /* !CONFIG_CGROUPS */ 321 323
+70
include/linux/cgroupstats.h
··· 1 + /* cgroupstats.h - exporting per-cgroup statistics 2 + * 3 + * Copyright IBM Corporation, 2007 4 + * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of version 2.1 of the GNU Lesser General Public License 8 + * as published by the Free Software Foundation. 9 + * 10 + * This program is distributed in the hope that it would be useful, but 11 + * WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 + */ 14 + 15 + #ifndef _LINUX_CGROUPSTATS_H 16 + #define _LINUX_CGROUPSTATS_H 17 + 18 + #include <linux/taskstats.h> 19 + 20 + /* 21 + * Data shared between user space and kernel space on a per cgroup 22 + * basis. This data is shared using taskstats. 23 + * 24 + * Most of these states are derived by looking at the task->state value 25 + * For the nr_io_wait state, a flag in the delay accounting structure 26 + * indicates that the task is waiting on IO 27 + * 28 + * Each member is aligned to a 8 byte boundary. 29 + */ 30 + struct cgroupstats { 31 + __u64 nr_sleeping; /* Number of tasks sleeping */ 32 + __u64 nr_running; /* Number of tasks running */ 33 + __u64 nr_stopped; /* Number of tasks in stopped state */ 34 + __u64 nr_uninterruptible; /* Number of tasks in uninterruptible */ 35 + /* state */ 36 + __u64 nr_io_wait; /* Number of tasks waiting on IO */ 37 + }; 38 + 39 + /* 40 + * Commands sent from userspace 41 + * Not versioned. 
New commands should only be inserted at the enum's end 42 + * prior to __CGROUPSTATS_CMD_MAX 43 + */ 44 + 45 + enum { 46 + CGROUPSTATS_CMD_UNSPEC = __TASKSTATS_CMD_MAX, /* Reserved */ 47 + CGROUPSTATS_CMD_GET, /* user->kernel request/get-response */ 48 + CGROUPSTATS_CMD_NEW, /* kernel->user event */ 49 + __CGROUPSTATS_CMD_MAX, 50 + }; 51 + 52 + #define CGROUPSTATS_CMD_MAX (__CGROUPSTATS_CMD_MAX - 1) 53 + 54 + enum { 55 + CGROUPSTATS_TYPE_UNSPEC = 0, /* Reserved */ 56 + CGROUPSTATS_TYPE_CGROUP_STATS, /* contains name + stats */ 57 + __CGROUPSTATS_TYPE_MAX, 58 + }; 59 + 60 + #define CGROUPSTATS_TYPE_MAX (__CGROUPSTATS_TYPE_MAX - 1) 61 + 62 + enum { 63 + CGROUPSTATS_CMD_ATTR_UNSPEC = 0, 64 + CGROUPSTATS_CMD_ATTR_FD, 65 + __CGROUPSTATS_CMD_ATTR_MAX, 66 + }; 67 + 68 + #define CGROUPSTATS_CMD_ATTR_MAX (__CGROUPSTATS_CMD_ATTR_MAX - 1) 69 + 70 + #endif /* _LINUX_CGROUPSTATS_H */
+13
include/linux/delayacct.h
··· 26 26 * Used to set current->delays->flags 27 27 */ 28 28 #define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ 29 + #define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ 29 30 30 31 #ifdef CONFIG_TASK_DELAY_ACCT 31 32 ··· 39 38 extern void __delayacct_blkio_end(void); 40 39 extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); 41 40 extern __u64 __delayacct_blkio_ticks(struct task_struct *); 41 + 42 + static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) 43 + { 44 + if (p->delays) 45 + return (p->delays->flags & DELAYACCT_PF_BLKIO); 46 + else 47 + return 0; 48 + } 42 49 43 50 static inline void delayacct_set_flag(int flag) 44 51 { ··· 80 71 81 72 static inline void delayacct_blkio_start(void) 82 73 { 74 + delayacct_set_flag(DELAYACCT_PF_BLKIO); 83 75 if (current->delays) 84 76 __delayacct_blkio_start(); 85 77 } ··· 89 79 { 90 80 if (current->delays) 91 81 __delayacct_blkio_end(); 82 + delayacct_clear_flag(DELAYACCT_PF_BLKIO); 92 83 } 93 84 94 85 static inline int delayacct_add_tsk(struct taskstats *d, ··· 126 115 struct task_struct *tsk) 127 116 { return 0; } 128 117 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) 118 + { return 0; } 119 + static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) 129 120 { return 0; } 130 121 #endif /* CONFIG_TASK_DELAY_ACCT */ 131 122
+55
kernel/cgroup.c
··· 44 44 #include <linux/string.h> 45 45 #include <linux/sort.h> 46 46 #include <linux/kmod.h> 47 + #include <linux/delayacct.h> 48 + #include <linux/cgroupstats.h> 49 + 47 50 #include <asm/atomic.h> 48 51 49 52 static DEFINE_MUTEX(cgroup_mutex); ··· 1767 1764 } 1768 1765 cgroup_iter_end(cont, &it); 1769 1766 return n; 1767 + } 1768 + 1769 + /** 1770 + * Build and fill cgroupstats so that taskstats can export it to user 1771 + * space. 1772 + * 1773 + * @stats: cgroupstats to fill information into 1774 + * @dentry: A dentry entry belonging to the cgroup for which stats have 1775 + * been requested. 1776 + */ 1777 + int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 1778 + { 1779 + int ret = -EINVAL; 1780 + struct cgroup *cont; 1781 + struct cgroup_iter it; 1782 + struct task_struct *tsk; 1783 + /* 1784 + * Validate dentry by checking the superblock operations 1785 + */ 1786 + if (dentry->d_sb->s_op != &cgroup_ops) 1787 + goto err; 1788 + 1789 + ret = 0; 1790 + cont = dentry->d_fsdata; 1791 + rcu_read_lock(); 1792 + 1793 + cgroup_iter_start(cont, &it); 1794 + while ((tsk = cgroup_iter_next(cont, &it))) { 1795 + switch (tsk->state) { 1796 + case TASK_RUNNING: 1797 + stats->nr_running++; 1798 + break; 1799 + case TASK_INTERRUPTIBLE: 1800 + stats->nr_sleeping++; 1801 + break; 1802 + case TASK_UNINTERRUPTIBLE: 1803 + stats->nr_uninterruptible++; 1804 + break; 1805 + case TASK_STOPPED: 1806 + stats->nr_stopped++; 1807 + break; 1808 + default: 1809 + if (delayacct_is_task_waiting_on_io(tsk)) 1810 + stats->nr_io_wait++; 1811 + break; 1812 + } 1813 + } 1814 + cgroup_iter_end(cont, &it); 1815 + 1816 + rcu_read_unlock(); 1817 + err: 1818 + return ret; 1770 1819 } 1771 1820 1772 1821 static int cmppid(const void *a, const void *b)
+67
kernel/taskstats.c
··· 22 22 #include <linux/delayacct.h> 23 23 #include <linux/cpumask.h> 24 24 #include <linux/percpu.h> 25 + #include <linux/cgroupstats.h> 26 + #include <linux/cgroup.h> 27 + #include <linux/fs.h> 28 + #include <linux/file.h> 25 29 #include <net/genetlink.h> 26 30 #include <asm/atomic.h> 27 31 ··· 52 48 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 53 49 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 50 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 51 + 52 + static struct nla_policy 53 + cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { 54 + [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 55 + }; 55 56 56 57 struct listener { 57 58 struct list_head list; ··· 381 372 return NULL; 382 373 } 383 374 375 + static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 376 + { 377 + int rc = 0; 378 + struct sk_buff *rep_skb; 379 + struct cgroupstats *stats; 380 + struct nlattr *na; 381 + size_t size; 382 + u32 fd; 383 + struct file *file; 384 + int fput_needed; 385 + 386 + na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 387 + if (!na) 388 + return -EINVAL; 389 + 390 + fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 391 + file = fget_light(fd, &fput_needed); 392 + if (file) { 393 + size = nla_total_size(sizeof(struct cgroupstats)); 394 + 395 + rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 396 + size); 397 + if (rc < 0) 398 + goto err; 399 + 400 + na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 401 + sizeof(struct cgroupstats)); 402 + stats = nla_data(na); 403 + memset(stats, 0, sizeof(*stats)); 404 + 405 + rc = cgroupstats_build(stats, file->f_dentry); 406 + if (rc < 0) 407 + goto err; 408 + 409 + fput_light(file, fput_needed); 410 + return send_reply(rep_skb, info->snd_pid); 411 + } 412 + 413 + err: 414 + if (file) 415 + fput_light(file, fput_needed); 416 + nlmsg_free(rep_skb); 417 + return rc; 418 + } 419 + 384 420 static int taskstats_user_cmd(struct sk_buff *skb, 
struct genl_info *info) 385 421 { 386 422 int rc = 0; ··· 576 522 .policy = taskstats_cmd_get_policy, 577 523 }; 578 524 525 + static struct genl_ops cgroupstats_ops = { 526 + .cmd = CGROUPSTATS_CMD_GET, 527 + .doit = cgroupstats_user_cmd, 528 + .policy = cgroupstats_cmd_get_policy, 529 + }; 530 + 579 531 /* Needed early in initialization */ 580 532 void __init taskstats_init_early(void) 581 533 { ··· 606 546 if (rc < 0) 607 547 goto err; 608 548 549 + rc = genl_register_ops(&family, &cgroupstats_ops); 550 + if (rc < 0) 551 + goto err_cgroup_ops; 552 + 609 553 family_registered = 1; 554 + printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 610 555 return 0; 556 + err_cgroup_ops: 557 + genl_unregister_ops(&family, &taskstats_ops); 611 558 err: 612 559 genl_unregister_family(&family); 613 560 return rc;