Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cpuacct: add per-cgroup utime/stime statistics

Add per-cgroup cpuacct controller statistics like the system and user
time consumed by the group of tasks.

Changelog:

v7
- Changed the name of the statistic from utime to user and from stime to
system so that in the future we could easily add other statistics like irq,
softirq, steal times etc.

v6
- Fixed a bug in the error path of cpuacct_create() (pointed out by Li Zefan).

v5
- In cpuacct_stats_show(), use cputime64_to_clock_t() since we are
operating on a 64bit variable here.

v4
- Remove comments in cpuacct_update_stats() which explained why rcu_read_lock()
was needed (as per Peter Zijlstra's review comments).
- Don't say that percpu_counter_read() is broken in Documentation/cpuacct.txt
as per KAMEZAWA Hiroyuki's review comments.

v3
- Fix a small race in the cpuacct hierarchy walk.

v2
- stime and utime now exported in clock_t units instead of msecs.
- Addressed the code review comments from Balbir and Li Zefan.
- Moved to -tip tree.

v1
- Moved the stime/utime accounting to cpuacct controller.

Earlier versions
- http://lkml.org/lkml/2009/2/25/129

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Balaji Rao <balajirrao@gmail.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>
LKML-Reference: <20090331043222.GA4093@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by

Bharata B Rao and committed by
Ingo Molnar
ef12fefa c5f8d995

+99 -6
+18
Documentation/cgroups/cpuacct.txt
··· 30 30 process (bash) into it. CPU time consumed by this bash and its children 31 31 can be obtained from g1/cpuacct.usage and the same is accumulated in 32 32 /cgroups/cpuacct.usage also. 33 + 34 + cpuacct.stat file lists a few statistics which further divide the 35 + CPU time obtained by the cgroup into user and system times. Currently 36 + the following statistics are supported: 37 + 38 + user: Time spent by tasks of the cgroup in user mode. 39 + system: Time spent by tasks of the cgroup in kernel mode. 40 + 41 + user and system are in USER_HZ unit. 42 + 43 + cpuacct controller uses percpu_counter interface to collect user and 44 + system times. This has two side effects: 45 + 46 + - It is theoretically possible to see wrong values for user and system times. 47 + This is because percpu_counter_read() on 32bit systems isn't safe 48 + against concurrent writes. 49 + - It is possible to see slightly outdated values for user and system times 50 + due to the batch processing nature of percpu_counter.
+81 -6
kernel/sched.c
··· 1393 1393 struct rq_iterator *iterator); 1394 1394 #endif 1395 1395 1396 + /* Time spent by the tasks of the cpu accounting group executing in ... */ 1397 + enum cpuacct_stat_index { 1398 + CPUACCT_STAT_USER, /* ... user mode */ 1399 + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1400 + 1401 + CPUACCT_STAT_NSTATS, 1402 + }; 1403 + 1396 1404 #ifdef CONFIG_CGROUP_CPUACCT 1397 1405 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1406 + static void cpuacct_update_stats(struct task_struct *tsk, 1407 + enum cpuacct_stat_index idx, cputime_t val); 1398 1408 #else 1399 1409 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1410 + static inline void cpuacct_update_stats(struct task_struct *tsk, 1411 + enum cpuacct_stat_index idx, cputime_t val) {} 1400 1412 #endif 1401 1413 1402 1414 static inline void inc_cpu_load(struct rq *rq, unsigned long load) ··· 4248 4236 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4249 4237 else 4250 4238 cpustat->user = cputime64_add(cpustat->user, tmp); 4239 + 4240 + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 4251 4241 /* Account for user time used */ 4252 4242 acct_update_integrals(p); 4253 4243 } ··· 4310 4296 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4311 4297 else 4312 4298 cpustat->system = cputime64_add(cpustat->system, tmp); 4299 + 4300 + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 4313 4301 4314 4302 /* Account for system time used */ 4315 4303 acct_update_integrals(p); ··· 9555 9539 struct cgroup_subsys_state css; 9556 9540 /* cpuusage holds pointer to a u64-type object on every cpu */ 9557 9541 u64 *cpuusage; 9542 + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 9558 9543 struct cpuacct *parent; 9559 9544 }; 9560 9545 ··· 9580 9563 struct cgroup_subsys *ss, struct cgroup *cgrp) 9581 9564 { 9582 9565 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 9566 + int i; 9583 9567 9584 9568 if (!ca) 9585 - return ERR_PTR(-ENOMEM); 9569 + goto out; 
9586 9570 9587 9571 ca->cpuusage = alloc_percpu(u64); 9588 - if (!ca->cpuusage) { 9589 - kfree(ca); 9590 - return ERR_PTR(-ENOMEM); 9591 - } 9572 + if (!ca->cpuusage) 9573 + goto out_free_ca; 9574 + 9575 + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 9576 + if (percpu_counter_init(&ca->cpustat[i], 0)) 9577 + goto out_free_counters; 9592 9578 9593 9579 if (cgrp->parent) 9594 9580 ca->parent = cgroup_ca(cgrp->parent); 9595 9581 9596 9582 return &ca->css; 9583 + 9584 + out_free_counters: 9585 + while (--i >= 0) 9586 + percpu_counter_destroy(&ca->cpustat[i]); 9587 + free_percpu(ca->cpuusage); 9588 + out_free_ca: 9589 + kfree(ca); 9590 + out: 9591 + return ERR_PTR(-ENOMEM); 9597 9592 } 9598 9593 9599 9594 /* destroy an existing cpu accounting group */ ··· 9613 9584 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 9614 9585 { 9615 9586 struct cpuacct *ca = cgroup_ca(cgrp); 9587 + int i; 9616 9588 9589 + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 9590 + percpu_counter_destroy(&ca->cpustat[i]); 9617 9591 free_percpu(ca->cpuusage); 9618 9592 kfree(ca); 9619 9593 } ··· 9703 9671 return 0; 9704 9672 } 9705 9673 9674 + static const char *cpuacct_stat_desc[] = { 9675 + [CPUACCT_STAT_USER] = "user", 9676 + [CPUACCT_STAT_SYSTEM] = "system", 9677 + }; 9678 + 9679 + static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 9680 + struct cgroup_map_cb *cb) 9681 + { 9682 + struct cpuacct *ca = cgroup_ca(cgrp); 9683 + int i; 9684 + 9685 + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 9686 + s64 val = percpu_counter_read(&ca->cpustat[i]); 9687 + val = cputime64_to_clock_t(val); 9688 + cb->fill(cb, cpuacct_stat_desc[i], val); 9689 + } 9690 + return 0; 9691 + } 9692 + 9706 9693 static struct cftype files[] = { 9707 9694 { 9708 9695 .name = "usage", ··· 9732 9681 .name = "usage_percpu", 9733 9682 .read_seq_string = cpuacct_percpu_seq_read, 9734 9683 }, 9735 - 9684 + { 9685 + .name = "stat", 9686 + .read_map = cpuacct_stats_show, 9687 + }, 9736 9688 }; 9737 9689 9738 
9690 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) ··· 9767 9713 *cpuusage += cputime; 9768 9714 } 9769 9715 9716 + rcu_read_unlock(); 9717 + } 9718 + 9719 + /* 9720 + * Charge the system/user time to the task's accounting group. 9721 + */ 9722 + static void cpuacct_update_stats(struct task_struct *tsk, 9723 + enum cpuacct_stat_index idx, cputime_t val) 9724 + { 9725 + struct cpuacct *ca; 9726 + 9727 + if (unlikely(!cpuacct_subsys.active)) 9728 + return; 9729 + 9730 + rcu_read_lock(); 9731 + ca = task_ca(tsk); 9732 + 9733 + do { 9734 + percpu_counter_add(&ca->cpustat[idx], val); 9735 + ca = ca->parent; 9736 + } while (ca); 9770 9737 rcu_read_unlock(); 9771 9738 } 9772 9739