Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] idle cputime accounting

The cpu time spent by the idle process actually doing something is
currently accounted as idle time. This is plain wrong; the architectures
that support VIRT_CPU_ACCOUNTING=y can do better: distinguish between the
time spent doing nothing and the time spent by idle doing work. The first
is accounted with account_idle_time and the second with account_system_time.
The architectures that use the account_xxx_time interface directly and not
the account_xxx_ticks interface now need to do the check for the idle
process in their arch code. In particular, to improve the system vs. true
idle time accounting the arch code needs to measure the true idle time
instead of just testing for the idle process.
To improve the tick based accounting as well we would need an architecture
primitive that can tell us if the pt_regs of the interrupted context
points to the magic instruction that halts the cpu.

In addition, idle time is no longer added to the stime of the idle process.
This field now contains the system time of the idle process as it should
be. On systems without VIRT_CPU_ACCOUNTING this will always be zero as
every tick that occurs while idle is running will be accounted as idle
time.

This patch contains the necessary common code changes to be able to
distinguish idle system time and true idle time. The architectures with
support for VIRT_CPU_ACCOUNTING need some changes to exploit this.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

+114 -54
+8 -2
arch/ia64/kernel/time.c
··· 93 93 now = ia64_get_itc(); 94 94 95 95 delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp)); 96 - account_system_time(prev, 0, delta_stime, delta_stime); 96 + if (idle_task(smp_processor_id()) != prev) 97 + account_system_time(prev, 0, delta_stime, delta_stime); 98 + else 99 + account_idle_time(delta_stime); 97 100 98 101 if (pi->ac_utime) { 99 102 delta_utime = cycle_to_cputime(pi->ac_utime); ··· 123 120 now = ia64_get_itc(); 124 121 125 122 delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); 126 - account_system_time(tsk, 0, delta_stime, delta_stime); 123 + if (irq_count() || idle_task(smp_processor_id()) != tsk) 124 + account_system_time(tsk, 0, delta_stime, delta_stime); 125 + else 126 + account_idle_time(delta_stime); 127 127 ti->ac_stime = 0; 128 128 129 129 ti->ac_stamp = now;
+1
arch/powerpc/kernel/process.c
··· 33 33 #include <linux/mqueue.h> 34 34 #include <linux/hardirq.h> 35 35 #include <linux/utsname.h> 36 + #include <linux/kernel_stat.h> 36 37 37 38 #include <asm/pgtable.h> 38 39 #include <asm/uaccess.h>
+10 -3
arch/powerpc/kernel/time.c
··· 256 256 delta += sys_time; 257 257 get_paca()->system_time = 0; 258 258 } 259 - account_system_time(tsk, 0, delta, deltascaled); 259 + if (in_irq() || idle_task(smp_processor_id()) != tsk) 260 + account_system_time(tsk, 0, delta, deltascaled); 261 + else 262 + account_idle_time(delta); 260 263 per_cpu(cputime_last_delta, smp_processor_id()) = delta; 261 264 per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled; 262 265 local_irq_restore(flags); ··· 338 335 tb = mftb(); 339 336 purr = mfspr(SPRN_PURR); 340 337 stolen = (tb - pme->tb) - (purr - pme->purr); 341 - if (stolen > 0) 342 - account_steal_time(current, stolen); 338 + if (stolen > 0) { 339 + if (idle_task(smp_processor_id()) != current) 340 + account_steal_time(stolen); 341 + else 342 + account_idle_time(stolen); 343 + } 343 344 pme->tb = tb; 344 345 pme->purr = purr; 345 346 }
+16 -4
arch/s390/kernel/vtime.c
··· 55 55 cputime = S390_lowcore.system_timer >> 12; 56 56 S390_lowcore.system_timer -= cputime << 12; 57 57 S390_lowcore.steal_clock -= cputime << 12; 58 - account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime); 58 + if (idle_task(smp_processor_id()) != current) 59 + account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime); 60 + else 61 + account_idle_time(cputime); 59 62 60 63 cputime = S390_lowcore.steal_clock; 61 64 if ((__s64) cputime > 0) { 62 65 cputime >>= 12; 63 66 S390_lowcore.steal_clock -= cputime << 12; 64 - account_steal_time(tsk, cputime); 67 + if (idle_task(smp_processor_id()) != current) 68 + account_steal_time(cputime); 69 + else 70 + account_idle_time(cputime); 65 71 } 66 72 } 67 73 ··· 93 87 cputime = S390_lowcore.system_timer >> 12; 94 88 S390_lowcore.system_timer -= cputime << 12; 95 89 S390_lowcore.steal_clock -= cputime << 12; 96 - account_system_time(tsk, 0, cputime, cputime); 90 + if (idle_task(smp_processor_id()) != current) 91 + account_system_time(tsk, 0, cputime, cputime); 92 + else 93 + account_idle_time(cputime); 97 94 } 98 95 99 96 /* ··· 116 107 cputime = S390_lowcore.system_timer >> 12; 117 108 S390_lowcore.system_timer -= cputime << 12; 118 109 S390_lowcore.steal_clock -= cputime << 12; 119 - account_system_time(tsk, 0, cputime, cputime); 110 + if (in_irq() || idle_task(smp_processor_id()) != current) 111 + account_system_time(tsk, 0, cputime, cputime); 112 + else 113 + account_idle_time(cputime); 120 114 } 121 115 EXPORT_SYMBOL_GPL(account_system_vtime); 122 116
+4 -6
arch/x86/xen/time.c
··· 132 132 *snap = state; 133 133 134 134 /* Add the appropriate number of ticks of stolen time, 135 - including any left-overs from last time. Passing NULL to 136 - account_steal_time accounts the time as stolen. */ 135 + including any left-overs from last time. */ 137 136 stolen = runnable + offline + __get_cpu_var(residual_stolen); 138 137 139 138 if (stolen < 0) ··· 140 141 141 142 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 142 143 __get_cpu_var(residual_stolen) = stolen; 143 - account_steal_time(NULL, ticks); 144 + account_steal_ticks(ticks); 144 145 145 146 /* Add the appropriate number of ticks of blocked time, 146 - including any left-overs from last time. Passing idle to 147 - account_steal_time accounts the time as idle/wait. */ 147 + including any left-overs from last time. */ 148 148 blocked += __get_cpu_var(residual_blocked); 149 149 150 150 if (blocked < 0) ··· 151 153 152 154 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 153 155 __get_cpu_var(residual_blocked) = blocked; 154 - account_steal_time(idle_task(smp_processor_id()), ticks); 156 + account_idle_ticks(ticks); 155 157 } 156 158 157 159 /*
+6 -1
include/linux/kernel_stat.h
··· 81 81 extern unsigned long long task_delta_exec(struct task_struct *); 82 82 extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 83 83 extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 84 - extern void account_steal_time(struct task_struct *, cputime_t); 84 + extern void account_steal_time(cputime_t); 85 + extern void account_idle_time(cputime_t); 86 + 87 + extern void account_process_tick(struct task_struct *, int user); 88 + extern void account_steal_ticks(unsigned long ticks); 89 + extern void account_idle_ticks(unsigned long ticks); 85 90 86 91 #endif /* _LINUX_KERNEL_STAT_H */
-1
include/linux/sched.h
··· 284 284 285 285 extern void cpu_init (void); 286 286 extern void trap_init(void); 287 - extern void account_process_tick(struct task_struct *task, int user); 288 287 extern void update_process_times(int user); 289 288 extern void scheduler_tick(void); 290 289
+63 -17
kernel/sched.c
··· 4139 4139 cputime_t cputime, cputime_t cputime_scaled) 4140 4140 { 4141 4141 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4142 - struct rq *rq = this_rq(); 4143 4142 cputime64_t tmp; 4144 4143 4145 4144 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { ··· 4157 4158 cpustat->irq = cputime64_add(cpustat->irq, tmp); 4158 4159 else if (softirq_count()) 4159 4160 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4160 - else if (p != rq->idle) 4161 - cpustat->system = cputime64_add(cpustat->system, tmp); 4162 - else if (atomic_read(&rq->nr_iowait) > 0) 4163 - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4164 4161 else 4165 - cpustat->idle = cputime64_add(cpustat->idle, tmp); 4162 + cpustat->system = cputime64_add(cpustat->system, tmp); 4163 + 4166 4164 /* Account for system time used */ 4167 4165 acct_update_integrals(p); 4168 4166 } 4169 4167 4170 4168 /* 4171 4169 * Account for involuntary wait time. 4172 - * @p: the process from which the cpu time has been stolen 4173 4170 * @steal: the cpu time spent in involuntary wait 4174 4171 */ 4175 - void account_steal_time(struct task_struct *p, cputime_t steal) 4172 + void account_steal_time(cputime_t cputime) 4176 4173 { 4177 4174 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4178 - cputime64_t tmp = cputime_to_cputime64(steal); 4175 + cputime64_t cputime64 = cputime_to_cputime64(cputime); 4176 + 4177 + cpustat->steal = cputime64_add(cpustat->steal, cputime64); 4178 + } 4179 + 4180 + /* 4181 + * Account for idle time. 
4182 + * @cputime: the cpu time spent in idle wait 4183 + */ 4184 + void account_idle_time(cputime_t cputime) 4185 + { 4186 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4187 + cputime64_t cputime64 = cputime_to_cputime64(cputime); 4179 4188 struct rq *rq = this_rq(); 4180 4189 4181 - if (p == rq->idle) { 4182 - p->stime = cputime_add(p->stime, steal); 4183 - if (atomic_read(&rq->nr_iowait) > 0) 4184 - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4185 - else 4186 - cpustat->idle = cputime64_add(cpustat->idle, tmp); 4187 - } else 4188 - cpustat->steal = cputime64_add(cpustat->steal, tmp); 4190 + if (atomic_read(&rq->nr_iowait) > 0) 4191 + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 4192 + else 4193 + cpustat->idle = cputime64_add(cpustat->idle, cputime64); 4189 4194 } 4195 + 4196 + #ifndef CONFIG_VIRT_CPU_ACCOUNTING 4197 + 4198 + /* 4199 + * Account a single tick of cpu time. 4200 + * @p: the process that the cpu time gets accounted to 4201 + * @user_tick: indicates if the tick is a user or a system tick 4202 + */ 4203 + void account_process_tick(struct task_struct *p, int user_tick) 4204 + { 4205 + cputime_t one_jiffy = jiffies_to_cputime(1); 4206 + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); 4207 + struct rq *rq = this_rq(); 4208 + 4209 + if (user_tick) 4210 + account_user_time(p, one_jiffy, one_jiffy_scaled); 4211 + else if (p != rq->idle) 4212 + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 4213 + one_jiffy_scaled); 4214 + else 4215 + account_idle_time(one_jiffy); 4216 + } 4217 + 4218 + /* 4219 + * Account multiple ticks of steal time. 4220 + * @p: the process from which the cpu time has been stolen 4221 + * @ticks: number of stolen ticks 4222 + */ 4223 + void account_steal_ticks(unsigned long ticks) 4224 + { 4225 + account_steal_time(jiffies_to_cputime(ticks)); 4226 + } 4227 + 4228 + /* 4229 + * Account multiple ticks of idle time. 
4230 + * @ticks: number of idle ticks 4231 + */ 4232 + void account_idle_ticks(unsigned long ticks) 4233 + { 4234 + account_idle_time(jiffies_to_cputime(ticks)); 4235 + } 4236 + 4237 + #endif 4190 4238 4191 4239 /* 4192 4240 * Use precise platform statistics if available:
+6 -7
kernel/time/tick-sched.c
··· 419 419 { 420 420 int cpu = smp_processor_id(); 421 421 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 422 + #ifndef CONFIG_VIRT_CPU_ACCOUNTING 422 423 unsigned long ticks; 423 - cputime_t cputime; 424 + #endif 424 425 ktime_t now; 425 426 426 427 local_irq_disable(); ··· 443 442 tick_do_update_jiffies64(now); 444 443 cpu_clear(cpu, nohz_cpu_mask); 445 444 445 + #ifndef CONFIG_VIRT_CPU_ACCOUNTING 446 446 /* 447 447 * We stopped the tick in idle. Update process times would miss the 448 448 * time we slept as update_process_times does only a 1 tick ··· 453 451 /* 454 452 * We might be one off. Do not randomly account a huge number of ticks! 455 453 */ 456 - if (ticks && ticks < LONG_MAX) { 457 - add_preempt_count(HARDIRQ_OFFSET); 458 - cputime = jiffies_to_cputime(ticks); 459 - account_system_time(current, HARDIRQ_OFFSET, cputime, cputime); 460 - sub_preempt_count(HARDIRQ_OFFSET); 461 - } 454 + if (ticks && ticks < LONG_MAX) 455 + account_idle_ticks(ticks); 456 + #endif 462 457 463 458 touch_softlockup_watchdog(); 464 459 /*
-13
kernel/timer.c
··· 1018 1018 } 1019 1019 #endif 1020 1020 1021 - #ifndef CONFIG_VIRT_CPU_ACCOUNTING 1022 - void account_process_tick(struct task_struct *p, int user_tick) 1023 - { 1024 - cputime_t one_jiffy = jiffies_to_cputime(1); 1025 - 1026 - if (user_tick) 1027 - account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy)); 1028 - else 1029 - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 1030 - cputime_to_scaled(one_jiffy)); 1031 - } 1032 - #endif 1033 - 1034 1021 /* 1035 1022 * Called from the timer interrupt handler to charge one tick to the current 1036 1023 * process. user_tick is 1 if the tick is user time, 0 for system.