sched: fix process time monotonicity

Spencer reported a problem where utime and stime were going negative despite
the fixes in commit b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa. The suspected
reason is that signal_struct maintains its own utime and stime (accumulated
from exited tasks), and these are not updated using the new task_utime()
routine: sig->utime adds the raw tsk->utime rather than task_utime(), so
sig->utime can go backwards and reintroduce the same problem. This patch
fixes the problem by accumulating task_utime(), task_stime() and task_gtime()
into the signal_struct instead, moving those helpers from fs/proc/array.c to
kernel/sched.c and exporting them.
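
To make the failure mode concrete, here is a minimal userspace sketch (not
kernel code; derived_utime() and the sample numbers are invented for
illustration). task_utime() scales the precise CFS runtime by the sampled
utime/total ratio, and when that ratio shifts between samples the derived
value can shrink even though real runtime only grows - the max() clamp
absorbs exactly that:

/* Userspace illustration only: derived_utime() mirrors the scaling done
 * in task_utime(); the helper name and numbers are made up. */
#include <stdio.h>
#include <stdint.h>

static uint64_t derived_utime(uint64_t utime, uint64_t stime,
			      uint64_t sum_exec_runtime)
{
	uint64_t total = utime + stime;	/* sampled tick totals */
	uint64_t temp = sum_exec_runtime;	/* precise CFS runtime */

	if (total) {
		temp *= utime;	/* scale by the utime share ... */
		temp /= total;	/* ... (the kernel uses do_div) */
	}
	return temp;
}

int main(void)
{
	uint64_t prev_utime = 0;
	/* The utime:stime ratio shifts between the two samples, so the
	 * derived value drops from 12 to 10 although runtime only grew. */
	uint64_t samples[] = {
		derived_utime(10, 0, 12),	/* 12 * 10/10 = 12 */
		derived_utime(10, 10, 20),	/* 20 * 10/20 = 10 */
	};

	for (int i = 0; i < 2; i++) {
		/* task_utime()'s clamp: never report a smaller value */
		if (samples[i] > prev_utime)
			prev_utime = samples[i];
		printf("derived=%llu reported=%llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)prev_utime);
	}
	return 0;
}

sig->utime, by contrast, accumulated the raw tsk->utime with no such clamp,
so totals built from it could fall below values already reported via
task_utime().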

TODO: Using max(task->prev_utime, derived utime) works for now, but a more
generic solution is to implement cputime_max() and use cputime_gt() for the
comparison.
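
For illustration, a sketch of what that could look like (cputime_max() is
hypothetical and not part of this patch; cputime_gt() is the existing
comparison helper from asm/cputime.h):

/* Hypothetical helper suggested by the TODO above, built on the
 * existing cputime_gt() comparison; not part of this patch. */
#define cputime_max(__a, __b)	(cputime_gt(__a, __b) ? (__a) : (__b))

/* task_utime() could then read:
 *	p->prev_utime = cputime_max(p->prev_utime, clock_t_to_cputime(utime));
 */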

Reported-by: spencer@bluehost.com
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Balbir Singh and committed by Ingo Molnar (commit 49048622, parent 56c7426b)

 fs/proc/array.c       | 59 ----------------------------------------
 include/linux/sched.h |  4 ++++
 kernel/exit.c         |  6 +++---
 kernel/sched.c        | 59 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 62 deletions(-)

fs/proc/array.c
@@ -337,65 +337,6 @@
 	return 0;
 }
 
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-static cputime_t task_utime(struct task_struct *p)
-{
-	return p->utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	return p->stime;
-}
-#else
-static cputime_t task_utime(struct task_struct *p)
-{
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
-	u64 temp;
-
-	/*
-	 * Use CFS's precise accounting:
-	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
-
-	if (total) {
-		temp *= utime;
-		do_div(temp, total);
-	}
-	utime = (clock_t)temp;
-
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
-	return p->prev_utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	clock_t stime;
-
-	/*
-	 * Use CFS's precise accounting. (we subtract utime from
-	 * the total, to make sure the total observed by userspace
-	 * grows monotonically - apps rely on that):
-	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
-
-	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
-
-	return p->prev_stime;
-}
-#endif
-
-static cputime_t task_gtime(struct task_struct *p)
-{
-	return p->gtime;
-}
-
 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		struct pid *pid, struct task_struct *task, int whole)
 {
include/linux/sched.h
@@ -1475,6 +1475,10 @@
 	__put_task_struct(t);
 }
 
+extern cputime_t task_utime(struct task_struct *p);
+extern cputime_t task_stime(struct task_struct *p);
+extern cputime_t task_gtime(struct task_struct *p);
+
 /*
  * Per process flags
  */
kernel/exit.c
@@ -112,9 +112,9 @@
 	 * We won't ever get here for the group leader, since it
 	 * will have been the last reference on the signal_struct.
 	 */
-	sig->utime = cputime_add(sig->utime, tsk->utime);
-	sig->stime = cputime_add(sig->stime, tsk->stime);
-	sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+	sig->utime = cputime_add(sig->utime, task_utime(tsk));
+	sig->stime = cputime_add(sig->stime, task_stime(tsk));
+	sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 	sig->min_flt += tsk->min_flt;
 	sig->maj_flt += tsk->maj_flt;
 	sig->nvcsw += tsk->nvcsw;
kernel/sched.c
@@ -4179,6 +4179,65 @@
 }
 
 /*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+	clock_t utime = cputime_to_clock_t(p->utime),
+		total = utime + cputime_to_clock_t(p->stime);
+	u64 temp;
+
+	/*
+	 * Use CFS's precise accounting:
+	 */
+	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+	if (total) {
+		temp *= utime;
+		do_div(temp, total);
+	}
+	utime = (clock_t)temp;
+
+	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+	clock_t stime;
+
+	/*
+	 * Use CFS's precise accounting. (we subtract utime from
+	 * the total, to make sure the total observed by userspace
+	 * grows monotonically - apps rely on that):
+	 */
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
+
+	if (stime >= 0)
+		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+	return p->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+	return p->gtime;
+}
+
+/*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  *