Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

posixtimers, sched: Fix posix clock monotonicity

Impact: Regression fix (for a bug where clock_gettime() could go backwards)

This patch re-introduces a couple of functions, task_sched_runtime
and thread_group_sched_runtime, which were removed at the
time of 2.6.28-rc1.

These functions protect the sampling of the thread/process clock with
the rq lock. Holding the rq lock is required to prevent rq->clock from
being updated during the sampling.

i.e.
Without it, clock_gettime() may return
((accounted runtime before update) + (delta after update)),
which is less than what it should be.

v2 -> v3:
- Rename static helper function __task_delta_exec()
to do_task_delta_exec() since -tip tree already has
a __task_delta_exec() of different version.

v1 -> v2:
- Revises comments of function and patch description.
- Add note about accuracy of thread group's runtime.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org [2.6.28.x][2.6.29.x]
LKML-Reference: <49D1CC93.4080401@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by

Hidetoshi Seto and committed by
Ingo Molnar
c5f8d995 13b8bd0a

+61 -11
+4 -3
kernel/posix-cpu-timers.c
··· 224 224 cpu->cpu = virt_ticks(p); 225 225 break; 226 226 case CPUCLOCK_SCHED: 227 - cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); 227 + cpu->sched = task_sched_runtime(p); 228 228 break; 229 229 } 230 230 return 0; ··· 240 240 { 241 241 struct task_cputime cputime; 242 242 243 - thread_group_cputime(p, &cputime); 244 243 switch (CPUCLOCK_WHICH(which_clock)) { 245 244 default: 246 245 return -EINVAL; 247 246 case CPUCLOCK_PROF: 247 + thread_group_cputime(p, &cputime); 248 248 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 249 249 break; 250 250 case CPUCLOCK_VIRT: 251 + thread_group_cputime(p, &cputime); 251 252 cpu->cpu = cputime.utime; 252 253 break; 253 254 case CPUCLOCK_SCHED: 254 - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 255 + cpu->sched = thread_group_sched_runtime(p); 255 256 break; 256 257 } 257 258 return 0;
+57 -8
kernel/sched.c
··· 4139 4139 EXPORT_PER_CPU_SYMBOL(kstat); 4140 4140 4141 4141 /* 4142 - * Return any ns on the sched_clock that have not yet been banked in 4142 + * Return any ns on the sched_clock that have not yet been accounted in 4143 4143 * @p in case that task is currently running. 4144 + * 4145 + * Called with task_rq_lock() held on @rq. 4144 4146 */ 4147 + static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 4148 + { 4149 + u64 ns = 0; 4150 + 4151 + if (task_current(rq, p)) { 4152 + update_rq_clock(rq); 4153 + ns = rq->clock - p->se.exec_start; 4154 + if ((s64)ns < 0) 4155 + ns = 0; 4156 + } 4157 + 4158 + return ns; 4159 + } 4160 + 4145 4161 unsigned long long task_delta_exec(struct task_struct *p) 4146 4162 { 4147 4163 unsigned long flags; ··· 4165 4149 u64 ns = 0; 4166 4150 4167 4151 rq = task_rq_lock(p, &flags); 4152 + ns = do_task_delta_exec(p, rq); 4153 + task_rq_unlock(rq, &flags); 4168 4154 4169 - if (task_current(rq, p)) { 4170 - u64 delta_exec; 4155 + return ns; 4156 + } 4171 4157 4172 - update_rq_clock(rq); 4173 - delta_exec = rq->clock - p->se.exec_start; 4174 - if ((s64)delta_exec > 0) 4175 - ns = delta_exec; 4176 - } 4158 + /* 4159 + * Return accounted runtime for the task. 4160 + * In case the task is currently running, return the runtime plus current's 4161 + * pending runtime that have not been accounted yet. 4162 + */ 4163 + unsigned long long task_sched_runtime(struct task_struct *p) 4164 + { 4165 + unsigned long flags; 4166 + struct rq *rq; 4167 + u64 ns = 0; 4177 4168 4169 + rq = task_rq_lock(p, &flags); 4170 + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 4171 + task_rq_unlock(rq, &flags); 4172 + 4173 + return ns; 4174 + } 4175 + 4176 + /* 4177 + * Return sum_exec_runtime for the thread group. 4178 + * In case the task is currently running, return the sum plus current's 4179 + * pending runtime that have not been accounted yet. 
4180 + * 4181 + * Note that the thread group might have other running tasks as well, 4182 + * so the return value not includes other pending runtime that other 4183 + * running tasks might have. 4184 + */ 4185 + unsigned long long thread_group_sched_runtime(struct task_struct *p) 4186 + { 4187 + struct task_cputime totals; 4188 + unsigned long flags; 4189 + struct rq *rq; 4190 + u64 ns; 4191 + 4192 + rq = task_rq_lock(p, &flags); 4193 + thread_group_cputime(p, &totals); 4194 + ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 4178 4195 task_rq_unlock(rq, &flags); 4179 4196 4180 4197 return ns;