Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/clock: Provide better clock continuity

When switching between the unstable and stable variants it is
currently possible that clock discontinuities occur.

And while these will mostly be 'small', attempt to do better.

As observed on my IVB-EP, the sched_clock() is ~1.5s ahead of the
ktime_get_ns() based timeline at the point of switchover
(sched_clock_init_late()) after SMP bringup.

Equally, when the TSC is later found to be unstable -- typically
because SMM tries to hide its SMI latencies by mucking with the TSC --
we want to avoid large jumps.

Since the clocksource watchdog reports the issue after the fact we
cannot exactly fix up time, but since SMI latencies are typically
small (~10ns range), the discontinuity is mainly due to drift between
sched_clock() and ktime_get_ns() (which on my desktop is ~79s over
24 days).

I dislike this patch because it adds overhead to the good case in
favour of dealing with badness. But given the widespread failure of
TSC stability this is worth it.

Note that in case the TSC makes drastic jumps after SMP bringup we're
still hosed. There's just not much we can do in that case without
stupid overhead.

If we were to somehow expose tsc_clocksource_reliable (which is hard
because this code is also used on ia64 and parisc) we could avoid some
of the newly introduced overhead.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Peter Zijlstra and committed by
Ingo Molnar
5680d809 9881b024

+65 -34
+65 -34
kernel/sched/clock.c
··· 86 86 static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); 87 87 static int __sched_clock_stable_early; 88 88 89 + /* 90 + * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset 91 + */ 92 + static __read_mostly u64 raw_offset; 93 + static __read_mostly u64 gtod_offset; 94 + 95 + struct sched_clock_data { 96 + u64 tick_raw; 97 + u64 tick_gtod; 98 + u64 clock; 99 + }; 100 + 101 + static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); 102 + 103 + static inline struct sched_clock_data *this_scd(void) 104 + { 105 + return this_cpu_ptr(&sched_clock_data); 106 + } 107 + 108 + static inline struct sched_clock_data *cpu_sdc(int cpu) 109 + { 110 + return &per_cpu(sched_clock_data, cpu); 111 + } 112 + 89 113 int sched_clock_stable(void) 90 114 { 91 115 return static_branch_likely(&__sched_clock_stable); ··· 117 93 118 94 static void __set_sched_clock_stable(void) 119 95 { 96 + struct sched_clock_data *scd = this_scd(); 97 + 98 + /* 99 + * Attempt to make the (initial) unstable->stable transition continuous. 100 + */ 101 + raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); 102 + 103 + printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", 104 + scd->tick_gtod, gtod_offset, 105 + scd->tick_raw, raw_offset); 106 + 120 107 static_branch_enable(&__sched_clock_stable); 121 108 tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); 122 109 } ··· 152 117 153 118 static void __clear_sched_clock_stable(struct work_struct *work) 154 119 { 155 - /* XXX worry about clock continuity */ 120 + struct sched_clock_data *scd = this_scd(); 121 + 122 + /* 123 + * Attempt to make the stable->unstable transition continuous. 124 + * 125 + * Trouble is, this is typically called from the TSC watchdog 126 + * timer, which is late per definition. This means the tick 127 + * values can already be screwy. 128 + * 129 + * Still do what we can. 
130 + */ 131 + gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); 132 + 133 + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", 134 + scd->tick_gtod, gtod_offset, 135 + scd->tick_raw, raw_offset); 136 + 156 137 static_branch_disable(&__sched_clock_stable); 157 138 tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); 158 139 } ··· 185 134 schedule_work(&sched_clock_work); 186 135 } 187 136 188 - struct sched_clock_data { 189 - u64 tick_raw; 190 - u64 tick_gtod; 191 - u64 clock; 192 - }; 193 - 194 - static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); 195 - 196 - static inline struct sched_clock_data *this_scd(void) 197 - { 198 - return this_cpu_ptr(&sched_clock_data); 199 - } 200 - 201 - static inline struct sched_clock_data *cpu_sdc(int cpu) 202 - { 203 - return &per_cpu(sched_clock_data, cpu); 204 - } 205 - 206 137 void sched_clock_init_late(void) 207 138 { 208 139 sched_clock_running = 2; 209 - 210 140 /* 211 141 * Ensure that it is impossible to not do a static_key update. 
212 142 * ··· 242 210 * scd->tick_gtod + TICK_NSEC); 243 211 */ 244 212 245 - clock = scd->tick_gtod + delta; 213 + clock = scd->tick_gtod + gtod_offset + delta; 246 214 min_clock = wrap_max(scd->tick_gtod, old_clock); 247 215 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); 248 216 ··· 328 296 u64 clock; 329 297 330 298 if (sched_clock_stable()) 331 - return sched_clock(); 299 + return sched_clock() + raw_offset; 332 300 333 301 if (unlikely(!sched_clock_running)) 334 302 return 0ull; ··· 349 317 void sched_clock_tick(void) 350 318 { 351 319 struct sched_clock_data *scd; 352 - u64 now, now_gtod; 353 - 354 - if (sched_clock_stable()) 355 - return; 356 - 357 - if (unlikely(!sched_clock_running)) 358 - return; 359 320 360 321 WARN_ON_ONCE(!irqs_disabled()); 361 322 323 + /* 324 + * Update these values even if sched_clock_stable(), because it can 325 + * become unstable at any point in time at which point we need some 326 + * values to fall back on. 327 + * 328 + * XXX arguably we can skip this if we expose tsc_clocksource_reliable 329 + */ 362 330 scd = this_scd(); 363 - now_gtod = ktime_to_ns(ktime_get()); 364 - now = sched_clock(); 331 + scd->tick_raw = sched_clock(); 332 + scd->tick_gtod = ktime_get_ns(); 365 333 366 - scd->tick_raw = now; 367 - scd->tick_gtod = now_gtod; 368 - sched_clock_local(scd); 334 + if (!sched_clock_stable() && likely(sched_clock_running)) 335 + sched_clock_local(scd); 369 336 } 370 337 371 338 /*