Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

hrtimer: Prevent stale expiry time in hrtimer_interrupt()

hrtimer_interrupt() has the following subtle issue:

hrtimer_interrupt()
lock(cpu_base);
expires_next = KTIME_MAX;

expire_timers(CLOCK_MONOTONIC);
expires = get_next_timer(CLOCK_MONOTONIC);
if (expires < expires_next)
expires_next = expires;

expire_timers(CLOCK_REALTIME);
unlock(cpu_base);
wakeup()
hrtimer_start(CLOCK_MONOTONIC, newtimer);
lock(cpu_base);
expires = get_next_timer(CLOCK_REALTIME);
if (expires < expires_next)
expires_next = expires;

So because we already evaluated the next expiring timer of
CLOCK_MONOTONIC we ignore that the expiry time of newtimer might be
earlier than the overall next expiry time in hrtimer_interrupt().

To solve this, remove the caching of the next expiry value from
hrtimer_interrupt() and reevaluate all active clock bases for the next
expiry value. To avoid another code duplication, create a shared
evaluation function and use it for hrtimer_get_next_event(),
hrtimer_force_reprogram() and hrtimer_interrupt().

There is another subtlety in this mechanism:

While hrtimer_interrupt() is running, we want to avoid touching the
hardware device because we will reprogram it anyway at the end of
hrtimer_interrupt(). This works nicely for hrtimers which get rearmed
via the HRTIMER_RESTART mechanism, because we drop out when the
callback on that CPU is running. But that fails, if a new timer gets
enqueued like in the example above.

This has another implication: While hrtimer_interrupt() is running we
refuse remote enqueueing of timers - see hrtimer_interrupt() and
hrtimer_check_target().

hrtimer_interrupt() tries to prevent this by setting cpu_base->expires_next
to KTIME_MAX, but that fails if a new timer gets queued.

Prevent both the hardware access and the remote enqueue
explicitly. We can loosen the restriction on the remote enqueue now
due to reevaluation of the next expiry value, but that needs a
separate patch.

Folded in a fix from Vignesh Radhakrishnan.

Reported-and-tested-by: Stanislav Fomichev <stfomichev@yandex-team.ru>
Based-on-patch-by: Stanislav Fomichev <stfomichev@yandex-team.ru>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: vigneshr@codeaurora.org
Cc: john.stultz@linaro.org
Cc: viresh.kumar@linaro.org
Cc: fweisbec@gmail.com
Cc: cl@linux.com
Cc: stuart.w.hayes@gmail.com
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1501202049190.5526@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

+52 -58
+2
include/linux/hrtimer.h
··· 170 170 * @clock_was_set: Indicates that clock was set from irq context. 171 171 * @expires_next: absolute time of the next event which was scheduled 172 172 * via clock_set_next_event() 173 + * @in_hrtirq: hrtimer_interrupt() is currently executing 173 174 * @hres_active: State of high resolution mode 174 175 * @hang_detected: The last hrtimer interrupt detected a hang 175 176 * @nr_events: Total number of hrtimer interrupt events ··· 186 185 unsigned int clock_was_set; 187 186 #ifdef CONFIG_HIGH_RES_TIMERS 188 187 ktime_t expires_next; 188 + int in_hrtirq; 189 189 int hres_active; 190 190 int hang_detected; 191 191 unsigned long nr_events;
+50 -58
kernel/time/hrtimer.c
··· 440 440 trace_hrtimer_cancel(timer); 441 441 } 442 442 443 + #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) 444 + ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) 445 + { 446 + struct hrtimer_clock_base *base = cpu_base->clock_base; 447 + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; 448 + int i; 449 + 450 + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 451 + struct timerqueue_node *next; 452 + struct hrtimer *timer; 453 + 454 + next = timerqueue_getnext(&base->active); 455 + if (!next) 456 + continue; 457 + 458 + timer = container_of(next, struct hrtimer, node); 459 + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 460 + if (expires.tv64 < expires_next.tv64) 461 + expires_next = expires; 462 + } 463 + /* 464 + * clock_was_set() might have changed base->offset of any of 465 + * the clock bases so the result might be negative. Fix it up 466 + * to prevent a false positive in clockevents_program_event(). 467 + */ 468 + if (expires_next.tv64 < 0) 469 + expires_next.tv64 = 0; 470 + return expires_next; 471 + } 472 + #endif 473 + 443 474 /* High resolution timer related functions */ 444 475 #ifdef CONFIG_HIGH_RES_TIMERS 445 476 ··· 519 488 static void 520 489 hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) 521 490 { 522 - int i; 523 - struct hrtimer_clock_base *base = cpu_base->clock_base; 524 - ktime_t expires, expires_next; 525 - 526 - expires_next.tv64 = KTIME_MAX; 527 - 528 - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 529 - struct hrtimer *timer; 530 - struct timerqueue_node *next; 531 - 532 - next = timerqueue_getnext(&base->active); 533 - if (!next) 534 - continue; 535 - timer = container_of(next, struct hrtimer, node); 536 - 537 - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 538 - /* 539 - * clock_was_set() has changed base->offset so the 540 - * result might be negative. 
Fix it up to prevent a 541 - * false positive in clockevents_program_event() 542 - */ 543 - if (expires.tv64 < 0) 544 - expires.tv64 = 0; 545 - if (expires.tv64 < expires_next.tv64) 546 - expires_next = expires; 547 - } 491 + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); 548 492 549 493 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) 550 494 return; ··· 590 584 return -ETIME; 591 585 592 586 if (expires.tv64 >= cpu_base->expires_next.tv64) 587 + return 0; 588 + 589 + /* 590 + * When the target cpu of the timer is currently executing 591 + * hrtimer_interrupt(), then we do not touch the clock event 592 + * device. hrtimer_interrupt() will reevaluate all clock bases 593 + * before reprogramming the device. 594 + */ 595 + if (cpu_base->in_hrtirq) 593 596 return 0; 594 597 595 598 /* ··· 1119 1104 ktime_t hrtimer_get_next_event(void) 1120 1105 { 1121 1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1122 - struct hrtimer_clock_base *base = cpu_base->clock_base; 1123 - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 1107 + ktime_t mindelta = { .tv64 = KTIME_MAX }; 1124 1108 unsigned long flags; 1125 - int i; 1126 1109 1127 1110 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1128 1111 1129 - if (!hrtimer_hres_active()) { 1130 - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1131 - struct hrtimer *timer; 1132 - struct timerqueue_node *next; 1133 - 1134 - next = timerqueue_getnext(&base->active); 1135 - if (!next) 1136 - continue; 1137 - 1138 - timer = container_of(next, struct hrtimer, node); 1139 - delta.tv64 = hrtimer_get_expires_tv64(timer); 1140 - delta = ktime_sub(delta, base->get_time()); 1141 - if (delta.tv64 < mindelta.tv64) 1142 - mindelta.tv64 = delta.tv64; 1143 - } 1144 - } 1112 + if (!hrtimer_hres_active()) 1113 + mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), 1114 + ktime_get()); 1145 1115 1146 1116 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1147 1117 ··· 1253 1253 
raw_spin_lock(&cpu_base->lock); 1254 1254 entry_time = now = hrtimer_update_base(cpu_base); 1255 1255 retry: 1256 - expires_next.tv64 = KTIME_MAX; 1256 + cpu_base->in_hrtirq = 1; 1257 1257 /* 1258 1258 * We set expires_next to KTIME_MAX here with cpu_base->lock 1259 1259 * held to prevent that a timer is enqueued in our queue via ··· 1291 1291 * are right-of a not yet expired timer, because that 1292 1292 * timer will have to trigger a wakeup anyway. 1293 1293 */ 1294 - 1295 - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { 1296 - ktime_t expires; 1297 - 1298 - expires = ktime_sub(hrtimer_get_expires(timer), 1299 - base->offset); 1300 - if (expires.tv64 < 0) 1301 - expires.tv64 = KTIME_MAX; 1302 - if (expires.tv64 < expires_next.tv64) 1303 - expires_next = expires; 1294 + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) 1304 1295 break; 1305 - } 1306 1296 1307 1297 __run_hrtimer(timer, &basenow); 1308 1298 } 1309 1299 } 1310 - 1300 + /* Reevaluate the clock bases for the next expiry */ 1301 + expires_next = __hrtimer_get_next_event(cpu_base); 1311 1302 /* 1312 1303 * Store the new expiry value so the migration code can verify 1313 1304 * against it. 1314 1305 */ 1315 1306 cpu_base->expires_next = expires_next; 1307 + cpu_base->in_hrtirq = 0; 1316 1308 raw_spin_unlock(&cpu_base->lock); 1317 1309 1318 1310 /* Reprogramming necessary ? */