Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/hyperv: Fix hv tsc page based sched_clock for hibernation

read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is
bigger than the variable hv_sched_clock_offset, which is cached during
early boot, but depending on the timing this assumption may be false
when a hibernated VM starts again (the clock counter starts from 0
again) and is resuming back (Note: hv_init_tsc_clocksource() is not
called during hibernation/resume); consequently,
read_hv_sched_clock_tsc() may return a negative integer (which is
interpreted as a huge positive integer since the return type is u64)
and new kernel messages are prefixed with huge timestamps before
read_hv_sched_clock_tsc() grows big enough (which typically takes
several seconds).

Fix the issue by saving the Hyper-V clock counter just before the
suspend, and using it to correct the hv_sched_clock_offset in
resume. This makes hv tsc page based sched_clock continuous and ensures
that post resume, it starts from where it left off during suspend.
Override x86_platform.save_sched_clock_state and
x86_platform.restore_sched_clock_state routines to correct this as soon
as possible.

Note: if Invariant TSC is available, the issue doesn't happen because
1) we don't register read_hv_sched_clock_tsc() for sched clock:
See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework
clocksource and sched clock setup");
2) the common x86 code adjusts TSC similarly: see
__restore_processor_state() -> tsc_verify_tsc_adjust(true) and
x86_platform.restore_sched_clock_state().

Cc: stable@vger.kernel.org
Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation")
Co-developed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com>

authored by

Naman Jain and committed by
Wei Liu
bcc80dec cb1b78f1

+73 -1
+58
arch/x86/kernel/cpu/mshyperv.c
··· 223 223 hyperv_cleanup(); 224 224 } 225 225 #endif /* CONFIG_CRASH_DUMP */ 226 + 227 + static u64 hv_ref_counter_at_suspend; 228 + static void (*old_save_sched_clock_state)(void); 229 + static void (*old_restore_sched_clock_state)(void); 230 + 231 + /* 232 + * Hyper-V clock counter resets during hibernation. Save and restore clock 233 + * offset during suspend/resume, while also considering the time passed 234 + * before suspend. This is to make sure that sched_clock using hv tsc page 235 + * based clocksource, proceeds from where it left off during suspend and 236 + * it shows correct time for the timestamps of kernel messages after resume. 237 + */ 238 + static void save_hv_clock_tsc_state(void) 239 + { 240 + hv_ref_counter_at_suspend = hv_read_reference_counter(); 241 + } 242 + 243 + static void restore_hv_clock_tsc_state(void) 244 + { 245 + /* 246 + * Adjust the offsets used by hv tsc clocksource to 247 + * account for the time spent before hibernation. 248 + * adjusted value = reference counter (time) at suspend 249 + * - reference counter (time) now. 250 + */ 251 + hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); 252 + } 253 + 254 + /* 255 + * Functions to override save_sched_clock_state and restore_sched_clock_state 256 + * functions of x86_platform. The Hyper-V clock counter is reset during 257 + * suspend-resume and the offset used to measure time needs to be 258 + * corrected, post resume. 259 + */ 260 + static void hv_save_sched_clock_state(void) 261 + { 262 + old_save_sched_clock_state(); 263 + save_hv_clock_tsc_state(); 264 + } 265 + 266 + static void hv_restore_sched_clock_state(void) 267 + { 268 + restore_hv_clock_tsc_state(); 269 + old_restore_sched_clock_state(); 270 + } 271 + 272 + static void __init x86_setup_ops_for_tsc_pg_clock(void) 273 + { 274 + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 275 + return; 276 + 277 + old_save_sched_clock_state = x86_platform.save_sched_clock_state; 278 + x86_platform.save_sched_clock_state = hv_save_sched_clock_state; 279 + 280 + old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; 281 + x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; 282 + } 226 283 #endif /* CONFIG_HYPERV */ 227 284 228 285 static uint32_t __init ms_hyperv_platform(void) ··· 636 579 637 580 /* Register Hyper-V specific clocksource */ 638 581 hv_init_clocksource(); 582 + x86_setup_ops_for_tsc_pg_clock(); 639 583 hv_vtl_init_platform(); 640 584 #endif 641 585 /*
+13 -1
drivers/clocksource/hyperv_timer.c
··· 27 27 #include <asm/mshyperv.h> 28 28 29 29 static struct clock_event_device __percpu *hv_clock_event; 30 - static u64 hv_sched_clock_offset __ro_after_init; 30 + /* Note: offset can hold negative values after hibernation. */ 31 + static u64 hv_sched_clock_offset __read_mostly; 31 32 32 33 /* 33 34 * If false, we're using the old mechanism for stimer0 interrupts ··· 469 468 tsc_msr.enable = 1; 470 469 tsc_msr.pfn = tsc_pfn; 471 470 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 471 + } 472 + 473 + /* 474 + * Called during resume from hibernation, from overridden 475 + * x86_platform.restore_sched_clock_state routine. This is to adjust offsets 476 + * used to calculate time for hv tsc page based sched_clock, to account for 477 + * time spent before hibernation. 478 + */ 479 + void hv_adj_sched_clock_offset(u64 offset) 480 + { 481 + hv_sched_clock_offset -= offset; 472 482 } 473 483 474 484 #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
+2
include/clocksource/hyperv_timer.h
··· 38 38 extern unsigned long hv_get_tsc_pfn(void); 39 39 extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); 40 40 41 + extern void hv_adj_sched_clock_offset(u64 offset); 42 + 41 43 static __always_inline bool 42 44 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, 43 45 u64 *cur_tsc, u64 *time)