at master 18 kB view raw
1// SPDX-License-Identifier: GPL-2.0 2 3/* 4 * Clocksource driver for the synthetic counter and timers 5 * provided by the Hyper-V hypervisor to guest VMs, as described 6 * in the Hyper-V Top Level Functional Spec (TLFS). This driver 7 * is instruction set architecture independent. 8 * 9 * Copyright (C) 2019, Microsoft, Inc. 10 * 11 * Author: Michael Kelley <mikelley@microsoft.com> 12 */ 13 14#include <linux/percpu.h> 15#include <linux/cpumask.h> 16#include <linux/clockchips.h> 17#include <linux/clocksource.h> 18#include <linux/sched_clock.h> 19#include <linux/mm.h> 20#include <linux/cpuhotplug.h> 21#include <linux/interrupt.h> 22#include <linux/irq.h> 23#include <linux/acpi.h> 24#include <linux/hyperv.h> 25#include <linux/export.h> 26#include <clocksource/hyperv_timer.h> 27#include <hyperv/hvhdk.h> 28#include <asm/mshyperv.h> 29 30static struct clock_event_device __percpu *hv_clock_event; 31/* Note: offset can hold negative values after hibernation. */ 32static u64 hv_sched_clock_offset __read_mostly; 33 34/* 35 * If false, we're using the old mechanism for stimer0 interrupts 36 * where it sends a VMbus message when it expires. The old 37 * mechanism is used when running on older versions of Hyper-V 38 * that don't support Direct Mode. While Hyper-V provides 39 * four stimer's per CPU, Linux uses only stimer0. 40 * 41 * Because Direct Mode does not require processing a VMbus 42 * message, stimer interrupts can be enabled earlier in the 43 * process of booting a CPU, and consistent with when timer 44 * interrupts are enabled for other clocksource drivers. 45 * However, for legacy versions of Hyper-V when Direct Mode 46 * is not enabled, setting up stimer interrupts must be 47 * delayed until VMbus is initialized and can process the 48 * interrupt message. 49 */ 50static bool direct_mode_enabled; 51 52static int stimer0_irq = -1; 53static int stimer0_message_sint; 54static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt); 55 56/* 57 * Common code for stimer0 interrupts coming via Direct Mode or 58 * as a VMbus message. 59 */ 60void hv_stimer0_isr(void) 61{ 62 struct clock_event_device *ce; 63 64 ce = this_cpu_ptr(hv_clock_event); 65 ce->event_handler(ce); 66} 67EXPORT_SYMBOL_GPL(hv_stimer0_isr); 68 69/* 70 * stimer0 interrupt handler for architectures that support 71 * per-cpu interrupts, which also implies Direct Mode. 72 */ 73static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) 74{ 75 hv_stimer0_isr(); 76 return IRQ_HANDLED; 77} 78 79static int hv_ce_set_next_event(unsigned long delta, 80 struct clock_event_device *evt) 81{ 82 u64 current_tick; 83 84 current_tick = hv_read_reference_counter(); 85 current_tick += delta; 86 hv_set_msr(HV_MSR_STIMER0_COUNT, current_tick); 87 return 0; 88} 89 90static int hv_ce_shutdown(struct clock_event_device *evt) 91{ 92 hv_set_msr(HV_MSR_STIMER0_COUNT, 0); 93 hv_set_msr(HV_MSR_STIMER0_CONFIG, 0); 94 if (direct_mode_enabled && stimer0_irq >= 0) 95 disable_percpu_irq(stimer0_irq); 96 97 return 0; 98} 99 100static int hv_ce_set_oneshot(struct clock_event_device *evt) 101{ 102 union hv_stimer_config timer_cfg; 103 104 timer_cfg.as_uint64 = 0; 105 timer_cfg.enable = 1; 106 timer_cfg.auto_enable = 1; 107 if (direct_mode_enabled) { 108 /* 109 * When it expires, the timer will directly interrupt 110 * on the specified hardware vector/IRQ. 111 */ 112 timer_cfg.direct_mode = 1; 113 timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; 114 if (stimer0_irq >= 0) 115 enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); 116 } else { 117 /* 118 * When it expires, the timer will generate a VMbus message, 119 * to be handled by the normal VMbus interrupt handler. 120 */ 121 timer_cfg.direct_mode = 0; 122 timer_cfg.sintx = stimer0_message_sint; 123 } 124 hv_set_msr(HV_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); 125 return 0; 126} 127 128/* 129 * hv_stimer_init - Per-cpu initialization of the clockevent 130 */ 131static int hv_stimer_init(unsigned int cpu) 132{ 133 struct clock_event_device *ce; 134 135 if (!hv_clock_event) 136 return 0; 137 138 ce = per_cpu_ptr(hv_clock_event, cpu); 139 ce->name = "Hyper-V clockevent"; 140 ce->features = CLOCK_EVT_FEAT_ONESHOT; 141 ce->cpumask = cpumask_of(cpu); 142 143 /* 144 * Lower the rating of the Hyper-V timer in a TDX VM without paravisor, 145 * so the local APIC timer (lapic_clockevent) is the default timer in 146 * such a VM. The Hyper-V timer is not preferred in such a VM because 147 * it depends on the slow VM Reference Counter MSR (the Hyper-V TSC 148 * page is not enbled in such a VM because the VM uses Invariant TSC 149 * as a better clocksource and it's challenging to mark the Hyper-V 150 * TSC page shared in very early boot). 151 */ 152 if (!ms_hyperv.paravisor_present && hv_isolation_type_tdx()) 153 ce->rating = 90; 154 else 155 ce->rating = 1000; 156 157 ce->set_state_shutdown = hv_ce_shutdown; 158 ce->set_state_oneshot = hv_ce_set_oneshot; 159 ce->set_next_event = hv_ce_set_next_event; 160 161 clockevents_config_and_register(ce, 162 HV_CLOCK_HZ, 163 HV_MIN_DELTA_TICKS, 164 HV_MAX_MAX_DELTA_TICKS); 165 return 0; 166} 167 168/* 169 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent 170 */ 171int hv_stimer_cleanup(unsigned int cpu) 172{ 173 struct clock_event_device *ce; 174 175 if (!hv_clock_event) 176 return 0; 177 178 /* 179 * In the legacy case where Direct Mode is not enabled 180 * (which can only be on x86/64), stimer cleanup happens 181 * relatively early in the CPU offlining process. We 182 * must unbind the stimer-based clockevent device so 183 * that the LAPIC timer can take over until clockevents 184 * are no longer needed in the offlining process. Note 185 * that clockevents_unbind_device() eventually calls 186 * hv_ce_shutdown(). 187 * 188 * The unbind should not be done when Direct Mode is 189 * enabled because we may be on an architecture where 190 * there are no other clockevent devices to fallback to. 191 */ 192 ce = per_cpu_ptr(hv_clock_event, cpu); 193 if (direct_mode_enabled) 194 hv_ce_shutdown(ce); 195 else 196 clockevents_unbind_device(ce, cpu); 197 198 return 0; 199} 200EXPORT_SYMBOL_GPL(hv_stimer_cleanup); 201 202/* 203 * These placeholders are overridden by arch specific code on 204 * architectures that need special setup of the stimer0 IRQ because 205 * they don't support per-cpu IRQs (such as x86/x64). 206 */ 207void __weak hv_setup_stimer0_handler(void (*handler)(void)) 208{ 209}; 210 211void __weak hv_remove_stimer0_handler(void) 212{ 213}; 214 215#ifdef CONFIG_ACPI 216/* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ 217static int hv_setup_stimer0_irq(void) 218{ 219 int ret; 220 221 ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, 222 ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); 223 if (ret < 0) { 224 pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret); 225 return ret; 226 } 227 stimer0_irq = ret; 228 229 ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr, 230 "Hyper-V stimer0", &stimer0_evt); 231 if (ret) { 232 pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d", 233 stimer0_irq, ret); 234 acpi_unregister_gsi(stimer0_irq); 235 stimer0_irq = -1; 236 } 237 return ret; 238} 239 240static void hv_remove_stimer0_irq(void) 241{ 242 if (stimer0_irq == -1) { 243 hv_remove_stimer0_handler(); 244 } else { 245 free_percpu_irq(stimer0_irq, &stimer0_evt); 246 acpi_unregister_gsi(stimer0_irq); 247 stimer0_irq = -1; 248 } 249} 250#else 251static int hv_setup_stimer0_irq(void) 252{ 253 return 0; 254} 255 256static void hv_remove_stimer0_irq(void) 257{ 258} 259#endif 260 261/* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ 262int hv_stimer_alloc(bool have_percpu_irqs) 263{ 264 int ret; 265 266 /* 267 * Synthetic timers are always available except on old versions of 268 * Hyper-V on x86. In that case, return as error as Linux will use a 269 * clockevent based on emulated LAPIC timer hardware. 270 */ 271 if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) 272 return -EINVAL; 273 274 hv_clock_event = alloc_percpu(struct clock_event_device); 275 if (!hv_clock_event) 276 return -ENOMEM; 277 278 direct_mode_enabled = ms_hyperv.misc_features & 279 HV_STIMER_DIRECT_MODE_AVAILABLE; 280 281 /* 282 * If Direct Mode isn't enabled, the remainder of the initialization 283 * is done later by hv_stimer_legacy_init() 284 */ 285 if (!direct_mode_enabled) 286 return 0; 287 288 if (have_percpu_irqs) { 289 ret = hv_setup_stimer0_irq(); 290 if (ret) 291 goto free_clock_event; 292 } else { 293 hv_setup_stimer0_handler(hv_stimer0_isr); 294 } 295 296 /* 297 * Since we are in Direct Mode, stimer initialization 298 * can be done now with a CPUHP value in the same range 299 * as other clockevent devices. 300 */ 301 ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, 302 "clockevents/hyperv/stimer:starting", 303 hv_stimer_init, hv_stimer_cleanup); 304 if (ret < 0) { 305 hv_remove_stimer0_irq(); 306 goto free_clock_event; 307 } 308 return ret; 309 310free_clock_event: 311 free_percpu(hv_clock_event); 312 hv_clock_event = NULL; 313 return ret; 314} 315EXPORT_SYMBOL_GPL(hv_stimer_alloc); 316 317/* 318 * hv_stimer_legacy_init -- Called from the VMbus driver to handle 319 * the case when Direct Mode is not enabled, and the stimer 320 * must be initialized late in the CPU onlining process. 321 * 322 */ 323void hv_stimer_legacy_init(unsigned int cpu, int sint) 324{ 325 if (direct_mode_enabled) 326 return; 327 328 /* 329 * This function gets called by each vCPU, so setting the 330 * global stimer_message_sint value each time is conceptually 331 * not ideal, but the value passed in is always the same and 332 * it avoids introducing yet another interface into this 333 * clocksource driver just to set the sint in the legacy case. 334 */ 335 stimer0_message_sint = sint; 336 (void)hv_stimer_init(cpu); 337} 338EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); 339 340/* 341 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to 342 * handle the case when Direct Mode is not enabled, and the 343 * stimer must be cleaned up early in the CPU offlining 344 * process. 345 */ 346void hv_stimer_legacy_cleanup(unsigned int cpu) 347{ 348 if (direct_mode_enabled) 349 return; 350 (void)hv_stimer_cleanup(cpu); 351} 352EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); 353 354/* 355 * Do a global cleanup of clockevents for the cases of kexec and 356 * vmbus exit 357 */ 358void hv_stimer_global_cleanup(void) 359{ 360 int cpu; 361 362 /* 363 * hv_stime_legacy_cleanup() will stop the stimer if Direct 364 * Mode is not enabled, and fallback to the LAPIC timer. 365 */ 366 for_each_present_cpu(cpu) { 367 hv_stimer_legacy_cleanup(cpu); 368 } 369 370 if (!hv_clock_event) 371 return; 372 373 if (direct_mode_enabled) { 374 cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); 375 hv_remove_stimer0_irq(); 376 stimer0_irq = -1; 377 } 378 free_percpu(hv_clock_event); 379 hv_clock_event = NULL; 380 381} 382EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 383 384static __always_inline u64 read_hv_clock_msr(void) 385{ 386 /* 387 * Read the partition counter to get the current tick count. This count 388 * is set to 0 when the partition is created and is incremented in 100 389 * nanosecond units. 390 * 391 * Use hv_raw_get_msr() because this function is used from 392 * noinstr. Notable; while HV_MSR_TIME_REF_COUNT is a synthetic 393 * register it doesn't need the GHCB path. 394 */ 395 return hv_raw_get_msr(HV_MSR_TIME_REF_COUNT); 396} 397 398/* 399 * Code and definitions for the Hyper-V clocksources. Two 400 * clocksources are defined: one that reads the Hyper-V defined MSR, and 401 * the other that uses the TSC reference page feature as defined in the 402 * TLFS. The MSR version is for compatibility with old versions of 403 * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 404 */ 405 406static union { 407 struct ms_hyperv_tsc_page page; 408 u8 reserved[PAGE_SIZE]; 409} tsc_pg __bss_decrypted __aligned(PAGE_SIZE); 410 411static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; 412static unsigned long tsc_pfn; 413 414unsigned long hv_get_tsc_pfn(void) 415{ 416 return tsc_pfn; 417} 418EXPORT_SYMBOL_GPL(hv_get_tsc_pfn); 419 420struct ms_hyperv_tsc_page *hv_get_tsc_page(void) 421{ 422 return tsc_page; 423} 424EXPORT_SYMBOL_GPL(hv_get_tsc_page); 425 426static __always_inline u64 read_hv_clock_tsc(void) 427{ 428 u64 cur_tsc, time; 429 430 /* 431 * The Hyper-V Top-Level Function Spec (TLFS), section Timers, 432 * subsection Refererence Counter, guarantees that the TSC and MSR 433 * times are in sync and monotonic. Therefore we can fall back 434 * to the MSR in case the TSC page indicates unavailability. 435 */ 436 if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time)) 437 time = read_hv_clock_msr(); 438 439 return time; 440} 441 442static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) 443{ 444 return read_hv_clock_tsc(); 445} 446 447static u64 noinstr read_hv_sched_clock_tsc(void) 448{ 449 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 450 (NSEC_PER_SEC / HV_CLOCK_HZ); 451} 452 453static void suspend_hv_clock_tsc(struct clocksource *arg) 454{ 455 union hv_reference_tsc_msr tsc_msr; 456 457 /* Disable the TSC page */ 458 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 459 tsc_msr.enable = 0; 460 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 461} 462 463 464static void resume_hv_clock_tsc(struct clocksource *arg) 465{ 466 union hv_reference_tsc_msr tsc_msr; 467 468 /* Re-enable the TSC page */ 469 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 470 tsc_msr.enable = 1; 471 tsc_msr.pfn = tsc_pfn; 472 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 473} 474 475/* 476 * Called during resume from hibernation, from overridden 477 * x86_platform.restore_sched_clock_state routine. This is to adjust offsets 478 * used to calculate time for hv tsc page based sched_clock, to account for 479 * time spent before hibernation. 480 */ 481void hv_adj_sched_clock_offset(u64 offset) 482{ 483 hv_sched_clock_offset -= offset; 484} 485 486#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 487static int hv_cs_enable(struct clocksource *cs) 488{ 489 vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); 490 return 0; 491} 492#endif 493 494static struct clocksource hyperv_cs_tsc = { 495 .name = "hyperv_clocksource_tsc_page", 496 .rating = 500, 497 .read = read_hv_clock_tsc_cs, 498 .mask = CLOCKSOURCE_MASK(64), 499 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 500 .suspend= suspend_hv_clock_tsc, 501 .resume = resume_hv_clock_tsc, 502#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 503 .enable = hv_cs_enable, 504 .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, 505#else 506 .vdso_clock_mode = VDSO_CLOCKMODE_NONE, 507#endif 508}; 509 510static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 511{ 512 return read_hv_clock_msr(); 513} 514 515static struct clocksource hyperv_cs_msr = { 516 .name = "hyperv_clocksource_msr", 517 .rating = 495, 518 .read = read_hv_clock_msr_cs, 519 .mask = CLOCKSOURCE_MASK(64), 520 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 521}; 522 523/* 524 * Reference to pv_ops must be inline so objtool 525 * detection of noinstr violations can work correctly. 526 */ 527#ifdef CONFIG_GENERIC_SCHED_CLOCK 528static __always_inline void hv_setup_sched_clock(void *sched_clock) 529{ 530 /* 531 * We're on an architecture with generic sched clock (not x86/x64). 532 * The Hyper-V sched clock read function returns nanoseconds, not 533 * the normal 100ns units of the Hyper-V synthetic clock. 534 */ 535 sched_clock_register(sched_clock, 64, NSEC_PER_SEC); 536} 537#elif defined CONFIG_PARAVIRT 538static __always_inline void hv_setup_sched_clock(void *sched_clock) 539{ 540 /* We're on x86/x64 *and* using PV ops */ 541 paravirt_set_sched_clock(sched_clock); 542} 543#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ 544static __always_inline void hv_setup_sched_clock(void *sched_clock) {} 545#endif /* CONFIG_GENERIC_SCHED_CLOCK */ 546 547static void __init hv_init_tsc_clocksource(void) 548{ 549 union hv_reference_tsc_msr tsc_msr; 550 551 /* 552 * When running as a guest partition: 553 * 554 * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly 555 * handles frequency and offset changes due to live migration, 556 * pause/resume, and other VM management operations. So lower the 557 * Hyper-V Reference TSC rating, causing the generic TSC to be used. 558 * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference 559 * TSC will be preferred over the virtualized ARM64 arch counter. 560 * 561 * When running as the root partition: 562 * 563 * There is no HV_ACCESS_TSC_INVARIANT feature. Always lower the rating 564 * of the Hyper-V Reference TSC. 565 */ 566 if ((ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) || 567 hv_root_partition()) { 568 hyperv_cs_tsc.rating = 250; 569 hyperv_cs_msr.rating = 245; 570 } 571 572 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 573 return; 574 575 hv_read_reference_counter = read_hv_clock_tsc; 576 577 /* 578 * TSC page mapping works differently in root compared to guest. 579 * - In guest partition the guest PFN has to be passed to the 580 * hypervisor. 581 * - In root partition it's other way around: it has to map the PFN 582 * provided by the hypervisor. 583 * But it can't be mapped right here as it's too early and MMU isn't 584 * ready yet. So, we only set the enable bit here and will remap the 585 * page later in hv_remap_tsc_clocksource(). 586 * 587 * It worth mentioning, that TSC clocksource read function 588 * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when 589 * TSC page is zeroed (which is the case until the PFN is remapped) and 590 * thus TSC clocksource will work even without the real TSC page 591 * mapped. 592 */ 593 tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); 594 if (hv_root_partition()) 595 tsc_pfn = tsc_msr.pfn; 596 else 597 tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); 598 tsc_msr.enable = 1; 599 tsc_msr.pfn = tsc_pfn; 600 hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); 601 602 clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); 603 604 /* 605 * If TSC is invariant, then let it stay as the sched clock since it 606 * will be faster than reading the TSC page. But if not invariant, use 607 * the TSC page so that live migrations across hosts with different 608 * frequencies is handled correctly. 609 */ 610 if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) { 611 hv_sched_clock_offset = hv_read_reference_counter(); 612 hv_setup_sched_clock(read_hv_sched_clock_tsc); 613 } 614} 615 616void __init hv_init_clocksource(void) 617{ 618 /* 619 * Try to set up the TSC page clocksource, then the MSR clocksource. 620 * At least one of these will always be available except on very old 621 * versions of Hyper-V on x86. In that case we won't have a Hyper-V 622 * clocksource, but Linux will still run with a clocksource based 623 * on the emulated PIT or LAPIC timer. 624 * 625 * Never use the MSR clocksource as sched clock. It's too slow. 626 * Better to use the native sched clock as the fallback. 627 */ 628 hv_init_tsc_clocksource(); 629 630 if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) 631 clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); 632} 633 634void __init hv_remap_tsc_clocksource(void) 635{ 636 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 637 return; 638 639 if (!hv_root_partition()) { 640 WARN(1, "%s: attempt to remap TSC page in guest partition\n", 641 __func__); 642 return; 643 } 644 645 tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg), 646 MEMREMAP_WB); 647 if (!tsc_page) 648 pr_err("Failed to remap Hyper-V TSC page.\n"); 649}