/* at v6.4 17 kB view raw */
1// SPDX-License-Identifier: GPL-2.0 2 3/* 4 * Clocksource driver for the synthetic counter and timers 5 * provided by the Hyper-V hypervisor to guest VMs, as described 6 * in the Hyper-V Top Level Functional Spec (TLFS). This driver 7 * is instruction set architecture independent. 8 * 9 * Copyright (C) 2019, Microsoft, Inc. 10 * 11 * Author: Michael Kelley <mikelley@microsoft.com> 12 */ 13 14#include <linux/percpu.h> 15#include <linux/cpumask.h> 16#include <linux/clockchips.h> 17#include <linux/clocksource.h> 18#include <linux/sched_clock.h> 19#include <linux/mm.h> 20#include <linux/cpuhotplug.h> 21#include <linux/interrupt.h> 22#include <linux/irq.h> 23#include <linux/acpi.h> 24#include <linux/hyperv.h> 25#include <clocksource/hyperv_timer.h> 26#include <asm/hyperv-tlfs.h> 27#include <asm/mshyperv.h> 28 29static struct clock_event_device __percpu *hv_clock_event; 30static u64 hv_sched_clock_offset __ro_after_init; 31 32/* 33 * If false, we're using the old mechanism for stimer0 interrupts 34 * where it sends a VMbus message when it expires. The old 35 * mechanism is used when running on older versions of Hyper-V 36 * that don't support Direct Mode. While Hyper-V provides 37 * four stimer's per CPU, Linux uses only stimer0. 38 * 39 * Because Direct Mode does not require processing a VMbus 40 * message, stimer interrupts can be enabled earlier in the 41 * process of booting a CPU, and consistent with when timer 42 * interrupts are enabled for other clocksource drivers. 43 * However, for legacy versions of Hyper-V when Direct Mode 44 * is not enabled, setting up stimer interrupts must be 45 * delayed until VMbus is initialized and can process the 46 * interrupt message. 47 */ 48static bool direct_mode_enabled; 49 50static int stimer0_irq = -1; 51static int stimer0_message_sint; 52static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt); 53 54/* 55 * Common code for stimer0 interrupts coming via Direct Mode or 56 * as a VMbus message. 
57 */ 58void hv_stimer0_isr(void) 59{ 60 struct clock_event_device *ce; 61 62 ce = this_cpu_ptr(hv_clock_event); 63 ce->event_handler(ce); 64} 65EXPORT_SYMBOL_GPL(hv_stimer0_isr); 66 67/* 68 * stimer0 interrupt handler for architectures that support 69 * per-cpu interrupts, which also implies Direct Mode. 70 */ 71static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) 72{ 73 hv_stimer0_isr(); 74 return IRQ_HANDLED; 75} 76 77static int hv_ce_set_next_event(unsigned long delta, 78 struct clock_event_device *evt) 79{ 80 u64 current_tick; 81 82 current_tick = hv_read_reference_counter(); 83 current_tick += delta; 84 hv_set_register(HV_REGISTER_STIMER0_COUNT, current_tick); 85 return 0; 86} 87 88static int hv_ce_shutdown(struct clock_event_device *evt) 89{ 90 hv_set_register(HV_REGISTER_STIMER0_COUNT, 0); 91 hv_set_register(HV_REGISTER_STIMER0_CONFIG, 0); 92 if (direct_mode_enabled && stimer0_irq >= 0) 93 disable_percpu_irq(stimer0_irq); 94 95 return 0; 96} 97 98static int hv_ce_set_oneshot(struct clock_event_device *evt) 99{ 100 union hv_stimer_config timer_cfg; 101 102 timer_cfg.as_uint64 = 0; 103 timer_cfg.enable = 1; 104 timer_cfg.auto_enable = 1; 105 if (direct_mode_enabled) { 106 /* 107 * When it expires, the timer will directly interrupt 108 * on the specified hardware vector/IRQ. 109 */ 110 timer_cfg.direct_mode = 1; 111 timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; 112 if (stimer0_irq >= 0) 113 enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); 114 } else { 115 /* 116 * When it expires, the timer will generate a VMbus message, 117 * to be handled by the normal VMbus interrupt handler. 
118 */ 119 timer_cfg.direct_mode = 0; 120 timer_cfg.sintx = stimer0_message_sint; 121 } 122 hv_set_register(HV_REGISTER_STIMER0_CONFIG, timer_cfg.as_uint64); 123 return 0; 124} 125 126/* 127 * hv_stimer_init - Per-cpu initialization of the clockevent 128 */ 129static int hv_stimer_init(unsigned int cpu) 130{ 131 struct clock_event_device *ce; 132 133 if (!hv_clock_event) 134 return 0; 135 136 ce = per_cpu_ptr(hv_clock_event, cpu); 137 ce->name = "Hyper-V clockevent"; 138 ce->features = CLOCK_EVT_FEAT_ONESHOT; 139 ce->cpumask = cpumask_of(cpu); 140 ce->rating = 1000; 141 ce->set_state_shutdown = hv_ce_shutdown; 142 ce->set_state_oneshot = hv_ce_set_oneshot; 143 ce->set_next_event = hv_ce_set_next_event; 144 145 clockevents_config_and_register(ce, 146 HV_CLOCK_HZ, 147 HV_MIN_DELTA_TICKS, 148 HV_MAX_MAX_DELTA_TICKS); 149 return 0; 150} 151 152/* 153 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent 154 */ 155int hv_stimer_cleanup(unsigned int cpu) 156{ 157 struct clock_event_device *ce; 158 159 if (!hv_clock_event) 160 return 0; 161 162 /* 163 * In the legacy case where Direct Mode is not enabled 164 * (which can only be on x86/64), stimer cleanup happens 165 * relatively early in the CPU offlining process. We 166 * must unbind the stimer-based clockevent device so 167 * that the LAPIC timer can take over until clockevents 168 * are no longer needed in the offlining process. Note 169 * that clockevents_unbind_device() eventually calls 170 * hv_ce_shutdown(). 171 * 172 * The unbind should not be done when Direct Mode is 173 * enabled because we may be on an architecture where 174 * there are no other clockevent devices to fallback to. 
175 */ 176 ce = per_cpu_ptr(hv_clock_event, cpu); 177 if (direct_mode_enabled) 178 hv_ce_shutdown(ce); 179 else 180 clockevents_unbind_device(ce, cpu); 181 182 return 0; 183} 184EXPORT_SYMBOL_GPL(hv_stimer_cleanup); 185 186/* 187 * These placeholders are overridden by arch specific code on 188 * architectures that need special setup of the stimer0 IRQ because 189 * they don't support per-cpu IRQs (such as x86/x64). 190 */ 191void __weak hv_setup_stimer0_handler(void (*handler)(void)) 192{ 193}; 194 195void __weak hv_remove_stimer0_handler(void) 196{ 197}; 198 199#ifdef CONFIG_ACPI 200/* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ 201static int hv_setup_stimer0_irq(void) 202{ 203 int ret; 204 205 ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, 206 ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); 207 if (ret < 0) { 208 pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret); 209 return ret; 210 } 211 stimer0_irq = ret; 212 213 ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr, 214 "Hyper-V stimer0", &stimer0_evt); 215 if (ret) { 216 pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d", 217 stimer0_irq, ret); 218 acpi_unregister_gsi(stimer0_irq); 219 stimer0_irq = -1; 220 } 221 return ret; 222} 223 224static void hv_remove_stimer0_irq(void) 225{ 226 if (stimer0_irq == -1) { 227 hv_remove_stimer0_handler(); 228 } else { 229 free_percpu_irq(stimer0_irq, &stimer0_evt); 230 acpi_unregister_gsi(stimer0_irq); 231 stimer0_irq = -1; 232 } 233} 234#else 235static int hv_setup_stimer0_irq(void) 236{ 237 return 0; 238} 239 240static void hv_remove_stimer0_irq(void) 241{ 242} 243#endif 244 245/* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ 246int hv_stimer_alloc(bool have_percpu_irqs) 247{ 248 int ret; 249 250 /* 251 * Synthetic timers are always available except on old versions of 252 * Hyper-V on x86. In that case, return as error as Linux will use a 253 * clockevent based on emulated LAPIC timer hardware. 
254 */ 255 if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) 256 return -EINVAL; 257 258 hv_clock_event = alloc_percpu(struct clock_event_device); 259 if (!hv_clock_event) 260 return -ENOMEM; 261 262 direct_mode_enabled = ms_hyperv.misc_features & 263 HV_STIMER_DIRECT_MODE_AVAILABLE; 264 265 /* 266 * If Direct Mode isn't enabled, the remainder of the initialization 267 * is done later by hv_stimer_legacy_init() 268 */ 269 if (!direct_mode_enabled) 270 return 0; 271 272 if (have_percpu_irqs) { 273 ret = hv_setup_stimer0_irq(); 274 if (ret) 275 goto free_clock_event; 276 } else { 277 hv_setup_stimer0_handler(hv_stimer0_isr); 278 } 279 280 /* 281 * Since we are in Direct Mode, stimer initialization 282 * can be done now with a CPUHP value in the same range 283 * as other clockevent devices. 284 */ 285 ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, 286 "clockevents/hyperv/stimer:starting", 287 hv_stimer_init, hv_stimer_cleanup); 288 if (ret < 0) { 289 hv_remove_stimer0_irq(); 290 goto free_clock_event; 291 } 292 return ret; 293 294free_clock_event: 295 free_percpu(hv_clock_event); 296 hv_clock_event = NULL; 297 return ret; 298} 299EXPORT_SYMBOL_GPL(hv_stimer_alloc); 300 301/* 302 * hv_stimer_legacy_init -- Called from the VMbus driver to handle 303 * the case when Direct Mode is not enabled, and the stimer 304 * must be initialized late in the CPU onlining process. 305 * 306 */ 307void hv_stimer_legacy_init(unsigned int cpu, int sint) 308{ 309 if (direct_mode_enabled) 310 return; 311 312 /* 313 * This function gets called by each vCPU, so setting the 314 * global stimer_message_sint value each time is conceptually 315 * not ideal, but the value passed in is always the same and 316 * it avoids introducing yet another interface into this 317 * clocksource driver just to set the sint in the legacy case. 
318 */ 319 stimer0_message_sint = sint; 320 (void)hv_stimer_init(cpu); 321} 322EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); 323 324/* 325 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to 326 * handle the case when Direct Mode is not enabled, and the 327 * stimer must be cleaned up early in the CPU offlining 328 * process. 329 */ 330void hv_stimer_legacy_cleanup(unsigned int cpu) 331{ 332 if (direct_mode_enabled) 333 return; 334 (void)hv_stimer_cleanup(cpu); 335} 336EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); 337 338/* 339 * Do a global cleanup of clockevents for the cases of kexec and 340 * vmbus exit 341 */ 342void hv_stimer_global_cleanup(void) 343{ 344 int cpu; 345 346 /* 347 * hv_stime_legacy_cleanup() will stop the stimer if Direct 348 * Mode is not enabled, and fallback to the LAPIC timer. 349 */ 350 for_each_present_cpu(cpu) { 351 hv_stimer_legacy_cleanup(cpu); 352 } 353 354 if (!hv_clock_event) 355 return; 356 357 if (direct_mode_enabled) { 358 cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); 359 hv_remove_stimer0_irq(); 360 stimer0_irq = -1; 361 } 362 free_percpu(hv_clock_event); 363 hv_clock_event = NULL; 364 365} 366EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 367 368/* 369 * Code and definitions for the Hyper-V clocksources. Two 370 * clocksources are defined: one that reads the Hyper-V defined MSR, and 371 * the other that uses the TSC reference page feature as defined in the 372 * TLFS. The MSR version is for compatibility with old versions of 373 * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 
374 */ 375 376static union { 377 struct ms_hyperv_tsc_page page; 378 u8 reserved[PAGE_SIZE]; 379} tsc_pg __aligned(PAGE_SIZE); 380 381static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; 382static unsigned long tsc_pfn; 383 384unsigned long hv_get_tsc_pfn(void) 385{ 386 return tsc_pfn; 387} 388EXPORT_SYMBOL_GPL(hv_get_tsc_pfn); 389 390struct ms_hyperv_tsc_page *hv_get_tsc_page(void) 391{ 392 return tsc_page; 393} 394EXPORT_SYMBOL_GPL(hv_get_tsc_page); 395 396static u64 notrace read_hv_clock_tsc(void) 397{ 398 u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); 399 400 if (current_tick == U64_MAX) 401 current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT); 402 403 return current_tick; 404} 405 406static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) 407{ 408 return read_hv_clock_tsc(); 409} 410 411static u64 notrace read_hv_sched_clock_tsc(void) 412{ 413 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 414 (NSEC_PER_SEC / HV_CLOCK_HZ); 415} 416 417static void suspend_hv_clock_tsc(struct clocksource *arg) 418{ 419 union hv_reference_tsc_msr tsc_msr; 420 421 /* Disable the TSC page */ 422 tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC); 423 tsc_msr.enable = 0; 424 hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64); 425} 426 427 428static void resume_hv_clock_tsc(struct clocksource *arg) 429{ 430 union hv_reference_tsc_msr tsc_msr; 431 432 /* Re-enable the TSC page */ 433 tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC); 434 tsc_msr.enable = 1; 435 tsc_msr.pfn = tsc_pfn; 436 hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64); 437} 438 439#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 440static int hv_cs_enable(struct clocksource *cs) 441{ 442 vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); 443 return 0; 444} 445#endif 446 447static struct clocksource hyperv_cs_tsc = { 448 .name = "hyperv_clocksource_tsc_page", 449 .rating = 500, 450 .read = read_hv_clock_tsc_cs, 451 .mask = CLOCKSOURCE_MASK(64), 452 .flags 
= CLOCK_SOURCE_IS_CONTINUOUS, 453 .suspend= suspend_hv_clock_tsc, 454 .resume = resume_hv_clock_tsc, 455#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 456 .enable = hv_cs_enable, 457 .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, 458#else 459 .vdso_clock_mode = VDSO_CLOCKMODE_NONE, 460#endif 461}; 462 463static u64 notrace read_hv_clock_msr(void) 464{ 465 /* 466 * Read the partition counter to get the current tick count. This count 467 * is set to 0 when the partition is created and is incremented in 468 * 100 nanosecond units. 469 */ 470 return hv_get_register(HV_REGISTER_TIME_REF_COUNT); 471} 472 473static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 474{ 475 return read_hv_clock_msr(); 476} 477 478static u64 notrace read_hv_sched_clock_msr(void) 479{ 480 return (read_hv_clock_msr() - hv_sched_clock_offset) * 481 (NSEC_PER_SEC / HV_CLOCK_HZ); 482} 483 484static struct clocksource hyperv_cs_msr = { 485 .name = "hyperv_clocksource_msr", 486 .rating = 500, 487 .read = read_hv_clock_msr_cs, 488 .mask = CLOCKSOURCE_MASK(64), 489 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 490}; 491 492/* 493 * Reference to pv_ops must be inline so objtool 494 * detection of noinstr violations can work correctly. 495 */ 496#ifdef CONFIG_GENERIC_SCHED_CLOCK 497static __always_inline void hv_setup_sched_clock(void *sched_clock) 498{ 499 /* 500 * We're on an architecture with generic sched clock (not x86/x64). 501 * The Hyper-V sched clock read function returns nanoseconds, not 502 * the normal 100ns units of the Hyper-V synthetic clock. 
503 */ 504 sched_clock_register(sched_clock, 64, NSEC_PER_SEC); 505} 506#elif defined CONFIG_PARAVIRT 507static __always_inline void hv_setup_sched_clock(void *sched_clock) 508{ 509 /* We're on x86/x64 *and* using PV ops */ 510 paravirt_set_sched_clock(sched_clock); 511} 512#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ 513static __always_inline void hv_setup_sched_clock(void *sched_clock) {} 514#endif /* CONFIG_GENERIC_SCHED_CLOCK */ 515 516static bool __init hv_init_tsc_clocksource(void) 517{ 518 union hv_reference_tsc_msr tsc_msr; 519 520 /* 521 * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly 522 * handles frequency and offset changes due to live migration, 523 * pause/resume, and other VM management operations. So lower the 524 * Hyper-V Reference TSC rating, causing the generic TSC to be used. 525 * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference 526 * TSC will be preferred over the virtualized ARM64 arch counter. 527 * While the Hyper-V MSR clocksource won't be used since the 528 * Reference TSC clocksource is present, change its rating as 529 * well for consistency. 530 */ 531 if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { 532 hyperv_cs_tsc.rating = 250; 533 hyperv_cs_msr.rating = 250; 534 } 535 536 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 537 return false; 538 539 hv_read_reference_counter = read_hv_clock_tsc; 540 541 /* 542 * TSC page mapping works differently in root compared to guest. 543 * - In guest partition the guest PFN has to be passed to the 544 * hypervisor. 545 * - In root partition it's other way around: it has to map the PFN 546 * provided by the hypervisor. 547 * But it can't be mapped right here as it's too early and MMU isn't 548 * ready yet. So, we only set the enable bit here and will remap the 549 * page later in hv_remap_tsc_clocksource(). 
550 * 551 * It worth mentioning, that TSC clocksource read function 552 * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when 553 * TSC page is zeroed (which is the case until the PFN is remapped) and 554 * thus TSC clocksource will work even without the real TSC page 555 * mapped. 556 */ 557 tsc_msr.as_uint64 = hv_get_register(HV_REGISTER_REFERENCE_TSC); 558 if (hv_root_partition) 559 tsc_pfn = tsc_msr.pfn; 560 else 561 tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); 562 tsc_msr.enable = 1; 563 tsc_msr.pfn = tsc_pfn; 564 hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr.as_uint64); 565 566 clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); 567 568 hv_sched_clock_offset = hv_read_reference_counter(); 569 hv_setup_sched_clock(read_hv_sched_clock_tsc); 570 571 return true; 572} 573 574void __init hv_init_clocksource(void) 575{ 576 /* 577 * Try to set up the TSC page clocksource. If it succeeds, we're 578 * done. Otherwise, set up the MSR clocksource. At least one of 579 * these will always be available except on very old versions of 580 * Hyper-V on x86. In that case we won't have a Hyper-V 581 * clocksource, but Linux will still run with a clocksource based 582 * on the emulated PIT or LAPIC timer. 
583 */ 584 if (hv_init_tsc_clocksource()) 585 return; 586 587 if (!(ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)) 588 return; 589 590 hv_read_reference_counter = read_hv_clock_msr; 591 clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); 592 593 hv_sched_clock_offset = hv_read_reference_counter(); 594 hv_setup_sched_clock(read_hv_sched_clock_msr); 595} 596 597void __init hv_remap_tsc_clocksource(void) 598{ 599 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 600 return; 601 602 if (!hv_root_partition) { 603 WARN(1, "%s: attempt to remap TSC page in guest partition\n", 604 __func__); 605 return; 606 } 607 608 tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg), 609 MEMREMAP_WB); 610 if (!tsc_page) 611 pr_err("Failed to remap Hyper-V TSC page.\n"); 612}