ptp: Add support for the AMZNC10C 'vmclock' device

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

The vmclock device addresses the problem of live migration with
precision clocks. The tolerances of a hardware counter (e.g. TSC) are
typically around ±50PPM. A guest will use NTP/PTP/PPS to discipline that
counter against an external source of 'real' time, and track the precise
frequency of the counter as it changes with environmental conditions.

When a guest is live migrated, anything it knows about the frequency of
the underlying counter becomes invalid. It may move from a host where
the counter is running at -50PPM of its nominal frequency, to a host where
it runs at +50PPM. There will also be a step change in the value of the
counter, as the correctness of its absolute value at migration is
limited by the accuracy of the source and destination host's time
synchronization.

In its simplest form, the device merely advertises a 'disruption_marker'
which indicates that the guest should throw away any NTP synchronization
it thinks it has, and start again.

Because the shared memory region can be exposed all the way to userspace
through the /dev/vmclock0 node, applications can still use time from a
fast vDSO 'system call', and check the disruption marker to be sure that
their timestamp is indeed truthful.

The structure also allows for the precise time, as known by the host, to
be exposed directly to guests so that they don't have to wait for NTP to
resync from scratch. The PTP driver consumes this information if present.
Like the KVM PTP clock, this PTP driver can convert TSC-based cross
timestamps into KVM clock values. Unlike the KVM PTP clock, it does so
only when such is actually helpful.

The values and fields are based on the nascent virtio-rtc specification,
and the intent is that a version (hopefully precisely this version) of
this structure will be included as an optional part of that spec. In the
meantime, this driver supports the simple ACPI form of the device which
is being shipped in certain commercial hypervisors (and submitted for
inclusion in QEMU).

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

David Woodhouse and committed by

David S. Miller 2 years ago 20503272 f31fd0b3

+818

5 changed files

expand all

MAINTAINERS

drivers

ptp

Kconfig

Makefile

ptp_vmclock.c

include

uapi

linux

vmclock-abi.h

MAINTAINERS

··· 18683 18683 F: drivers/ptp/ptp_vclock.c 18684 18684 F: net/ethtool/phc_vclocks.c 18685 18685 18686 + PTP VMCLOCK SUPPORT 18687 + M: David Woodhouse <dwmw2@infradead.org> 18688 + L: netdev@vger.kernel.org 18689 + S: Maintained 18690 + F: drivers/ptp/ptp_vmclock.c 18691 + F: include/uapi/linux/vmclock-abi.h 18692 + 18686 18693 PTRACE SUPPORT 18687 18694 M: Oleg Nesterov <oleg@redhat.com> 18688 18695 S: Maintained

+13

drivers/ptp/Kconfig

··· 131 131 To compile this driver as a module, choose M here: the module 132 132 will be called ptp_kvm. 133 133 134 + config PTP_1588_CLOCK_VMCLOCK 135 + tristate "Virtual machine PTP clock" 136 + depends on X86_TSC || ARM_ARCH_TIMER 137 + depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 138 + default y 139 + help 140 + This driver adds support for using a virtual precision clock 141 + advertised by the hypervisor. This clock is only useful in virtual 142 + machines where such a device is present. 143 + 144 + To compile this driver as a module, choose M here: the module 145 + will be called ptp_vmclock. 146 + 134 147 config PTP_1588_CLOCK_IDT82P33 135 148 tristate "IDT 82P33xxx PTP clock" 136 149 depends on PTP_1588_CLOCK && I2C

drivers/ptp/Makefile

··· 11 11 obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o 12 12 obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o 13 13 obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o 14 + obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK) += ptp_vmclock.o 14 15 obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp-qoriq.o 15 16 ptp-qoriq-y += ptp_qoriq.o 16 17 ptp-qoriq-$(CONFIG_DEBUG_FS) += ptp_qoriq_debugfs.o

+615

drivers/ptp/ptp_vmclock.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Virtual PTP 1588 clock for use with LM-safe VMclock device. 4 + * 5 + * Copyright © 2024 Amazon.com, Inc. or its affiliates. 6 + */ 7 + 8 + #include <linux/acpi.h> 9 + #include <linux/device.h> 10 + #include <linux/err.h> 11 + #include <linux/file.h> 12 + #include <linux/fs.h> 13 + #include <linux/init.h> 14 + #include <linux/kernel.h> 15 + #include <linux/miscdevice.h> 16 + #include <linux/mm.h> 17 + #include <linux/module.h> 18 + #include <linux/platform_device.h> 19 + #include <linux/slab.h> 20 + 21 + #include <uapi/linux/vmclock-abi.h> 22 + 23 + #include <linux/ptp_clock_kernel.h> 24 + 25 + #ifdef CONFIG_X86 26 + #include <asm/pvclock.h> 27 + #include <asm/kvmclock.h> 28 + #endif 29 + 30 + #ifdef CONFIG_KVM_GUEST 31 + #define SUPPORT_KVMCLOCK 32 + #endif 33 + 34 + static DEFINE_IDA(vmclock_ida); 35 + 36 + ACPI_MODULE_NAME("vmclock"); 37 + 38 + struct vmclock_state { 39 + struct resource res; 40 + struct vmclock_abi *clk; 41 + struct miscdevice miscdev; 42 + struct ptp_clock_info ptp_clock_info; 43 + struct ptp_clock *ptp_clock; 44 + enum clocksource_ids cs_id, sys_cs_id; 45 + int index; 46 + char *name; 47 + }; 48 + 49 + #define VMCLOCK_MAX_WAIT ms_to_ktime(100) 50 + 51 + /* Require at least the flags field to be present. All else can be optional. */ 52 + #define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad) 53 + 54 + #define VMCLOCK_FIELD_PRESENT(_c, _f) \ 55 + (le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \ 56 + sizeof((_c)->_f))) 57 + 58 + /* 59 + * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64 60 + * and add the fractional second part of the reference time. 61 + * 62 + * The result is a 128-bit value, the top 64 bits of which are seconds, and 63 + * the low 64 bits are (seconds >> 64). 
64 + */ 65 + static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, 66 + uint64_t period, uint8_t shift, 67 + uint64_t frac_sec) 68 + { 69 + unsigned __int128 res = (unsigned __int128)delta * period; 70 + 71 + res >>= shift; 72 + res += frac_sec; 73 + *res_hi = res >> 64; 74 + return (uint64_t)res; 75 + } 76 + 77 + static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) 78 + { 79 + if (likely(clk->time_type == VMCLOCK_TIME_UTC)) 80 + return true; 81 + 82 + if (clk->time_type == VMCLOCK_TIME_TAI && 83 + (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) { 84 + if (sec) 85 + *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec); 86 + return true; 87 + } 88 + return false; 89 + } 90 + 91 + static int vmclock_get_crosststamp(struct vmclock_state *st, 92 + struct ptp_system_timestamp *sts, 93 + struct system_counterval_t *system_counter, 94 + struct timespec64 *tspec) 95 + { 96 + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); 97 + struct system_time_snapshot systime_snapshot; 98 + uint64_t cycle, delta, seq, frac_sec; 99 + 100 + #ifdef CONFIG_X86 101 + /* 102 + * We'd expect the hypervisor to know this and to report the clock 103 + * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid. 104 + */ 105 + if (check_tsc_unstable()) 106 + return -EINVAL; 107 + #endif 108 + 109 + while (1) { 110 + seq = le32_to_cpu(st->clk->seq_count) & ~1ULL; 111 + 112 + /* 113 + * This pairs with a write barrier in the hypervisor 114 + * which populates this structure. 115 + */ 116 + virt_rmb(); 117 + 118 + if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE) 119 + return -EINVAL; 120 + 121 + /* 122 + * When invoked for gettimex64(), fill in the pre/post system 123 + * times. The simple case is when system time is based on the 124 + * same counter as st->cs_id, in which case all three times 125 + * will be derived from the *same* counter value. 
126 + * 127 + * If the system isn't using the same counter, then the value 128 + * from ktime_get_snapshot() will still be used as pre_ts, and 129 + * ptp_read_system_postts() is called to populate postts after 130 + * calling get_cycles(). 131 + * 132 + * The conversion to timespec64 happens further down, outside 133 + * the seq_count loop. 134 + */ 135 + if (sts) { 136 + ktime_get_snapshot(&systime_snapshot); 137 + if (systime_snapshot.cs_id == st->cs_id) { 138 + cycle = systime_snapshot.cycles; 139 + } else { 140 + cycle = get_cycles(); 141 + ptp_read_system_postts(sts); 142 + } 143 + } else { 144 + cycle = get_cycles(); 145 + } 146 + 147 + delta = cycle - le64_to_cpu(st->clk->counter_value); 148 + 149 + frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta, 150 + le64_to_cpu(st->clk->counter_period_frac_sec), 151 + st->clk->counter_period_shift, 152 + le64_to_cpu(st->clk->time_frac_sec)); 153 + tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64); 154 + tspec->tv_sec += le64_to_cpu(st->clk->time_sec); 155 + 156 + if (!tai_adjust(st->clk, &tspec->tv_sec)) 157 + return -EINVAL; 158 + 159 + /* 160 + * This pairs with a write barrier in the hypervisor 161 + * which populates this structure. 162 + */ 163 + virt_rmb(); 164 + if (seq == le32_to_cpu(st->clk->seq_count)) 165 + break; 166 + 167 + if (ktime_after(ktime_get(), deadline)) 168 + return -ETIMEDOUT; 169 + } 170 + 171 + if (system_counter) { 172 + system_counter->cycles = cycle; 173 + system_counter->cs_id = st->cs_id; 174 + } 175 + 176 + if (sts) { 177 + sts->pre_ts = ktime_to_timespec64(systime_snapshot.real); 178 + if (systime_snapshot.cs_id == st->cs_id) 179 + sts->post_ts = sts->pre_ts; 180 + } 181 + 182 + return 0; 183 + } 184 + 185 + #ifdef SUPPORT_KVMCLOCK 186 + /* 187 + * In the case where the system is using the KVM clock for timekeeping, convert 188 + * the TSC value into a KVM clock time in order to return a paired reading that 189 + * get_device_system_crosststamp() can cope with. 
190 + */ 191 + static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st, 192 + struct ptp_system_timestamp *sts, 193 + struct system_counterval_t *system_counter, 194 + struct timespec64 *tspec) 195 + { 196 + struct pvclock_vcpu_time_info *pvti = this_cpu_pvti(); 197 + unsigned int pvti_ver; 198 + int ret; 199 + 200 + preempt_disable_notrace(); 201 + 202 + do { 203 + pvti_ver = pvclock_read_begin(pvti); 204 + 205 + ret = vmclock_get_crosststamp(st, sts, system_counter, tspec); 206 + if (ret) 207 + break; 208 + 209 + system_counter->cycles = __pvclock_read_cycles(pvti, 210 + system_counter->cycles); 211 + system_counter->cs_id = CSID_X86_KVM_CLK; 212 + 213 + /* 214 + * This retry should never really happen; if the TSC is 215 + * stable and reliable enough across vCPUS that it is sane 216 + * for the hypervisor to expose a VMCLOCK device which uses 217 + * it as the reference counter, then the KVM clock sohuld be 218 + * in 'master clock mode' and basically never changed. But 219 + * the KVM clock is a fickle and often broken thing, so do 220 + * it "properly" just in case. 
221 + */ 222 + } while (pvclock_read_retry(pvti, pvti_ver)); 223 + 224 + preempt_enable_notrace(); 225 + 226 + return ret; 227 + } 228 + #endif 229 + 230 + static int ptp_vmclock_get_time_fn(ktime_t *device_time, 231 + struct system_counterval_t *system_counter, 232 + void *ctx) 233 + { 234 + struct vmclock_state *st = ctx; 235 + struct timespec64 tspec; 236 + int ret; 237 + 238 + #ifdef SUPPORT_KVMCLOCK 239 + if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK) 240 + ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter, 241 + &tspec); 242 + else 243 + #endif 244 + ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec); 245 + 246 + if (!ret) 247 + *device_time = timespec64_to_ktime(tspec); 248 + 249 + return ret; 250 + } 251 + 252 + static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp, 253 + struct system_device_crosststamp *xtstamp) 254 + { 255 + struct vmclock_state *st = container_of(ptp, struct vmclock_state, 256 + ptp_clock_info); 257 + int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st, 258 + NULL, xtstamp); 259 + #ifdef SUPPORT_KVMCLOCK 260 + /* 261 + * On x86, the KVM clock may be used for the system time. We can 262 + * actually convert a TSC reading to that, and return a paired 263 + * timestamp that get_device_system_crosststamp() *can* handle. 
264 + */ 265 + if (ret == -ENODEV) { 266 + struct system_time_snapshot systime_snapshot; 267 + 268 + ktime_get_snapshot(&systime_snapshot); 269 + 270 + if (systime_snapshot.cs_id == CSID_X86_TSC || 271 + systime_snapshot.cs_id == CSID_X86_KVM_CLK) { 272 + WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id); 273 + ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, 274 + st, NULL, xtstamp); 275 + } 276 + } 277 + #endif 278 + return ret; 279 + } 280 + 281 + /* 282 + * PTP clock operations 283 + */ 284 + 285 + static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta) 286 + { 287 + return -EOPNOTSUPP; 288 + } 289 + 290 + static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta) 291 + { 292 + return -EOPNOTSUPP; 293 + } 294 + 295 + static int ptp_vmclock_settime(struct ptp_clock_info *ptp, 296 + const struct timespec64 *ts) 297 + { 298 + return -EOPNOTSUPP; 299 + } 300 + 301 + static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, 302 + struct ptp_system_timestamp *sts) 303 + { 304 + struct vmclock_state *st = container_of(ptp, struct vmclock_state, 305 + ptp_clock_info); 306 + 307 + return vmclock_get_crosststamp(st, sts, NULL, ts); 308 + } 309 + 310 + static int ptp_vmclock_enable(struct ptp_clock_info *ptp, 311 + struct ptp_clock_request *rq, int on) 312 + { 313 + return -EOPNOTSUPP; 314 + } 315 + 316 + static const struct ptp_clock_info ptp_vmclock_info = { 317 + .owner = THIS_MODULE, 318 + .max_adj = 0, 319 + .n_ext_ts = 0, 320 + .n_pins = 0, 321 + .pps = 0, 322 + .adjfine = ptp_vmclock_adjfine, 323 + .adjtime = ptp_vmclock_adjtime, 324 + .gettimex64 = ptp_vmclock_gettimex, 325 + .settime64 = ptp_vmclock_settime, 326 + .enable = ptp_vmclock_enable, 327 + .getcrosststamp = ptp_vmclock_getcrosststamp, 328 + }; 329 + 330 + static struct ptp_clock *vmclock_ptp_register(struct device *dev, 331 + struct vmclock_state *st) 332 + { 333 + enum clocksource_ids cs_id; 334 + 335 + if (IS_ENABLED(CONFIG_ARM64) && 336 + 
st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) { 337 + /* Can we check it's the virtual counter? */ 338 + cs_id = CSID_ARM_ARCH_COUNTER; 339 + } else if (IS_ENABLED(CONFIG_X86) && 340 + st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) { 341 + cs_id = CSID_X86_TSC; 342 + } else { 343 + return NULL; 344 + } 345 + 346 + /* Only UTC, or TAI with offset */ 347 + if (!tai_adjust(st->clk, NULL)) { 348 + dev_info(dev, "vmclock does not provide unambiguous UTC\n"); 349 + return NULL; 350 + } 351 + 352 + st->sys_cs_id = cs_id; 353 + st->cs_id = cs_id; 354 + st->ptp_clock_info = ptp_vmclock_info; 355 + strscpy(st->ptp_clock_info.name, st->name); 356 + 357 + return ptp_clock_register(&st->ptp_clock_info, dev); 358 + } 359 + 360 + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) 361 + { 362 + struct vmclock_state *st = container_of(fp->private_data, 363 + struct vmclock_state, miscdev); 364 + 365 + if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) 366 + return -EROFS; 367 + 368 + if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff) 369 + return -EINVAL; 370 + 371 + if (io_remap_pfn_range(vma, vma->vm_start, 372 + st->res.start >> PAGE_SHIFT, PAGE_SIZE, 373 + vma->vm_page_prot)) 374 + return -EAGAIN; 375 + 376 + return 0; 377 + } 378 + 379 + static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, 380 + size_t count, loff_t *ppos) 381 + { 382 + struct vmclock_state *st = container_of(fp->private_data, 383 + struct vmclock_state, miscdev); 384 + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); 385 + size_t max_count; 386 + uint32_t seq; 387 + 388 + if (*ppos >= PAGE_SIZE) 389 + return 0; 390 + 391 + max_count = PAGE_SIZE - *ppos; 392 + if (count > max_count) 393 + count = max_count; 394 + 395 + while (1) { 396 + seq = le32_to_cpu(st->clk->seq_count) & ~1U; 397 + /* Pairs with hypervisor wmb */ 398 + virt_rmb(); 399 + 400 + if (copy_to_user(buf, ((char *)st->clk) + *ppos, count)) 401 + return -EFAULT; 402 + 403 + 
/* Pairs with hypervisor wmb */ 404 + virt_rmb(); 405 + if (seq == le32_to_cpu(st->clk->seq_count)) 406 + break; 407 + 408 + if (ktime_after(ktime_get(), deadline)) 409 + return -ETIMEDOUT; 410 + } 411 + 412 + *ppos += count; 413 + return count; 414 + } 415 + 416 + static const struct file_operations vmclock_miscdev_fops = { 417 + .mmap = vmclock_miscdev_mmap, 418 + .read = vmclock_miscdev_read, 419 + }; 420 + 421 + /* module operations */ 422 + 423 + static void vmclock_remove(struct platform_device *pdev) 424 + { 425 + struct device *dev = &pdev->dev; 426 + struct vmclock_state *st = dev_get_drvdata(dev); 427 + 428 + if (st->ptp_clock) 429 + ptp_clock_unregister(st->ptp_clock); 430 + 431 + if (st->miscdev.minor != MISC_DYNAMIC_MINOR) 432 + misc_deregister(&st->miscdev); 433 + } 434 + 435 + static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) 436 + { 437 + struct vmclock_state *st = data; 438 + struct resource_win win; 439 + struct resource *res = &win.res; 440 + 441 + if (ares->type == ACPI_RESOURCE_TYPE_END_TAG) 442 + return AE_OK; 443 + 444 + /* There can be only one */ 445 + if (resource_type(&st->res) == IORESOURCE_MEM) 446 + return AE_ERROR; 447 + 448 + if (acpi_dev_resource_memory(ares, res) || 449 + acpi_dev_resource_address_space(ares, &win)) { 450 + 451 + if (resource_type(res) != IORESOURCE_MEM || 452 + resource_size(res) < sizeof(st->clk)) 453 + return AE_ERROR; 454 + 455 + st->res = *res; 456 + return AE_OK; 457 + } 458 + 459 + return AE_ERROR; 460 + } 461 + 462 + static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) 463 + { 464 + struct acpi_device *adev = ACPI_COMPANION(dev); 465 + acpi_status status; 466 + 467 + /* 468 + * This should never happen as this function is only called when 469 + * has_acpi_companion(dev) is true, but the logic is sufficiently 470 + * complex that Coverity can't see the tautology. 
471 + */ 472 + if (!adev) 473 + return -ENODEV; 474 + 475 + status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, 476 + vmclock_acpi_resources, st); 477 + if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) { 478 + dev_err(dev, "failed to get resources\n"); 479 + return -ENODEV; 480 + } 481 + 482 + return 0; 483 + } 484 + 485 + static void vmclock_put_idx(void *data) 486 + { 487 + struct vmclock_state *st = data; 488 + 489 + ida_free(&vmclock_ida, st->index); 490 + } 491 + 492 + static int vmclock_probe(struct platform_device *pdev) 493 + { 494 + struct device *dev = &pdev->dev; 495 + struct vmclock_state *st; 496 + int ret; 497 + 498 + st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL); 499 + if (!st) 500 + return -ENOMEM; 501 + 502 + if (has_acpi_companion(dev)) 503 + ret = vmclock_probe_acpi(dev, st); 504 + else 505 + ret = -EINVAL; /* Only ACPI for now */ 506 + 507 + if (ret) { 508 + dev_info(dev, "Failed to obtain physical address: %d\n", ret); 509 + goto out; 510 + } 511 + 512 + if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) { 513 + dev_info(dev, "Region too small (0x%llx)\n", 514 + resource_size(&st->res)); 515 + ret = -EINVAL; 516 + goto out; 517 + } 518 + st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res), 519 + MEMREMAP_WB | MEMREMAP_DEC); 520 + if (IS_ERR(st->clk)) { 521 + ret = PTR_ERR(st->clk); 522 + dev_info(dev, "failed to map shared memory\n"); 523 + st->clk = NULL; 524 + goto out; 525 + } 526 + 527 + if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || 528 + le32_to_cpu(st->clk->size) > resource_size(&st->res) || 529 + le16_to_cpu(st->clk->version) != 1) { 530 + dev_info(dev, "vmclock magic fields invalid\n"); 531 + ret = -EINVAL; 532 + goto out; 533 + } 534 + 535 + ret = ida_alloc(&vmclock_ida, GFP_KERNEL); 536 + if (ret < 0) 537 + goto out; 538 + 539 + st->index = ret; 540 + ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); 541 + if (ret) 542 + goto out; 543 + 544 + st->name = 
devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index); 545 + if (!st->name) { 546 + ret = -ENOMEM; 547 + goto out; 548 + } 549 + 550 + /* 551 + * If the structure is big enough, it can be mapped to userspace. 552 + * Theoretically a guest OS even using larger pages could still 553 + * use 4KiB PTEs to map smaller MMIO regions like this, but let's 554 + * cross that bridge if/when we come to it. 555 + */ 556 + if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) { 557 + st->miscdev.minor = MISC_DYNAMIC_MINOR; 558 + st->miscdev.fops = &vmclock_miscdev_fops; 559 + st->miscdev.name = st->name; 560 + 561 + ret = misc_register(&st->miscdev); 562 + if (ret) 563 + goto out; 564 + } 565 + 566 + /* If there is valid clock information, register a PTP clock */ 567 + if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) { 568 + /* Can return a silent NULL, or an error. */ 569 + st->ptp_clock = vmclock_ptp_register(dev, st); 570 + if (IS_ERR(st->ptp_clock)) { 571 + ret = PTR_ERR(st->ptp_clock); 572 + st->ptp_clock = NULL; 573 + vmclock_remove(pdev); 574 + goto out; 575 + } 576 + } 577 + 578 + if (!st->miscdev.minor && !st->ptp_clock) { 579 + /* Neither miscdev nor PTP registered */ 580 + dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n"); 581 + ret = -ENODEV; 582 + goto out; 583 + } 584 + 585 + dev_info(dev, "%s: registered %s%s%s\n", st->name, 586 + st->miscdev.minor ? "miscdev" : "", 587 + (st->miscdev.minor && st->ptp_clock) ? ", " : "", 588 + st->ptp_clock ? 
"PTP" : ""); 589 + 590 + dev_set_drvdata(dev, st); 591 + 592 + out: 593 + return ret; 594 + } 595 + 596 + static const struct acpi_device_id vmclock_acpi_ids[] = { 597 + { "AMZNC10C", 0 }, 598 + {} 599 + }; 600 + MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); 601 + 602 + static struct platform_driver vmclock_platform_driver = { 603 + .probe = vmclock_probe, 604 + .remove_new = vmclock_remove, 605 + .driver = { 606 + .name = "vmclock", 607 + .acpi_match_table = vmclock_acpi_ids, 608 + }, 609 + }; 610 + 611 + module_platform_driver(vmclock_platform_driver) 612 + 613 + MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>"); 614 + MODULE_DESCRIPTION("PTP clock using VMCLOCK"); 615 + MODULE_LICENSE("GPL");

+182

include/uapi/linux/vmclock-abi.h

/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */

/*
 * This structure provides a vDSO-style clock to VM guests, exposing the
 * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch
 * counter, etc.) and real time. It is designed to address the problem of
 * live migration, which other clock enlightenments do not.
 *
 * When a guest is live migrated, this affects the clock in two ways.
 *
 * First, even between identical hosts the actual frequency of the underlying
 * counter will change within the tolerances of its specification (typically
 * ±50PPM, or 4 seconds a day). This frequency also varies over time on the
 * same host, but can be tracked by NTP as it generally varies slowly. With
 * live migration there is a step change in the frequency, with no warning.
 *
 * Second, there may be a step change in the value of the counter itself, as
 * its accuracy is limited by the precision of the NTP synchronization on the
 * source and destination hosts.
 *
 * So any calibration (NTP, PTP, etc.) which the guest has done on the source
 * host before migration is invalid, and needs to be redone on the new host.
 *
 * In its most basic mode, this structure provides only an indication to the
 * guest that live migration has occurred. This allows the guest to know that
 * its clock is invalid and take remedial action. For applications that need
 * reliable accurate timestamps (e.g. distributed databases), the structure
 * can be mapped all the way to userspace. This allows the application to see
 * directly for itself that the clock is disrupted and take appropriate
 * action, even when using a vDSO-style method to get the time instead of a
 * system call.
 *
 * In its more advanced mode, this structure can also be used to expose the
 * precise relationship of the CPU counter to real time, as calibrated by the
 * host. This means that userspace applications can have accurate time
 * immediately after live migration, rather than having to pause operations
 * and wait for NTP to recover. This mode does, of course, rely on the
 * counter being reliable and consistent across CPUs.
 *
 * Note that this must be true UTC, never with smeared leap seconds. If a
 * guest wishes to construct a smeared clock, it can do so. Presenting a
 * smeared clock through this interface would be problematic because it
 * actually messes with the apparent counter *period*. A linear smearing
 * of 1 ms per second would effectively tweak the counter period by 1000PPM
 * at the start/end of the smearing period, while a sinusoidal smear would
 * basically be impossible to represent.
 *
 * This structure is offered with the intent that it be adopted into the
 * nascent virtio-rtc standard, as a virtio-rtc that does not address the live
 * migration problem seems a little less than fit for purpose. For that
 * reason, certain fields use precisely the same numeric definitions as in
 * the virtio-rtc proposal. The structure can also be exposed through an ACPI
 * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for
 * the fact that it uses a real _CRS to convey the address of the structure
 * (which should be a full page, to allow for mapping directly to userspace).
 */

#ifndef __VMCLOCK_ABI_H__
#define __VMCLOCK_ABI_H__

#include <linux/types.h>

/*
 * Layout of the shared memory region. Multi-byte fields are little-endian
 * (__le16/__le32/__le64). The #defines interleaved with the fields give the
 * values each preceding field may take.
 */
struct vmclock_abi {
	/* CONSTANT FIELDS */
	__le32 magic;
#define VMCLOCK_MAGIC	0x4b4c4356 /* "VCLK" */
	__le32 size;		/* Size of region containing this structure */
	__le16 version;	/* 1 */
	__u8 counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */
#define VMCLOCK_COUNTER_ARM_VCNT	0
#define VMCLOCK_COUNTER_X86_TSC		1
#define VMCLOCK_COUNTER_INVALID		0xff
	__u8 time_type; /* Matches VIRTIO_RTC_TYPE_xxx */
#define VMCLOCK_TIME_UTC			0	/* Since 1970-01-01 00:00:00z */
#define VMCLOCK_TIME_TAI			1	/* Since 1970-01-01 00:00:00z */
#define VMCLOCK_TIME_MONOTONIC			2	/* Since undefined epoch */
#define VMCLOCK_TIME_INVALID_SMEARED		3	/* Not supported */
#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED	4	/* Not supported */

	/* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */
	__le32 seq_count;	/* Low bit means an update is in progress */
	/*
	 * This field changes to another non-repeating value when the CPU
	 * counter is disrupted, for example on live migration. This lets
	 * the guest know that it should discard any calibration it has
	 * performed of the counter against external sources (NTP/PTP/etc.).
	 */
	__le64 disruption_marker;
	/* Bitmask of the VMCLOCK_FLAG_xxx values defined below */
	__le64 flags;
	/* Indicates that the tai_offset_sec field is valid */
#define VMCLOCK_FLAG_TAI_OFFSET_VALID		(1 << 0)
	/*
	 * Optionally used to notify guests of pending maintenance events.
	 * A guest which provides latency-sensitive services may wish to
	 * remove itself from service if an event is coming up. Two flags
	 * indicate the approximate imminence of the event.
	 */
#define VMCLOCK_FLAG_DISRUPTION_SOON		(1 << 1) /* About a day */
#define VMCLOCK_FLAG_DISRUPTION_IMMINENT	(1 << 2) /* About an hour */
	/*
	 * NOTE(review): the four *_VALID flags below presumably qualify the
	 * correspondingly named esterror/maxerror fields further down;
	 * confirm against the specification.
	 */
#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID	(1 << 3)
#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID	(1 << 4)
#define VMCLOCK_FLAG_TIME_ESTERROR_VALID	(1 << 5)
#define VMCLOCK_FLAG_TIME_MAXERROR_VALID	(1 << 6)
	/*
	 * If the MONOTONIC flag is set then (other than leap seconds) it is
	 * guaranteed that the time calculated according to this structure at
	 * any given moment shall never appear to be later than the time
	 * calculated via the structure at any *later* moment.
	 *
	 * In particular, a timestamp based on a counter reading taken
	 * immediately after setting the low bit of seq_count (and the
	 * associated memory barrier), using the previously-valid time and
	 * period fields, shall never be later than a timestamp based on
	 * a counter reading taken immediately before *clearing* the low
	 * bit again after the update, using the about-to-be-valid fields.
	 */
#define VMCLOCK_FLAG_TIME_MONOTONIC		(1 << 7)

	__u8 pad[2];
	/* One of the VMCLOCK_STATUS_xxx values below */
	__u8 clock_status;
#define VMCLOCK_STATUS_UNKNOWN		0
#define VMCLOCK_STATUS_INITIALIZING	1
#define VMCLOCK_STATUS_SYNCHRONIZED	2
#define VMCLOCK_STATUS_FREERUNNING	3
#define VMCLOCK_STATUS_UNRELIABLE	4

	/*
	 * The time exposed through this device is never smeared. This field
	 * corresponds to the 'subtype' field in virtio-rtc, which indicates
	 * the smearing method. However in this case it provides a *hint* to
	 * the guest operating system, such that *if* the guest OS wants to
	 * provide its users with an alternative clock which does not follow
	 * UTC, it may do so in a fashion consistent with the other systems
	 * in the nearby environment.
	 */
	__u8 leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */
#define VMCLOCK_SMEARING_STRICT		0
#define VMCLOCK_SMEARING_NOON_LINEAR	1
#define VMCLOCK_SMEARING_UTC_SLS	2
	__le16 tai_offset_sec; /* Actually two's complement signed */
	/* One of the VMCLOCK_LEAP_xxx values below */
	__u8 leap_indicator;
	/*
	 * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined
	 * in the current draft of virtio-rtc, but since smearing cannot be
	 * used with the shared memory device, some values are not used.
	 *
	 * The _POST_POS and _POST_NEG values allow the guest to perform
	 * its own smearing during the day or so after a leap second when
	 * such smearing may need to continue being applied for a leap
	 * second which is now theoretically "historical".
	 */
#define VMCLOCK_LEAP_NONE	0x00	/* No known nearby leap second */
#define VMCLOCK_LEAP_PRE_POS	0x01	/* Positive leap second at EOM */
#define VMCLOCK_LEAP_PRE_NEG	0x02	/* Negative leap second at EOM */
#define VMCLOCK_LEAP_POS	0x03	/* Set during 23:59:60 second */
#define VMCLOCK_LEAP_POST_POS	0x04
#define VMCLOCK_LEAP_POST_NEG	0x05

	/* Bit shift for counter_period_frac_sec and its error rate */
	__u8 counter_period_shift;
	/*
	 * Paired values of counter and UTC at a given point in time.
	 */
	__le64 counter_value;
	/*
	 * Counter period, and error margin of same. The unit of these
	 * fields is 1/2^(64 + counter_period_shift) of a second.
	 */
	__le64 counter_period_frac_sec;
	__le64 counter_period_esterror_rate_frac_sec;
	__le64 counter_period_maxerror_rate_frac_sec;

	/*
	 * Time according to time_type field above.
	 */
	__le64 time_sec;		/* Seconds since time_type epoch */
	__le64 time_frac_sec;		/* Units of 1/2^64 of a second */
	__le64 time_esterror_nanosec;
	__le64 time_maxerror_nanosec;
};

#endif /*  __VMCLOCK_ABI_H__ */