Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: x86: Expose TSC offset controls to userspace

To date, VMM-directed TSC synchronization and migration has been a bit
messy. KVM has some baked-in heuristics around TSC writes to infer if
the VMM is attempting to synchronize. This is problematic, as it depends
on host userspace writing to the guest's TSC within 1 second of the last
write.

A much cleaner approach to configuring the guest's views of the TSC is to
simply migrate the TSC offset for every vCPU. Offsets are idempotent,
and thus not subject to change depending on when the VMM actually
reads/writes values from/to KVM. The VMM can then read the TSC once with
KVM_GET_CLOCK to capture a (realtime, host_tsc) pair at the instant when
the guest is paused.

Cc: David Matlack <dmatlack@google.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Oliver Upton <oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210916181538.968978-8-oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Authored by Oliver Upton and committed by Paolo Bonzini
828ca896 58d4277b

+178
+57
Documentation/virt/kvm/devices/vcpu.rst
··· 161 161 base address must be 64 byte aligned and exist within a valid guest memory 162 162 region. See Documentation/virt/kvm/arm/pvtime.rst for more information 163 163 including the layout of the stolen time structure. 164 + 165 + 4. GROUP: KVM_VCPU_TSC_CTRL 166 + =========================== 167 + 168 + :Architectures: x86 169 + 170 + 4.1 ATTRIBUTE: KVM_VCPU_TSC_OFFSET 171 + 172 + :Parameters: 64-bit unsigned TSC offset 173 + 174 + Returns: 175 + 176 + ======= ====================================== 177 + -EFAULT Error reading/writing the provided 178 + parameter address. 179 + -ENXIO Attribute not supported 180 + ======= ====================================== 181 + 182 + Specifies the guest's TSC offset relative to the host's TSC. The guest's 183 + TSC is then derived by the following equation: 184 + 185 + guest_tsc = host_tsc + KVM_VCPU_TSC_OFFSET 186 + 187 + This attribute is useful for the precise migration of a guest's TSC. The 188 + following describes a possible algorithm to use for the migration of a 189 + guest's TSC: 190 + 191 + From the source VMM process: 192 + 193 + 1. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_0), 194 + kvmclock nanoseconds (k_0), and realtime nanoseconds (r_0). 195 + 196 + 2. Read the KVM_VCPU_TSC_OFFSET attribute for every vCPU to record the 197 + guest TSC offset (off_n). 198 + 199 + 3. Invoke the KVM_GET_TSC_KHZ ioctl to record the frequency of the 200 + guest's TSC (freq). 201 + 202 + From the destination VMM process: 203 + 204 + 4. Invoke the KVM_SET_CLOCK ioctl, providing the kvmclock nanoseconds 205 + (k_0) and realtime nanoseconds (r_0) in their respective fields. 206 + Ensure that the KVM_CLOCK_REALTIME flag is set in the provided 207 + structure. KVM will advance the VM's kvmclock to account for elapsed 208 + time since recording the clock values. 209 + 210 + 5. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_1) and 211 + kvmclock nanoseconds (k_1). 212 + 213 + 6. Adjust the guest TSC offsets for every vCPU to account for (1) time 214 + elapsed since recording state and (2) difference in TSCs between the 215 + source and destination machine: 216 + 217 + new_off_n = t_0 + off_n + (k_1 - k_0) * freq - t_1 218 + 219 + 7. Write the KVM_VCPU_TSC_OFFSET attribute for every vCPU with the 220 + respective value derived in the previous step.
+1
arch/x86/include/asm/kvm_host.h
··· 1095 1095 u64 last_tsc_nsec; 1096 1096 u64 last_tsc_write; 1097 1097 u32 last_tsc_khz; 1098 + u64 last_tsc_offset; 1098 1099 u64 cur_tsc_nsec; 1099 1100 u64 cur_tsc_write; 1100 1101 u64 cur_tsc_offset;
+4
arch/x86/include/uapi/asm/kvm.h
··· 504 504 #define KVM_PMU_EVENT_ALLOW 0 505 505 #define KVM_PMU_EVENT_DENY 1 506 506 507 + /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ 508 + #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ 509 + #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ 510 + 507 511 #endif /* _ASM_X86_KVM_H */
+116
arch/x86/kvm/x86.c
··· 2454 2454 kvm->arch.last_tsc_nsec = ns; 2455 2455 kvm->arch.last_tsc_write = tsc; 2456 2456 kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; 2457 + kvm->arch.last_tsc_offset = offset; 2457 2458 2458 2459 vcpu->arch.last_guest_tsc = tsc; 2459 2460 ··· 4055 4054 case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: 4056 4055 case KVM_CAP_SREGS2: 4057 4056 case KVM_CAP_EXIT_ON_EMULATION_FAILURE: 4057 + case KVM_CAP_VCPU_ATTRIBUTES: 4058 4058 r = 1; 4059 4059 break; 4060 4060 case KVM_CAP_EXIT_HYPERCALL: ··· 4920 4918 return 0; 4921 4919 } 4922 4920 4921 + static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, 4922 + struct kvm_device_attr *attr) 4923 + { 4924 + int r; 4925 + 4926 + switch (attr->attr) { 4927 + case KVM_VCPU_TSC_OFFSET: 4928 + r = 0; 4929 + break; 4930 + default: 4931 + r = -ENXIO; 4932 + } 4933 + 4934 + return r; 4935 + } 4936 + 4937 + static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, 4938 + struct kvm_device_attr *attr) 4939 + { 4940 + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; 4941 + int r; 4942 + 4943 + if ((u64)(unsigned long)uaddr != attr->addr) 4944 + return -EFAULT; 4945 + 4946 + switch (attr->attr) { 4947 + case KVM_VCPU_TSC_OFFSET: 4948 + r = -EFAULT; 4949 + if (put_user(vcpu->arch.l1_tsc_offset, uaddr)) 4950 + break; 4951 + r = 0; 4952 + break; 4953 + default: 4954 + r = -ENXIO; 4955 + } 4956 + 4957 + return r; 4958 + } 4959 + 4960 + static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, 4961 + struct kvm_device_attr *attr) 4962 + { 4963 + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; 4964 + struct kvm *kvm = vcpu->kvm; 4965 + int r; 4966 + 4967 + if ((u64)(unsigned long)uaddr != attr->addr) 4968 + return -EFAULT; 4969 + 4970 + switch (attr->attr) { 4971 + case KVM_VCPU_TSC_OFFSET: { 4972 + u64 offset, tsc, ns; 4973 + unsigned long flags; 4974 + bool matched; 4975 + 4976 + r = -EFAULT; 4977 + if (get_user(offset, uaddr)) 4978 + break; 4979 + 4980 + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 
4981 + 4982 + matched = (vcpu->arch.virtual_tsc_khz && 4983 + kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && 4984 + kvm->arch.last_tsc_offset == offset); 4985 + 4986 + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; 4987 + ns = get_kvmclock_base_ns(); 4988 + 4989 + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); 4990 + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 4991 + 4992 + r = 0; 4993 + break; 4994 + } 4995 + default: 4996 + r = -ENXIO; 4997 + } 4998 + 4999 + return r; 5000 + } 5001 + 5002 + static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, 5003 + unsigned int ioctl, 5004 + void __user *argp) 5005 + { 5006 + struct kvm_device_attr attr; 5007 + int r; 5008 + 5009 + if (copy_from_user(&attr, argp, sizeof(attr))) 5010 + return -EFAULT; 5011 + 5012 + if (attr.group != KVM_VCPU_TSC_CTRL) 5013 + return -ENXIO; 5014 + 5015 + switch (ioctl) { 5016 + case KVM_HAS_DEVICE_ATTR: 5017 + r = kvm_arch_tsc_has_attr(vcpu, &attr); 5018 + break; 5019 + case KVM_GET_DEVICE_ATTR: 5020 + r = kvm_arch_tsc_get_attr(vcpu, &attr); 5021 + break; 5022 + case KVM_SET_DEVICE_ATTR: 5023 + r = kvm_arch_tsc_set_attr(vcpu, &attr); 5024 + break; 5025 + } 5026 + 5027 + return r; 5028 + } 5029 + 4923 5030 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, 4924 5031 struct kvm_enable_cap *cap) 4925 5032 { ··· 5483 5372 r = __set_sregs2(vcpu, u.sregs2); 5484 5373 break; 5485 5374 } 5375 + case KVM_HAS_DEVICE_ATTR: 5376 + case KVM_GET_DEVICE_ATTR: 5377 + case KVM_SET_DEVICE_ATTR: 5378 + r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); 5379 + break; 5486 5380 default: 5487 5381 r = -EINVAL; 5488 5382 }