Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: x86: Emulate IA32_TSC_ADJUST MSR

CPUID.7.0.EBX[1]=1 indicates IA32_TSC_ADJUST MSR 0x3b is supported

Basic design is to emulate the MSR by allowing reads and writes to a guest
vcpu specific location to store the value of the emulated MSR while adding
the value to the vmcs tsc_offset. In this way the IA32_TSC_ADJUST value will
be included in all reads to the TSC MSR whether through rdmsr or rdtsc. This
is of course as long as the "use TSC counter offsetting" VM-execution control
is enabled as well as the IA32_TSC_ADJUST control.

However, because hardware will only return the TSC + IA32_TSC_ADJUST +
vmcs tsc_offset for a guest process when it does an rdtsc (with the correct
settings) the value of our virtualized IA32_TSC_ADJUST must be stored in one
of these three locations. The argument against storing it in the actual MSR
is performance. This is likely to be seldom used while the save/restore is
required on every transition. IA32_TSC_ADJUST was created as a way to solve
some issues with writing TSC itself so that is not an option either.

The remaining option, defined above as our solution has the problem of
returning incorrect vmcs tsc_offset values (unless we intercept and fix, not
done here) as mentioned above. However, more problematic is that storing the
data in vmcs tsc_offset will have a different semantic effect on the system
than does using the actual MSR. This is illustrated in the following example:

The hypervisor set the IA32_TSC_ADJUST, then the guest sets it and a guest
process performs a rdtsc. In this case the guest process will get
TSC + IA32_TSC_ADJUST_hypervisor + vmcs tsc_offset including
IA32_TSC_ADJUST_guest. While the total system semantics have changed, the
semantics as seen by the guest do not and hence this will not cause a problem.

Signed-off-by: Will Auld <will.auld@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

authored by

Will Auld and committed by
Marcelo Tosatti
ba904635 8fe8ab46

+53
+1
arch/x86/include/asm/cpufeature.h
··· 202 202 203 203 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 204 204 #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 205 + #define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */ 205 206 #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ 206 207 #define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ 207 208 #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
+2
arch/x86/include/asm/kvm_host.h
··· 444 444 s8 virtual_tsc_shift; 445 445 u32 virtual_tsc_mult; 446 446 u32 virtual_tsc_khz; 447 + s64 ia32_tsc_adjust_msr; 447 448 448 449 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ 449 450 unsigned nmi_pending; /* NMI queued after currently running handler */ ··· 712 711 bool (*has_wbinvd_exit)(void); 713 712 714 713 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); 714 + u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); 715 715 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 716 716 717 717 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
+1
arch/x86/include/asm/msr-index.h
··· 236 236 #define MSR_IA32_EBL_CR_POWERON 0x0000002a 237 237 #define MSR_EBC_FREQUENCY_ID 0x0000002c 238 238 #define MSR_IA32_FEATURE_CONTROL 0x0000003a 239 + #define MSR_IA32_TSC_ADJUST 0x0000003b 239 240 240 241 #define FEATURE_CONTROL_LOCKED (1<<0) 241 242 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
+2
arch/x86/kvm/cpuid.c
··· 320 320 if (index == 0) { 321 321 entry->ebx &= kvm_supported_word9_x86_features; 322 322 cpuid_mask(&entry->ebx, 9); 323 + // TSC_ADJUST is emulated 324 + entry->ebx |= F(TSC_ADJUST); 323 325 } else 324 326 entry->ebx = 0; 325 327 entry->eax = 0;
+8
arch/x86/kvm/cpuid.h
··· 28 28 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 29 29 } 30 30 31 + static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) 32 + { 33 + struct kvm_cpuid_entry2 *best; 34 + 35 + best = kvm_find_cpuid_entry(vcpu, 7, 0); 36 + return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); 37 + } 38 + 31 39 static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) 32 40 { 33 41 struct kvm_cpuid_entry2 *best;
+8
arch/x86/kvm/svm.c
··· 1009 1009 svm->tsc_ratio = ratio; 1010 1010 } 1011 1011 1012 + static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) 1013 + { 1014 + struct vcpu_svm *svm = to_svm(vcpu); 1015 + 1016 + return svm->vmcb->control.tsc_offset; 1017 + } 1018 + 1012 1019 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1013 1020 { 1014 1021 struct vcpu_svm *svm = to_svm(vcpu); ··· 4311 4304 .has_wbinvd_exit = svm_has_wbinvd_exit, 4312 4305 4313 4306 .set_tsc_khz = svm_set_tsc_khz, 4307 + .read_tsc_offset = svm_read_tsc_offset, 4314 4308 .write_tsc_offset = svm_write_tsc_offset, 4315 4309 .adjust_tsc_offset = svm_adjust_tsc_offset, 4316 4310 .compute_tsc_offset = svm_compute_tsc_offset,
+9
arch/x86/kvm/vmx.c
··· 1884 1884 WARN(1, "user requested TSC rate below hardware speed\n"); 1885 1885 } 1886 1886 1887 + static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) 1888 + { 1889 + return vmcs_read64(TSC_OFFSET); 1890 + } 1891 + 1887 1892 /* 1888 1893 * writes 'offset' into guest's timestamp counter offset register 1889 1894 */ ··· 2269 2264 vcpu->arch.pat = data; 2270 2265 break; 2271 2266 } 2267 + ret = kvm_set_msr_common(vcpu, msr_info); 2268 + break; 2269 + case MSR_IA32_TSC_ADJUST: 2272 2270 ret = kvm_set_msr_common(vcpu, msr_info); 2273 2271 break; 2274 2272 case MSR_TSC_AUX: ··· 7353 7345 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 7354 7346 7355 7347 .set_tsc_khz = vmx_set_tsc_khz, 7348 + .read_tsc_offset = vmx_read_tsc_offset, 7356 7349 .write_tsc_offset = vmx_write_tsc_offset, 7357 7350 .adjust_tsc_offset = vmx_adjust_tsc_offset, 7358 7351 .compute_tsc_offset = vmx_compute_tsc_offset,
+22
arch/x86/kvm/x86.c
··· 831 831 static unsigned num_msrs_to_save; 832 832 833 833 static const u32 emulated_msrs[] = { 834 + MSR_IA32_TSC_ADJUST, 834 835 MSR_IA32_TSCDEADLINE, 835 836 MSR_IA32_MISC_ENABLE, 836 837 MSR_IA32_MCG_STATUS, ··· 1136 1135 #endif 1137 1136 } 1138 1137 1138 + static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) 1139 + { 1140 + u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); 1141 + vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; 1142 + } 1143 + 1139 1144 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) 1140 1145 { 1141 1146 struct kvm *kvm = vcpu->kvm; ··· 1229 1222 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1230 1223 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1231 1224 1225 + if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) 1226 + update_ia32_tsc_adjust_msr(vcpu, offset); 1232 1227 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1233 1228 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1234 1229 ··· 1927 1918 case MSR_IA32_TSCDEADLINE: 1928 1919 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1929 1920 break; 1921 + case MSR_IA32_TSC_ADJUST: 1922 + if (guest_cpuid_has_tsc_adjust(vcpu)) { 1923 + if (!msr_info->host_initiated) { 1924 + u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; 1925 + kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); 1926 + } 1927 + vcpu->arch.ia32_tsc_adjust_msr = data; 1928 + } 1929 + break; 1930 1930 case MSR_IA32_MISC_ENABLE: 1931 1931 vcpu->arch.ia32_misc_enable_msr = data; 1932 1932 break; ··· 2294 2276 break; 2295 2277 case MSR_IA32_TSCDEADLINE: 2296 2278 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2279 + break; 2280 + case MSR_IA32_TSC_ADJUST: 2281 + data = (u64)vcpu->arch.ia32_tsc_adjust_msr; 2297 2282 break; 2298 2283 case MSR_IA32_MISC_ENABLE: 2299 2284 data = vcpu->arch.ia32_misc_enable_msr; ··· 6628 6607 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6629 6608 goto fail_free_mce_banks; 6630 6609 6610 + 
vcpu->arch.ia32_tsc_adjust_msr = 0x0; 6631 6611 kvm_async_pf_hash_reset(vcpu); 6632 6612 kvm_pmu_init(vcpu); 6633 6613