Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86,kvm: move qemu/guest FPU switching out to vcpu_run

Currently, every time a VCPU is scheduled out, the host kernel will
first save the guest FPU/xstate context, then load the qemu userspace
FPU context, only to then immediately save the qemu userspace FPU
context back to memory. When scheduling in a VCPU, the same extraneous
FPU loads and saves are done.

This could be avoided by moving from a model where the guest FPU is
loaded and stored with preemption disabled, to a model where the
qemu userspace FPU is swapped out for the guest FPU context for
the duration of the KVM_RUN ioctl.

This is done under the VCPU mutex, which is also taken when other
tasks inspect the VCPU FPU context, so the code should already be
safe for this change. That should come as no surprise, given that
s390 already has this optimization.

This can fix a bug where KVM calls get_user_pages while owning the
FPU, and the file system ends up requesting the FPU again:

[258270.527947] __warn+0xcb/0xf0
[258270.527948] warn_slowpath_null+0x1d/0x20
[258270.527951] kernel_fpu_disable+0x3f/0x50
[258270.527953] __kernel_fpu_begin+0x49/0x100
[258270.527955] kernel_fpu_begin+0xe/0x10
[258270.527958] crc32c_pcl_intel_update+0x84/0xb0
[258270.527961] crypto_shash_update+0x3f/0x110
[258270.527968] crc32c+0x63/0x8a [libcrc32c]
[258270.527975] dm_bm_checksum+0x1b/0x20 [dm_persistent_data]
[258270.527978] node_prepare_for_write+0x44/0x70 [dm_persistent_data]
[258270.527985] dm_block_manager_write_callback+0x41/0x50 [dm_persistent_data]
[258270.527988] submit_io+0x170/0x1b0 [dm_bufio]
[258270.527992] __write_dirty_buffer+0x89/0x90 [dm_bufio]
[258270.527994] __make_buffer_clean+0x4f/0x80 [dm_bufio]
[258270.527996] __try_evict_buffer+0x42/0x60 [dm_bufio]
[258270.527998] dm_bufio_shrink_scan+0xc0/0x130 [dm_bufio]
[258270.528002] shrink_slab.part.40+0x1f5/0x420
[258270.528004] shrink_node+0x22c/0x320
[258270.528006] do_try_to_free_pages+0xf5/0x330
[258270.528008] try_to_free_pages+0xe9/0x190
[258270.528009] __alloc_pages_slowpath+0x40f/0xba0
[258270.528011] __alloc_pages_nodemask+0x209/0x260
[258270.528014] alloc_pages_vma+0x1f1/0x250
[258270.528017] do_huge_pmd_anonymous_page+0x123/0x660
[258270.528021] handle_mm_fault+0xfd3/0x1330
[258270.528025] __get_user_pages+0x113/0x640
[258270.528027] get_user_pages+0x4f/0x60
[258270.528063] __gfn_to_pfn_memslot+0x120/0x3f0 [kvm]
[258270.528108] try_async_pf+0x66/0x230 [kvm]
[258270.528135] tdp_page_fault+0x130/0x280 [kvm]
[258270.528149] kvm_mmu_page_fault+0x60/0x120 [kvm]
[258270.528158] handle_ept_violation+0x91/0x170 [kvm_intel]
[258270.528162] vmx_handle_exit+0x1ca/0x1400 [kvm_intel]

No performance changes were detected in quick ping-pong tests on
my 4 socket system, which is expected since an FPU+xstate load is
on the order of 0.1us, while ping-ponging between CPUs is on the
order of 20us, and somewhat noisy.

Cc: stable@vger.kernel.org
Signed-off-by: Rik van Riel <riel@redhat.com>
Suggested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Fixed a bug where reset_vcpu called put_fpu without preceding load_fpu,
which happened inside the KVM_CREATE_VCPU ioctl. - Radim]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>

authored by

Rik van Riel and committed by
Radim Krčmář
f775b13e 609b7002

+31 -23
+13
arch/x86/include/asm/kvm_host.h
··· 536 536 struct kvm_mmu_memory_cache mmu_page_cache; 537 537 struct kvm_mmu_memory_cache mmu_page_header_cache; 538 538 539 + /* 540 + * QEMU userspace and the guest each have their own FPU state. 541 + * In vcpu_run, we switch between the user and guest FPU contexts. 542 + * While running a VCPU, the VCPU thread will have the guest FPU 543 + * context. 544 + * 545 + * Note that while the PKRU state lives inside the fpu registers, 546 + * it is switched out separately at VMENTER and VMEXIT time. The 547 + * "guest_fpu" state here contains the guest FPU context, with the 548 + * host PKRU bits. 549 + */ 550 + struct fpu user_fpu; 539 551 struct fpu guest_fpu; 552 + 540 553 u64 xcr0; 541 554 u64 guest_supported_xcr0; 542 555 u32 guest_xstate_size;
+17 -22
arch/x86/kvm/x86.c
··· 2937 2937 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2938 2938 pagefault_enable(); 2939 2939 kvm_x86_ops->vcpu_put(vcpu); 2940 - kvm_put_guest_fpu(vcpu); 2941 2940 vcpu->arch.last_host_tsc = rdtsc(); 2942 2941 } 2943 2942 ··· 5253 5254 5254 5255 static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) 5255 5256 { 5256 - preempt_disable(); 5257 - kvm_load_guest_fpu(emul_to_vcpu(ctxt)); 5258 5257 } 5259 5258 5260 5259 static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) 5261 5260 { 5262 - preempt_enable(); 5263 5261 } 5264 5262 5265 5263 static int emulator_intercept(struct x86_emulate_ctxt *ctxt, ··· 6948 6952 preempt_disable(); 6949 6953 6950 6954 kvm_x86_ops->prepare_guest_switch(vcpu); 6951 - kvm_load_guest_fpu(vcpu); 6952 6955 6953 6956 /* 6954 6957 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt ··· 7292 7297 } 7293 7298 } 7294 7299 7300 + kvm_load_guest_fpu(vcpu); 7301 + 7295 7302 if (unlikely(vcpu->arch.complete_userspace_io)) { 7296 7303 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; 7297 7304 vcpu->arch.complete_userspace_io = NULL; 7298 7305 r = cui(vcpu); 7299 7306 if (r <= 0) 7300 - goto out; 7307 + goto out_fpu; 7301 7308 } else 7302 7309 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 7303 7310 ··· 7308 7311 else 7309 7312 r = vcpu_run(vcpu); 7310 7313 7314 + out_fpu: 7315 + kvm_put_guest_fpu(vcpu); 7311 7316 out: 7312 7317 post_kvm_run_save(vcpu); 7313 7318 kvm_sigset_deactivate(vcpu); ··· 7703 7704 vcpu->arch.cr0 |= X86_CR0_ET; 7704 7705 } 7705 7706 7707 + /* Swap (qemu) user FPU context for the guest FPU context. */ 7706 7708 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 7707 7709 { 7708 - if (vcpu->guest_fpu_loaded) 7709 - return; 7710 - 7711 - /* 7712 - * Restore all possible states in the guest, 7713 - * and assume host would use all available bits. 7714 - * Guest xcr0 would be loaded later. 
7715 - */ 7716 - vcpu->guest_fpu_loaded = 1; 7717 - __kernel_fpu_begin(); 7710 + preempt_disable(); 7711 + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); 7718 7712 /* PKRU is separately restored in kvm_x86_ops->run. */ 7719 7713 __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, 7720 7714 ~XFEATURE_MASK_PKRU); 7715 + preempt_enable(); 7721 7716 trace_kvm_fpu(1); 7722 7717 } 7723 7718 7719 + /* When vcpu_run ends, restore user space FPU context. */ 7724 7720 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 7725 7721 { 7726 - if (!vcpu->guest_fpu_loaded) 7727 - return; 7728 - 7729 - vcpu->guest_fpu_loaded = 0; 7722 + preempt_disable(); 7730 7723 copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); 7731 - __kernel_fpu_end(); 7724 + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); 7725 + preempt_enable(); 7732 7726 ++vcpu->stat.fpu_reload; 7733 7727 trace_kvm_fpu(0); 7734 7728 } ··· 7838 7846 * To avoid have the INIT path from kvm_apic_has_events() that be 7839 7847 * called with loaded FPU and does not let userspace fix the state. 7840 7848 */ 7841 - kvm_put_guest_fpu(vcpu); 7849 + if (init_event) 7850 + kvm_put_guest_fpu(vcpu); 7842 7851 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, 7843 7852 XFEATURE_MASK_BNDREGS); 7844 7853 if (mpx_state_buffer) ··· 7848 7855 XFEATURE_MASK_BNDCSR); 7849 7856 if (mpx_state_buffer) 7850 7857 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); 7858 + if (init_event) 7859 + kvm_load_guest_fpu(vcpu); 7851 7860 } 7852 7861 7853 7862 if (!init_event) {
+1 -1
include/linux/kvm_host.h
··· 232 232 struct mutex mutex; 233 233 struct kvm_run *run; 234 234 235 - int guest_fpu_loaded, guest_xcr0_loaded; 235 + int guest_xcr0_loaded; 236 236 struct swait_queue_head wq; 237 237 struct pid __rcu *pid; 238 238 int sigset_active;