Merge tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:
"A bunch of things accumulated for x86 in the last two weeks:

- Fix guest vtime accounting so that ticks happening while the guest
is running can also be accounted to it, along with a consolidation
of the guest-specific context tracking helpers.

- Make the host NMI handler that is invoked right after a VMX VMEXIT
run correctly on the kernel stack.

- Initialize MSR_TSC_AUX also when RDPID is supported but RDTSCP is
not (virt relevant - real hw supports both)

- A code generation improvement to TASK_SIZE_MAX through the use of
alternatives

- The usual misc and related cleanups and improvements"

* tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
KVM: x86: Consolidate guest enter/exit logic to common helpers
context_tracking: KVM: Move guest enter/exit wrappers to KVM's domain
context_tracking: Consolidate guest enter/exit wrappers
sched/vtime: Move guest enter/exit vtime accounting to vtime.h
sched/vtime: Move vtime accounting external declarations above inlines
KVM: x86: Defer vtime accounting 'til after IRQ handling
context_tracking: Move guest exit vtime accounting to separate helpers
context_tracking: Move guest exit context tracking to separate helpers
KVM/VMX: Invoke NMI non-IST entry instead of IST entry
x86/cpu: Remove write_tsc() and write_rdtscp_aux() wrappers
x86/cpu: Initialize MSR_TSC_AUX if RDTSCP *or* RDPID is supported
x86/resctrl: Fix init const confusion
x86: Delete UD0, UD1 traces
x86/smpboot: Remove duplicate includes
x86/cpu: Use alternative to generate the TASK_SIZE_MAX constant

+263 -233
-9
arch/x86/include/asm/bug.h
··· 7 8 /* 9 * Despite that some emulators terminate on UD2, we use it for WARN(). 10 - * 11 - * Since various instruction decoders/specs disagree on the encoding of 12 - * UD0/UD1. 13 */ 14 - 15 - #define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */ 16 - #define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */ 17 #define ASM_UD2 ".byte 0x0f, 0x0b" 18 - 19 - #define INSN_UD0 0xff0f 20 #define INSN_UD2 0x0b0f 21 - 22 #define LEN_UD2 2 23 24 #ifdef CONFIG_GENERIC_BUG
··· 7 8 /* 9 * Despite that some emulators terminate on UD2, we use it for WARN(). 10 */ 11 #define ASM_UD2 ".byte 0x0f, 0x0b" 12 #define INSN_UD2 0x0b0f 13 #define LEN_UD2 2 14 15 #ifdef CONFIG_GENERIC_BUG
+15
arch/x86/include/asm/idtentry.h
··· 588 #endif 589 590 /* NMI */ 591 DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); 592 #ifdef CONFIG_XEN_PV 593 DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
··· 588 #endif 589 590 /* NMI */ 591 + 592 + #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) 593 + /* 594 + * Special NOIST entry point for VMX which invokes this on the kernel 595 + * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI 596 + * 'executing' marker. 597 + * 598 + * On 32bit this just uses the regular NMI entry point because 32-bit does 599 + * not have ISTs. 600 + */ 601 + DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist); 602 + #else 603 + #define asm_exc_nmi_noist asm_exc_nmi 604 + #endif 605 + 606 DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); 607 #ifdef CONFIG_XEN_PV 608 DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
-4
arch/x86/include/asm/msr.h
··· 324 return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); 325 } 326 327 - #define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) 328 - 329 - #define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0) 330 - 331 struct msr *msrs_alloc(void); 332 void msrs_free(struct msr *msrs); 333 int msr_set_bit(u32 msr, u8 bit);
··· 324 return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); 325 } 326 327 struct msr *msrs_alloc(void); 328 void msrs_free(struct msr *msrs); 329 int msr_set_bit(u32 msr, u8 bit);
+33
arch/x86/include/asm/page_64.h
··· 56 57 void copy_page(void *to, void *from); 58 59 #endif /* !__ASSEMBLY__ */ 60 61 #ifdef CONFIG_X86_VSYSCALL_EMULATION
··· 56 57 void copy_page(void *to, void *from); 58 59 + #ifdef CONFIG_X86_5LEVEL 60 + /* 61 + * User space process size. This is the first address outside the user range. 62 + * There are a few constraints that determine this: 63 + * 64 + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical 65 + * address, then that syscall will enter the kernel with a 66 + * non-canonical return address, and SYSRET will explode dangerously. 67 + * We avoid this particular problem by preventing anything 68 + * from being mapped at the maximum canonical address. 69 + * 70 + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the 71 + * CPUs malfunction if they execute code from the highest canonical page. 72 + * They'll speculate right off the end of the canonical space, and 73 + * bad things happen. This is worked around in the same way as the 74 + * Intel problem. 75 + * 76 + * With page table isolation enabled, we map the LDT in ... [stay tuned] 77 + */ 78 + static inline unsigned long task_size_max(void) 79 + { 80 + unsigned long ret; 81 + 82 + alternative_io("movq %[small],%0","movq %[large],%0", 83 + X86_FEATURE_LA57, 84 + "=r" (ret), 85 + [small] "i" ((1ul << 47)-PAGE_SIZE), 86 + [large] "i" ((1ul << 56)-PAGE_SIZE)); 87 + 88 + return ret; 89 + } 90 + #endif /* CONFIG_X86_5LEVEL */ 91 + 92 #endif /* !__ASSEMBLY__ */ 93 94 #ifdef CONFIG_X86_VSYSCALL_EMULATION
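For illustration, a minimal sketch of a hypothetical call site (example_user_range_ok() is not part of this merge): with the alternative-based task_size_max(), the 4-level vs. 5-level limit is patched in as an immediate at boot, so callers no longer load pgtable_l5_enabled() and branch on it.

	/*
	 * Hypothetical caller, for illustration only: TASK_SIZE_MAX expands to
	 * task_size_max(), whose alternative is patched once at boot, so the
	 * range check below compiles to a compare against an immediate rather
	 * than a pgtable_l5_enabled() load plus a conditional.
	 */
	static inline bool example_user_range_ok(unsigned long addr, unsigned long len)
	{
		unsigned long limit = TASK_SIZE_MAX;

		return len <= limit && addr <= limit - len;
	}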
+3 -20
arch/x86/include/asm/page_64_types.h
··· 55 56 #ifdef CONFIG_X86_5LEVEL 57 #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47) 58 #else 59 #define __VIRTUAL_MASK_SHIFT 47 60 #endif 61 62 - /* 63 - * User space process size. This is the first address outside the user range. 64 - * There are a few constraints that determine this: 65 - * 66 - * On Intel CPUs, if a SYSCALL instruction is at the highest canonical 67 - * address, then that syscall will enter the kernel with a 68 - * non-canonical return address, and SYSRET will explode dangerously. 69 - * We avoid this particular problem by preventing anything 70 - * from being mapped at the maximum canonical address. 71 - * 72 - * On AMD CPUs in the Ryzen family, there's a nasty bug in which the 73 - * CPUs malfunction if they execute code from the highest canonical page. 74 - * They'll speculate right off the end of the canonical space, and 75 - * bad things happen. This is worked around in the same way as the 76 - * Intel problem. 77 - * 78 - * With page table isolation enabled, we map the LDT in ... [stay tuned] 79 - */ 80 - #define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 81 - 82 #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) 83 84 /* This decides where the kernel will search for a free chunk of vm
··· 55 56 #ifdef CONFIG_X86_5LEVEL 57 #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47) 58 + /* See task_size_max() in <asm/page_64.h> */ 59 #else 60 #define __VIRTUAL_MASK_SHIFT 47 61 + #define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 62 #endif 63 64 + #define TASK_SIZE_MAX task_size_max() 65 #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) 66 67 /* This decides where the kernel will search for a free chunk of vm
+2 -2
arch/x86/kernel/cpu/common.c
··· 1851 unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); 1852 struct desc_struct d = { }; 1853 1854 - if (boot_cpu_has(X86_FEATURE_RDTSCP)) 1855 - write_rdtscp_aux(cpudata); 1856 1857 /* Store CPU and node number in limit. */ 1858 d.limit0 = cpudata;
··· 1851 unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); 1852 struct desc_struct d = { }; 1853 1854 + if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) 1855 + wrmsr(MSR_TSC_AUX, cpudata, 0); 1856 1857 /* Store CPU and node number in limit. */ 1858 d.limit0 = cpudata;
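The consumer side shows why RDPID alone is enough to require the write above; a sketch modeled on the vDSO's vdso_read_cpunode() (simplified, not the exact kernel code): RDPID returns MSR_TSC_AUX directly, so a CPU that enumerates RDPID but not RDTSCP would otherwise read back whatever stale value the MSR happens to hold.

	/*
	 * Sketch modeled on vdso_read_cpunode() (simplified): the LSL of the
	 * CPUNODE GDT segment is patched to RDPID when available, and RDPID
	 * simply returns MSR_TSC_AUX - hence the MSR must be initialized even
	 * on RDPID-only (typically virtual) CPUs.
	 */
	static inline unsigned int example_read_cpunode(void)
	{
		unsigned int p;

		alternative_io("lsl %[seg], %[p]",
			       ".byte 0xf3,0x0f,0xc7,0xf8",	/* RDPID %eax */
			       X86_FEATURE_RDPID,
			       [p] "=a" (p), [seg] "r" (__CPUNODE_SEG));

		return p;
	}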
+1 -1
arch/x86/kernel/cpu/resctrl/monitor.c
··· 84 static const struct mbm_correction_factor_table { 85 u32 rmidthreshold; 86 u64 cf; 87 - } mbm_cf_table[] __initdata = { 88 {7, CF(1.000000)}, 89 {15, CF(1.000000)}, 90 {15, CF(0.969650)},
··· 84 static const struct mbm_correction_factor_table { 85 u32 rmidthreshold; 86 u64 cf; 87 + } mbm_cf_table[] __initconst = { 88 {7, CF(1.000000)}, 89 {15, CF(1.000000)}, 90 {15, CF(0.969650)},
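As a reminder of the distinction (illustrative snippet, not from this merge): a const object tagged __initdata is placed in the writable .init.data section, which conflicts with its constness; __initconst puts const init-time data where it belongs, in .init.rodata.

	/* Illustrative only: const init-time data belongs in .init.rodata */
	static const int example_lut[] __initconst = { 1, 2, 4, 8 };

	/* whereas mutable init-time data uses __initdata */
	static int example_scratch __initdata;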
+10
arch/x86/kernel/nmi.c
··· 524 mds_user_clear_cpu_buffers(); 525 } 526 527 void stop_nmi(void) 528 { 529 ignore_nmis++;
··· 524 mds_user_clear_cpu_buffers(); 525 } 526 527 + #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) 528 + DEFINE_IDTENTRY_RAW(exc_nmi_noist) 529 + { 530 + exc_nmi(regs); 531 + } 532 + #endif 533 + #if IS_MODULE(CONFIG_KVM_INTEL) 534 + EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); 535 + #endif 536 + 537 void stop_nmi(void) 538 { 539 ignore_nmis++;
-3
arch/x86/kernel/smpboot.c
··· 1865 return true; 1866 } 1867 1868 - #include <asm/cpu_device_id.h> 1869 - #include <asm/intel-family.h> 1870 - 1871 #define X86_MATCH(model) \ 1872 X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ 1873 INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
··· 1865 return true; 1866 } 1867 1868 #define X86_MATCH(model) \ 1869 X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ 1870 INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+2 -37
arch/x86/kvm/svm/svm.c
··· 3710 struct vcpu_svm *svm = to_svm(vcpu); 3711 unsigned long vmcb_pa = svm->current_vmcb->pa; 3712 3713 - /* 3714 - * VMENTER enables interrupts (host state), but the kernel state is 3715 - * interrupts disabled when this is invoked. Also tell RCU about 3716 - * it. This is the same logic as for exit_to_user_mode(). 3717 - * 3718 - * This ensures that e.g. latency analysis on the host observes 3719 - * guest mode as interrupt enabled. 3720 - * 3721 - * guest_enter_irqoff() informs context tracking about the 3722 - * transition to guest mode and if enabled adjusts RCU state 3723 - * accordingly. 3724 - */ 3725 - instrumentation_begin(); 3726 - trace_hardirqs_on_prepare(); 3727 - lockdep_hardirqs_on_prepare(CALLER_ADDR0); 3728 - instrumentation_end(); 3729 - 3730 - guest_enter_irqoff(); 3731 - lockdep_hardirqs_on(CALLER_ADDR0); 3732 3733 if (sev_es_guest(vcpu->kvm)) { 3734 __svm_sev_es_vcpu_run(vmcb_pa); ··· 3730 vmload(__sme_page_pa(sd->save_area)); 3731 } 3732 3733 - /* 3734 - * VMEXIT disables interrupts (host state), but tracing and lockdep 3735 - * have them in state 'on' as recorded before entering guest mode. 3736 - * Same as enter_from_user_mode(). 3737 - * 3738 - * guest_exit_irqoff() restores host context and reinstates RCU if 3739 - * enabled and required. 3740 - * 3741 - * This needs to be done before the below as native_read_msr() 3742 - * contains a tracepoint and x86_spec_ctrl_restore_host() calls 3743 - * into world and some more. 3744 - */ 3745 - lockdep_hardirqs_off(CALLER_ADDR0); 3746 - guest_exit_irqoff(); 3747 - 3748 - instrumentation_begin(); 3749 - trace_hardirqs_off_finish(); 3750 - instrumentation_end(); 3751 } 3752 3753 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
··· 3710 struct vcpu_svm *svm = to_svm(vcpu); 3711 unsigned long vmcb_pa = svm->current_vmcb->pa; 3712 3713 + kvm_guest_enter_irqoff(); 3714 3715 if (sev_es_guest(vcpu->kvm)) { 3716 __svm_sev_es_vcpu_run(vmcb_pa); ··· 3748 vmload(__sme_page_pa(sd->save_area)); 3749 } 3750 3751 + kvm_guest_exit_irqoff(); 3752 } 3753 3754 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+11 -44
arch/x86/kvm/vmx/vmx.c
··· 36 #include <asm/debugreg.h> 37 #include <asm/desc.h> 38 #include <asm/fpu/internal.h> 39 #include <asm/io.h> 40 #include <asm/irq_remapping.h> 41 #include <asm/kexec.h> ··· 6416 6417 void vmx_do_interrupt_nmi_irqoff(unsigned long entry); 6418 6419 - static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 6420 { 6421 - unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6422 - gate_desc *desc = (gate_desc *)host_idt_base + vector; 6423 - 6424 kvm_before_interrupt(vcpu); 6425 - vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); 6426 kvm_after_interrupt(vcpu); 6427 } 6428 6429 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) 6430 { 6431 u32 intr_info = vmx_get_intr_info(&vmx->vcpu); 6432 6433 /* if exit due to PF check for async PF */ ··· 6437 kvm_machine_check(); 6438 /* We need to handle NMIs before interrupts are enabled */ 6439 else if (is_nmi(intr_info)) 6440 - handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info); 6441 } 6442 6443 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) 6444 { 6445 u32 intr_info = vmx_get_intr_info(vcpu); 6446 6447 if (WARN_ONCE(!is_external_intr(intr_info), 6448 "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6449 return; 6450 6451 - handle_interrupt_nmi_irqoff(vcpu, intr_info); 6452 } 6453 6454 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) ··· 6664 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 6665 struct vcpu_vmx *vmx) 6666 { 6667 - /* 6668 - * VMENTER enables interrupts (host state), but the kernel state is 6669 - * interrupts disabled when this is invoked. Also tell RCU about 6670 - * it. This is the same logic as for exit_to_user_mode(). 6671 - * 6672 - * This ensures that e.g. latency analysis on the host observes 6673 - * guest mode as interrupt enabled. 6674 - * 6675 - * guest_enter_irqoff() informs context tracking about the 6676 - * transition to guest mode and if enabled adjusts RCU state 6677 - * accordingly. 6678 - */ 6679 - instrumentation_begin(); 6680 - trace_hardirqs_on_prepare(); 6681 - lockdep_hardirqs_on_prepare(CALLER_ADDR0); 6682 - instrumentation_end(); 6683 - 6684 - guest_enter_irqoff(); 6685 - lockdep_hardirqs_on(CALLER_ADDR0); 6686 6687 /* L1D Flush includes CPU buffer clear to mitigate MDS */ 6688 if (static_branch_unlikely(&vmx_l1d_should_flush)) ··· 6680 6681 vcpu->arch.cr2 = native_read_cr2(); 6682 6683 - /* 6684 - * VMEXIT disables interrupts (host state), but tracing and lockdep 6685 - * have them in state 'on' as recorded before entering guest mode. 6686 - * Same as enter_from_user_mode(). 6687 - * 6688 - * guest_exit_irqoff() restores host context and reinstates RCU if 6689 - * enabled and required. 6690 - * 6691 - * This needs to be done before the below as native_read_msr() 6692 - * contains a tracepoint and x86_spec_ctrl_restore_host() calls 6693 - * into world and some more. 6694 - */ 6695 - lockdep_hardirqs_off(CALLER_ADDR0); 6696 - guest_exit_irqoff(); 6697 - 6698 - instrumentation_begin(); 6699 - trace_hardirqs_off_finish(); 6700 - instrumentation_end(); 6701 } 6702 6703 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
··· 36 #include <asm/debugreg.h> 37 #include <asm/desc.h> 38 #include <asm/fpu/internal.h> 39 + #include <asm/idtentry.h> 40 #include <asm/io.h> 41 #include <asm/irq_remapping.h> 42 #include <asm/kexec.h> ··· 6415 6416 void vmx_do_interrupt_nmi_irqoff(unsigned long entry); 6417 6418 + static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, 6419 + unsigned long entry) 6420 { 6421 kvm_before_interrupt(vcpu); 6422 + vmx_do_interrupt_nmi_irqoff(entry); 6423 kvm_after_interrupt(vcpu); 6424 } 6425 6426 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) 6427 { 6428 + const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; 6429 u32 intr_info = vmx_get_intr_info(&vmx->vcpu); 6430 6431 /* if exit due to PF check for async PF */ ··· 6437 kvm_machine_check(); 6438 /* We need to handle NMIs before interrupts are enabled */ 6439 else if (is_nmi(intr_info)) 6440 + handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); 6441 } 6442 6443 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) 6444 { 6445 u32 intr_info = vmx_get_intr_info(vcpu); 6446 + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6447 + gate_desc *desc = (gate_desc *)host_idt_base + vector; 6448 6449 if (WARN_ONCE(!is_external_intr(intr_info), 6450 "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6451 return; 6452 6453 + handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); 6454 } 6455 6456 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) ··· 6662 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 6663 struct vcpu_vmx *vmx) 6664 { 6665 + kvm_guest_enter_irqoff(); 6666 6667 /* L1D Flush includes CPU buffer clear to mitigate MDS */ 6668 if (static_branch_unlikely(&vmx_l1d_should_flush)) ··· 6696 6697 vcpu->arch.cr2 = native_read_cr2(); 6698 6699 + kvm_guest_exit_irqoff(); 6700 } 6701 6702 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+9
arch/x86/kvm/x86.c
··· 9315 local_irq_disable(); 9316 kvm_after_interrupt(vcpu); 9317 9318 if (lapic_in_kernel(vcpu)) { 9319 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; 9320 if (delta != S64_MIN) {
··· 9315 local_irq_disable(); 9316 kvm_after_interrupt(vcpu); 9317 9318 + /* 9319 + * Wait until after servicing IRQs to account guest time so that any 9320 + * ticks that occurred while running the guest are properly accounted 9321 + * to the guest. Waiting until IRQs are enabled degrades the accuracy 9322 + * of accounting via context tracking, but the loss of accuracy is 9323 + * acceptable for all known use cases. 9324 + */ 9325 + vtime_account_guest_exit(); 9326 + 9327 if (lapic_in_kernel(vcpu)) { 9328 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; 9329 if (delta != S64_MIN) {
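Why the deferral works can be seen on the accounting side; a sketch modeled on account_system_time() (simplified): a tick that fires while IRQs are briefly enabled after VM-Exit still sees PF_VCPU set and is therefore charged as guest time, and only the later vtime_account_guest_exit() clears PF_VCPU.

	/*
	 * Sketch modeled on account_system_time() (simplified): as long as
	 * PF_VCPU is still set, a tick landing on this task is charged to
	 * guest time instead of system time.
	 */
	static void example_account_tick(struct task_struct *p, int hardirq_offset,
					 u64 cputime)
	{
		if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
			account_guest_time(p, cputime);
		else
			account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}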
+45
arch/x86/kvm/x86.h
··· 8 #include "kvm_cache_regs.h" 9 #include "kvm_emulate.h" 10 11 #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ 12 ({ \ 13 bool failed = (consistency_check); \
··· 8 #include "kvm_cache_regs.h" 9 #include "kvm_emulate.h" 10 11 + static __always_inline void kvm_guest_enter_irqoff(void) 12 + { 13 + /* 14 + * VMENTER enables interrupts (host state), but the kernel state is 15 + * interrupts disabled when this is invoked. Also tell RCU about 16 + * it. This is the same logic as for exit_to_user_mode(). 17 + * 18 + * This ensures that e.g. latency analysis on the host observes 19 + * guest mode as interrupt enabled. 20 + * 21 + * guest_enter_irqoff() informs context tracking about the 22 + * transition to guest mode and if enabled adjusts RCU state 23 + * accordingly. 24 + */ 25 + instrumentation_begin(); 26 + trace_hardirqs_on_prepare(); 27 + lockdep_hardirqs_on_prepare(CALLER_ADDR0); 28 + instrumentation_end(); 29 + 30 + guest_enter_irqoff(); 31 + lockdep_hardirqs_on(CALLER_ADDR0); 32 + } 33 + 34 + static __always_inline void kvm_guest_exit_irqoff(void) 35 + { 36 + /* 37 + * VMEXIT disables interrupts (host state), but tracing and lockdep 38 + * have them in state 'on' as recorded before entering guest mode. 39 + * Same as enter_from_user_mode(). 40 + * 41 + * context_tracking_guest_exit() restores host context and reinstates 42 + * RCU if enabled and required. 43 + * 44 + * This needs to be done immediately after VM-Exit, before any code 45 + * that might contain tracepoints or call out to the greater world, 46 + * e.g. before x86_spec_ctrl_restore_host(). 47 + */ 48 + lockdep_hardirqs_off(CALLER_ADDR0); 49 + context_tracking_guest_exit(); 50 + 51 + instrumentation_begin(); 52 + trace_hardirqs_off_finish(); 53 + instrumentation_end(); 54 + } 55 + 56 #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ 57 ({ \ 58 bool failed = (consistency_check); \
+16 -76
include/linux/context_tracking.h
··· 71 } 72 } 73 74 75 /** 76 * ct_state() - return the current context tracking state if known ··· 105 static inline enum ctx_state exception_enter(void) { return 0; } 106 static inline void exception_exit(enum ctx_state prev_ctx) { } 107 static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } 108 #endif /* !CONFIG_CONTEXT_TRACKING */ 109 110 #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) ··· 117 #else 118 static inline void context_tracking_init(void) { } 119 #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ 120 - 121 - 122 - #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 123 - /* must be called with irqs disabled */ 124 - static __always_inline void guest_enter_irqoff(void) 125 - { 126 - instrumentation_begin(); 127 - if (vtime_accounting_enabled_this_cpu()) 128 - vtime_guest_enter(current); 129 - else 130 - current->flags |= PF_VCPU; 131 - instrumentation_end(); 132 - 133 - if (context_tracking_enabled()) 134 - __context_tracking_enter(CONTEXT_GUEST); 135 - 136 - /* KVM does not hold any references to rcu protected data when it 137 - * switches CPU into a guest mode. In fact switching to a guest mode 138 - * is very similar to exiting to userspace from rcu point of view. In 139 - * addition CPU may stay in a guest mode for quite a long time (up to 140 - * one time slice). Lets treat guest mode as quiescent state, just like 141 - * we do with user-mode execution. 142 - */ 143 - if (!context_tracking_enabled_this_cpu()) { 144 - instrumentation_begin(); 145 - rcu_virt_note_context_switch(smp_processor_id()); 146 - instrumentation_end(); 147 - } 148 - } 149 - 150 - static __always_inline void guest_exit_irqoff(void) 151 - { 152 - if (context_tracking_enabled()) 153 - __context_tracking_exit(CONTEXT_GUEST); 154 - 155 - instrumentation_begin(); 156 - if (vtime_accounting_enabled_this_cpu()) 157 - vtime_guest_exit(current); 158 - else 159 - current->flags &= ~PF_VCPU; 160 - instrumentation_end(); 161 - } 162 - 163 - #else 164 - static __always_inline void guest_enter_irqoff(void) 165 - { 166 - /* 167 - * This is running in ioctl context so its safe 168 - * to assume that it's the stime pending cputime 169 - * to flush. 170 - */ 171 - instrumentation_begin(); 172 - vtime_account_kernel(current); 173 - current->flags |= PF_VCPU; 174 - rcu_virt_note_context_switch(smp_processor_id()); 175 - instrumentation_end(); 176 - } 177 - 178 - static __always_inline void guest_exit_irqoff(void) 179 - { 180 - instrumentation_begin(); 181 - /* Flush the guest cputime we spent on the guest */ 182 - vtime_account_kernel(current); 183 - current->flags &= ~PF_VCPU; 184 - instrumentation_end(); 185 - } 186 - #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 187 - 188 - static inline void guest_exit(void) 189 - { 190 - unsigned long flags; 191 - 192 - local_irq_save(flags); 193 - guest_exit_irqoff(); 194 - local_irq_restore(flags); 195 - } 196 197 #endif
··· 71 } 72 } 73 74 + static __always_inline bool context_tracking_guest_enter(void) 75 + { 76 + if (context_tracking_enabled()) 77 + __context_tracking_enter(CONTEXT_GUEST); 78 + 79 + return context_tracking_enabled_this_cpu(); 80 + } 81 + 82 + static __always_inline void context_tracking_guest_exit(void) 83 + { 84 + if (context_tracking_enabled()) 85 + __context_tracking_exit(CONTEXT_GUEST); 86 + } 87 88 /** 89 * ct_state() - return the current context tracking state if known ··· 92 static inline enum ctx_state exception_enter(void) { return 0; } 93 static inline void exception_exit(enum ctx_state prev_ctx) { } 94 static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } 95 + static inline bool context_tracking_guest_enter(void) { return false; } 96 + static inline void context_tracking_guest_exit(void) { } 97 + 98 #endif /* !CONFIG_CONTEXT_TRACKING */ 99 100 #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) ··· 101 #else 102 static inline void context_tracking_init(void) { } 103 #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ 104 105 #endif
+45
include/linux/kvm_host.h
··· 338 struct kvm_dirty_ring dirty_ring; 339 }; 340 341 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) 342 { 343 /*
··· 338 struct kvm_dirty_ring dirty_ring; 339 }; 340 341 + /* must be called with irqs disabled */ 342 + static __always_inline void guest_enter_irqoff(void) 343 + { 344 + /* 345 + * This is running in ioctl context so its safe to assume that it's the 346 + * stime pending cputime to flush. 347 + */ 348 + instrumentation_begin(); 349 + vtime_account_guest_enter(); 350 + instrumentation_end(); 351 + 352 + /* 353 + * KVM does not hold any references to rcu protected data when it 354 + * switches CPU into a guest mode. In fact switching to a guest mode 355 + * is very similar to exiting to userspace from rcu point of view. In 356 + * addition CPU may stay in a guest mode for quite a long time (up to 357 + * one time slice). Lets treat guest mode as quiescent state, just like 358 + * we do with user-mode execution. 359 + */ 360 + if (!context_tracking_guest_enter()) { 361 + instrumentation_begin(); 362 + rcu_virt_note_context_switch(smp_processor_id()); 363 + instrumentation_end(); 364 + } 365 + } 366 + 367 + static __always_inline void guest_exit_irqoff(void) 368 + { 369 + context_tracking_guest_exit(); 370 + 371 + instrumentation_begin(); 372 + /* Flush the guest cputime we spent on the guest */ 373 + vtime_account_guest_exit(); 374 + instrumentation_end(); 375 + } 376 + 377 + static inline void guest_exit(void) 378 + { 379 + unsigned long flags; 380 + 381 + local_irq_save(flags); 382 + guest_exit_irqoff(); 383 + local_irq_restore(flags); 384 + } 385 + 386 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) 387 { 388 /*
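A hedged sketch of the intended calling convention for the generic wrappers above (not taken from any particular architecture in this merge; example_hw_world_switch() is a placeholder): both helpers run with IRQs disabled around the hardware world switch. Note that x86 now calls guest_enter_irqoff() via kvm_guest_enter_irqoff() but skips guest_exit_irqoff() on the way out, deferring vtime_account_guest_exit() as shown earlier.

	/*
	 * Sketch of a generic (non-x86) run loop using the wrappers above;
	 * example_hw_world_switch() is a placeholder, not a real arch hook.
	 */
	static int example_arch_vcpu_run(struct kvm_vcpu *vcpu)
	{
		int exit_reason;

		local_irq_disable();
		guest_enter_irqoff();		/* vtime + context tracking -> guest */

		exit_reason = example_hw_world_switch(vcpu);

		guest_exit_irqoff();		/* restore host context, flush guest cputime */
		local_irq_enable();

		return exit_reason;
	}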
+71 -37
include/linux/vtime.h
··· 3 #define _LINUX_KERNEL_VTIME_H 4 5 #include <linux/context_tracking_state.h> 6 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 7 #include <asm/vtime.h> 8 #endif 9 10 11 - struct task_struct; 12 13 /* 14 * vtime_accounting_enabled_this_cpu() definitions/declarations ··· 51 52 static inline bool vtime_accounting_enabled_this_cpu(void) { return true; } 53 extern void vtime_task_switch(struct task_struct *prev); 54 55 #elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN) 56 ··· 95 vtime_task_switch_generic(prev); 96 } 97 98 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 99 100 - static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; } 101 static inline bool vtime_accounting_enabled_this_cpu(void) { return false; } 102 static inline void vtime_task_switch(struct task_struct *prev) { } 103 104 - #endif 105 106 - /* 107 - * Common vtime APIs 108 - */ 109 - #ifdef CONFIG_VIRT_CPU_ACCOUNTING 110 - extern void vtime_account_kernel(struct task_struct *tsk); 111 - extern void vtime_account_idle(struct task_struct *tsk); 112 - #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 113 - static inline void vtime_account_kernel(struct task_struct *tsk) { } 114 - #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 115 116 - #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 117 - extern void arch_vtime_task_switch(struct task_struct *tsk); 118 - extern void vtime_user_enter(struct task_struct *tsk); 119 - extern void vtime_user_exit(struct task_struct *tsk); 120 - extern void vtime_guest_enter(struct task_struct *tsk); 121 - extern void vtime_guest_exit(struct task_struct *tsk); 122 - extern void vtime_init_idle(struct task_struct *tsk, int cpu); 123 - #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 124 - static inline void vtime_user_enter(struct task_struct *tsk) { } 125 - static inline void vtime_user_exit(struct task_struct *tsk) { } 126 - static inline void vtime_guest_enter(struct task_struct *tsk) { } 127 - static inline void vtime_guest_exit(struct task_struct *tsk) { } 128 - static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } 129 - #endif 130 - 131 - #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 132 - extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); 133 - extern void vtime_account_softirq(struct task_struct *tsk); 134 - extern void vtime_account_hardirq(struct task_struct *tsk); 135 - extern void vtime_flush(struct task_struct *tsk); 136 - #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 137 - static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } 138 - static inline void vtime_account_softirq(struct task_struct *tsk) { } 139 - static inline void vtime_account_hardirq(struct task_struct *tsk) { } 140 - static inline void vtime_flush(struct task_struct *tsk) { } 141 #endif 142 143
··· 3 #define _LINUX_KERNEL_VTIME_H 4 5 #include <linux/context_tracking_state.h> 6 + #include <linux/sched.h> 7 + 8 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 9 #include <asm/vtime.h> 10 #endif 11 12 + /* 13 + * Common vtime APIs 14 + */ 15 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING 16 + extern void vtime_account_kernel(struct task_struct *tsk); 17 + extern void vtime_account_idle(struct task_struct *tsk); 18 + #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 19 20 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 21 + extern void arch_vtime_task_switch(struct task_struct *tsk); 22 + extern void vtime_user_enter(struct task_struct *tsk); 23 + extern void vtime_user_exit(struct task_struct *tsk); 24 + extern void vtime_guest_enter(struct task_struct *tsk); 25 + extern void vtime_guest_exit(struct task_struct *tsk); 26 + extern void vtime_init_idle(struct task_struct *tsk, int cpu); 27 + #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 28 + static inline void vtime_user_enter(struct task_struct *tsk) { } 29 + static inline void vtime_user_exit(struct task_struct *tsk) { } 30 + static inline void vtime_guest_enter(struct task_struct *tsk) { } 31 + static inline void vtime_guest_exit(struct task_struct *tsk) { } 32 + static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } 33 + #endif 34 + 35 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 36 + extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); 37 + extern void vtime_account_softirq(struct task_struct *tsk); 38 + extern void vtime_account_hardirq(struct task_struct *tsk); 39 + extern void vtime_flush(struct task_struct *tsk); 40 + #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 41 + static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } 42 + static inline void vtime_account_softirq(struct task_struct *tsk) { } 43 + static inline void vtime_account_hardirq(struct task_struct *tsk) { } 44 + static inline void vtime_flush(struct task_struct *tsk) { } 45 + #endif 46 47 /* 48 * vtime_accounting_enabled_this_cpu() definitions/declarations ··· 17 18 static inline bool vtime_accounting_enabled_this_cpu(void) { return true; } 19 extern void vtime_task_switch(struct task_struct *prev); 20 + 21 + static __always_inline void vtime_account_guest_enter(void) 22 + { 23 + vtime_account_kernel(current); 24 + current->flags |= PF_VCPU; 25 + } 26 + 27 + static __always_inline void vtime_account_guest_exit(void) 28 + { 29 + vtime_account_kernel(current); 30 + current->flags &= ~PF_VCPU; 31 + } 32 33 #elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN) 34 ··· 49 vtime_task_switch_generic(prev); 50 } 51 52 + static __always_inline void vtime_account_guest_enter(void) 53 + { 54 + if (vtime_accounting_enabled_this_cpu()) 55 + vtime_guest_enter(current); 56 + else 57 + current->flags |= PF_VCPU; 58 + } 59 + 60 + static __always_inline void vtime_account_guest_exit(void) 61 + { 62 + if (vtime_accounting_enabled_this_cpu()) 63 + vtime_guest_exit(current); 64 + else 65 + current->flags &= ~PF_VCPU; 66 + } 67 + 68 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 69 70 static inline bool vtime_accounting_enabled_this_cpu(void) { return false; } 71 static inline void vtime_task_switch(struct task_struct *prev) { } 72 73 + static __always_inline void vtime_account_guest_enter(void) 74 + { 75 + current->flags |= PF_VCPU; 76 + } 77 78 + static __always_inline void vtime_account_guest_exit(void) 79 + { 80 + current->flags &= ~PF_VCPU; 81 + } 82 83 #endif 84 85