Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Avi Kivity:
"Highlights include
- full big real mode emulation on pre-Westmere Intel hosts (can be
disabled with emulate_invalid_guest_state=0)
- relatively small ppc and s390 updates
- PCID/INVPCID support in guests
- EOI avoidance; 3.6 guests should perform better on 3.6 hosts on
interrupt intensive workloads
- Lockless write faults during live migration
- EPT accessed/dirty bits support for new Intel processors"

Fix up conflicts in:
- Documentation/virtual/kvm/api.txt:

Stupid subchapter numbering, added next to each other.

- arch/powerpc/kvm/booke_interrupts.S:

PPC asm changes clashing with the KVM fixes

- arch/s390/include/asm/sigp.h, arch/s390/kvm/sigp.c:

Duplicated commits through the kvm tree and the s390 tree, with
subsequent edits in the KVM tree.

* tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
KVM: fix race with level interrupts
x86, hyper: fix build with !CONFIG_KVM_GUEST
Revert "apic: fix kvm build on UP without IOAPIC"
KVM guest: switch to apic_set_eoi_write, apic_write
apic: add apic_set_eoi_write for PV use
KVM: VMX: Implement PCID/INVPCID for guests with EPT
KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check
KVM: PPC: Critical interrupt emulation support
KVM: PPC: e500mc: Fix tlbilx emulation for 64-bit guests
KVM: PPC64: booke: Set interrupt computation mode for 64-bit host
KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt
KVM: PPC: bookehv64: Add support for std/ld emulation.
booke: Added crit/mc exception handler for e500v2
booke/bookehv: Add host crit-watchdog exception support
KVM: MMU: document mmu-lock and fast page fault
KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
KVM: MMU: trace fast page fault
KVM: MMU: fast path of handling guest page fault
KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
KVM: MMU: fold tlb flush judgement into mmu_spte_update
...

+1913 -518
+34
Documentation/virtual/kvm/api.txt
··· 1946 1946 the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd 1947 1947 and kvm_irqfd.gsi. 1948 1948 1949 + 4.76 KVM_PPC_ALLOCATE_HTAB 1950 + 1951 + Capability: KVM_CAP_PPC_ALLOC_HTAB 1952 + Architectures: powerpc 1953 + Type: vm ioctl 1954 + Parameters: Pointer to u32 containing hash table order (in/out) 1955 + Returns: 0 on success, -1 on error 1956 + 1957 + This requests the host kernel to allocate an MMU hash table for a 1958 + guest using the PAPR paravirtualization interface. This only does 1959 + anything if the kernel is configured to use the Book 3S HV style of 1960 + virtualization. Otherwise the capability doesn't exist and the ioctl 1961 + returns an ENOTTY error. The rest of this description assumes Book 3S 1962 + HV. 1963 + 1964 + There must be no vcpus running when this ioctl is called; if there 1965 + are, it will do nothing and return an EBUSY error. 1966 + 1967 + The parameter is a pointer to a 32-bit unsigned integer variable 1968 + containing the order (log base 2) of the desired size of the hash 1969 + table, which must be between 18 and 46. On successful return from the 1970 + ioctl, it will have been updated with the order of the hash table that 1971 + was allocated. 1972 + 1973 + If no hash table has been allocated when any vcpu is asked to run 1974 + (with the KVM_RUN ioctl), the host kernel will allocate a 1975 + default-sized hash table (16 MB). 1976 + 1977 + If this ioctl is called when a hash table has already been allocated, 1978 + the kernel will clear out the existing hash table (zero all HPTEs) and 1979 + return the hash table order in the parameter. (If the guest is using 1980 + the virtualized real-mode area (VRMA) facility, the kernel will 1981 + re-create the VRMA HPTEs on the next KVM_RUN of any vcpu.) 1982 + 1949 1983 1950 1984 5. The kvm_run structure 1951 1985 ------------------------
+129 -1
Documentation/virtual/kvm/locking.txt
··· 6 6 7 7 (to be written) 8 8 9 - 2. Reference 9 + 2: Exception 10 + ------------ 11 + 12 + Fast page fault: 13 + 14 + Fast page fault is the fast path which fixes the guest page fault out of 15 + the mmu-lock on x86. Currently, a page fault can be fast only if the 16 + shadow page table is present and the fault is caused by write-protection; that means 17 + we just need to change the W bit of the spte. 18 + 19 + What we use to avoid all the races is the SPTE_HOST_WRITEABLE bit and the 20 + SPTE_MMU_WRITEABLE bit on the spte: 21 + - SPTE_HOST_WRITEABLE means the gfn is writable on the host. 22 + - SPTE_MMU_WRITEABLE means the gfn is writable on the mmu. The bit is set when 23 + the gfn is writable on the guest mmu and it is not write-protected by shadow 24 + page write-protection. 25 + 26 + On the fast page fault path, we use cmpxchg to atomically set the spte W 27 + bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_MMU_WRITEABLE = 1; this 28 + is safe because any change to these bits is detected by the cmpxchg. 29 + 30 + But we need to carefully check these cases: 31 + 1): The mapping from gfn to pfn 32 + The mapping from gfn to pfn may be changed, since we can only ensure the pfn 33 + is not changed during the cmpxchg. This is an ABA problem; for example, the following 34 + can happen: 35 + 36 + At the beginning: 37 + gpte = gfn1 38 + gfn1 is mapped to pfn1 on the host 39 + spte is the shadow page table entry corresponding to gpte and 40 + spte = pfn1 41 + 42 + VCPU 0 VCPU 1 43 + on fast page fault path: 44 + 45 + old_spte = *spte; 46 + pfn1 is swapped out: 47 + spte = 0; 48 + 49 + pfn1 is re-allocated for gfn2. 50 + 51 + gpte is changed to point to 52 + gfn2 by the guest: 53 + spte = pfn1; 54 + 55 + if (cmpxchg(spte, old_spte, old_spte+W)) 56 + mark_page_dirty(vcpu->kvm, gfn1) 57 + OOPS!!! 58 + 59 + We dirty-log for gfn1; that means gfn2 is lost in the dirty bitmap. 60 + 61 + For a direct sp, we can easily avoid this since the spte of a direct sp is fixed 62 + to the gfn. For an indirect sp, before we do the cmpxchg, we call gfn_to_pfn_atomic() 63 + to pin gfn to pfn, because after gfn_to_pfn_atomic(): 64 + - We hold a refcount on the pfn, which means the pfn can not be freed and 65 + reused for another gfn. 66 + - The pfn is writable, which means it can not be shared between different gfns 67 + by KSM. 68 + 69 + Then, we can ensure the dirty bitmap is correctly set for a gfn. 70 + 71 + Currently, to simplify things, we disable fast page fault for 72 + indirect shadow pages. 73 + 74 + 2): Dirty bit tracking 75 + In the original code, the spte can be fast-updated (non-atomically) if the 76 + spte is read-only and the Accessed bit has already been set, since the 77 + Accessed bit and Dirty bit can not be lost. 78 + 79 + But this is not true after fast page fault, since the spte can be marked 80 + writable between reading and updating the spte, as in the following case: 81 + 82 + At the beginning: 83 + spte.W = 0 84 + spte.Accessed = 1 85 + 86 + VCPU 0 VCPU 1 87 + In mmu_spte_clear_track_bits(): 88 + 89 + old_spte = *spte; 90 + 91 + /* 'if' condition is satisfied. */ 92 + if (old_spte.Accessed == 1 && 93 + old_spte.W == 0) 94 + spte = 0ull; 95 + on fast page fault path: 96 + spte.W = 1 97 + memory write on the spte: 98 + spte.Dirty = 1 99 + 100 + 101 + else 102 + old_spte = xchg(spte, 0ull) 103 + 104 + 105 + if (old_spte.Accessed == 1) 106 + kvm_set_pfn_accessed(spte.pfn); 107 + if (old_spte.Dirty == 1) 108 + kvm_set_pfn_dirty(spte.pfn); 109 + OOPS!!! 110 + 111 + The Dirty bit is lost in this case. 112 + 113 + In order to avoid this kind of issue, we always treat the spte as "volatile" 114 + if it can be updated out of the mmu-lock; see spte_has_volatile_bits(). That means 115 + the spte is always atomically updated in this case. 116 + 117 + 3): Flush TLBs due to spte update 118 + If an spte is updated from writable to read-only, we should flush all TLBs, 119 + otherwise rmap_write_protect will find a read-only spte, even though the 120 + writable spte might be cached in a CPU's TLB. 121 + 122 + As mentioned before, the spte can be updated to writable out of the mmu-lock on 123 + the fast page fault path. In order to easily audit the path, we check whether TLBs need 124 + to be flushed for this reason in mmu_spte_update(), since this is a common 125 + function to update the spte (present -> present). 126 + 127 + Since the spte is "volatile" if it can be updated out of the mmu-lock, we always 128 + atomically update the spte, and the race caused by fast page fault can be avoided; 129 + see the comments in spte_has_volatile_bits() and mmu_spte_update(). 130 + 131 + 3. Reference 10 132 ------------ 11 133 12 134 Name: kvm_lock ··· 145 23 Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} 146 24 - tsc offset in vmcb 147 25 Comment: 'raw' because updating the tsc offsets must not be preempted. 26 + 27 + Name: kvm->mmu_lock 28 + Type: spinlock_t 29 + Arch: any 30 + Protects: -shadow page/shadow tlb entry 31 + Comment: it is a spinlock since it is used in mmu notifier.
+33
Documentation/virtual/kvm/msr.txt
··· 223 223 steal: the amount of time in which this vCPU did not run, in 224 224 nanoseconds. Time during which the vcpu is idle, will not be 225 225 reported as steal time. 226 + 227 + MSR_KVM_EOI_EN: 0x4b564d04 228 + data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 229 + when disabled. Bit 1 is reserved and must be zero. When PV end of 230 + interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned 231 + physical address of a 4 byte memory area which must be in guest RAM and 232 + must be zeroed. 233 + 234 + The least significant bit of the 4 byte memory location will be 235 + written to by the hypervisor, typically at the time of interrupt 236 + injection. A value of 1 means that the guest can skip writing the EOI to the apic 237 + (using an MSR or MMIO write); instead, it is sufficient to signal 238 + EOI by clearing the bit in guest memory - this location will 239 + later be polled by the hypervisor. 240 + A value of 0 means that the EOI write is required. 241 + 242 + It is always safe for the guest to ignore the optimization and perform 243 + the APIC EOI write anyway. 244 + 245 + The hypervisor is guaranteed to only modify this least 246 + significant bit while in the current VCPU context; this means that the 247 + guest does not need to use either a lock prefix or memory ordering 248 + primitives to synchronise with the hypervisor. 249 + 250 + However, the hypervisor can set and clear this memory bit at any time: 251 + therefore, to make sure the hypervisor does not interrupt the 252 + guest and clear the least significant bit in the memory area 253 + in the window between the guest testing it (to detect 254 + whether it can skip the EOI apic write) and the guest 255 + clearing it (to signal EOI to the hypervisor), the 256 + guest must both read the least significant bit in the memory area and 257 + clear it using a single CPU instruction, such as test and clear, or 258 + compare and exchange.
-2
Documentation/virtual/kvm/ppc-pv.txt
··· 109 109 110 110 MSR_EE 111 111 MSR_RI 112 - MSR_CR 113 - MSR_ME 114 112 115 113 If any other bit changes in the MSR, please still use mtmsr(d). 116 114
+1 -1
MAINTAINERS
··· 4002 4002 F: arch/ia64/kvm/ 4003 4003 4004 4004 KERNEL VIRTUAL MACHINE for s390 (KVM/s390) 4005 - M: Carsten Otte <cotte@de.ibm.com> 4006 4005 M: Christian Borntraeger <borntraeger@de.ibm.com> 4006 + M: Cornelia Huck <cornelia.huck@de.ibm.com> 4007 4007 M: linux390@de.ibm.com 4008 4008 L: linux-s390@vger.kernel.org 4009 4009 W: http://www.ibm.com/developerworks/linux/linux390/
+1
arch/ia64/include/asm/kvm.h
··· 26 26 27 27 /* Select x86 specific features in <linux/kvm.h> */ 28 28 #define __KVM_HAVE_IOAPIC 29 + #define __KVM_HAVE_IRQ_LINE 29 30 #define __KVM_HAVE_DEVICE_ASSIGNMENT 30 31 31 32 /* Architectural interrupt line count. */
+1
arch/ia64/kvm/Kconfig
··· 19 19 20 20 config KVM 21 21 tristate "Kernel-based Virtual Machine (KVM) support" 22 + depends on BROKEN 22 23 depends on HAVE_KVM && MODULES && EXPERIMENTAL 23 24 # for device assignment: 24 25 depends on PCI
+2
arch/powerpc/include/asm/epapr_hcalls.h
··· 153 153 #define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" 154 154 #define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" 155 155 156 + extern bool epapr_paravirt_enabled; 157 + extern u32 epapr_hypercall_start[]; 156 158 157 159 /* 158 160 * We use "uintptr_t" to define a register because it's guaranteed to be a
+2
arch/powerpc/include/asm/hw_irq.h
··· 34 34 35 35 extern void timer_interrupt(struct pt_regs *); 36 36 extern void performance_monitor_exception(struct pt_regs *regs); 37 + extern void WatchdogException(struct pt_regs *regs); 38 + extern void unknown_exception(struct pt_regs *regs); 37 39 38 40 #ifdef CONFIG_PPC64 39 41 #include <asm/paca.h>
+2 -5
arch/powerpc/include/asm/kvm_book3s_64.h
··· 36 36 #define SPAPR_TCE_SHIFT 12 37 37 38 38 #ifdef CONFIG_KVM_BOOK3S_64_HV 39 - /* For now use fixed-size 16MB page table */ 40 - #define HPT_ORDER 24 41 - #define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ 42 - #define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */ 43 - #define HPT_HASH_MASK (HPT_NPTEG - 1) 39 + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ 40 + extern int kvm_hpt_order; /* order of preallocated HPTs */ 44 41 #endif 45 42 46 43 #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
+6
arch/powerpc/include/asm/kvm_host.h
··· 237 237 unsigned long vrma_slb_v; 238 238 int rma_setup_done; 239 239 int using_mmu_notifiers; 240 + u32 hpt_order; 241 + atomic_t vcpus_running; 242 + unsigned long hpt_npte; 243 + unsigned long hpt_mask; 240 244 spinlock_t slot_phys_lock; 241 245 unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; 242 246 int slot_npages[KVM_MEM_SLOTS_NUM]; ··· 418 414 ulong mcsrr1; 419 415 ulong mcsr; 420 416 u32 dec; 417 + #ifdef CONFIG_BOOKE 421 418 u32 decar; 419 + #endif 422 420 u32 tbl; 423 421 u32 tbu; 424 422 u32 tcr;
+2 -1
arch/powerpc/include/asm/kvm_ppc.h
··· 119 119 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); 120 120 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); 121 121 122 - extern long kvmppc_alloc_hpt(struct kvm *kvm); 122 + extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); 123 + extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); 123 124 extern void kvmppc_free_hpt(struct kvm *kvm); 124 125 extern long kvmppc_prepare_vrma(struct kvm *kvm, 125 126 struct kvm_userspace_memory_region *mem);
+1
arch/powerpc/kernel/Makefile
··· 128 128 obj-y += ppc_save_regs.o 129 129 endif 130 130 131 + obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o 131 132 obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o 132 133 133 134 # Disable GCOV in odd or sensitive code
+25
arch/powerpc/kernel/epapr_hcalls.S
··· 1 + /* 2 + * Copyright (C) 2012 Freescale Semiconductor, Inc. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + 10 + #include <linux/threads.h> 11 + #include <asm/reg.h> 12 + #include <asm/page.h> 13 + #include <asm/cputable.h> 14 + #include <asm/thread_info.h> 15 + #include <asm/ppc_asm.h> 16 + #include <asm/asm-offsets.h> 17 + 18 + /* Hypercall entry point. Will be patched with device tree instructions. */ 19 + .global epapr_hypercall_start 20 + epapr_hypercall_start: 21 + li r3, -1 22 + nop 23 + nop 24 + nop 25 + blr
+52
arch/powerpc/kernel/epapr_paravirt.c
··· 1 + /* 2 + * ePAPR para-virtualization support. 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of the GNU General Public License, version 2, as 6 + * published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 + * GNU General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public License 14 + * along with this program; if not, write to the Free Software 15 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 16 + * 17 + * Copyright (C) 2012 Freescale Semiconductor, Inc. 18 + */ 19 + 20 + #include <linux/of.h> 21 + #include <asm/epapr_hcalls.h> 22 + #include <asm/cacheflush.h> 23 + #include <asm/code-patching.h> 24 + 25 + bool epapr_paravirt_enabled; 26 + 27 + static int __init epapr_paravirt_init(void) 28 + { 29 + struct device_node *hyper_node; 30 + const u32 *insts; 31 + int len, i; 32 + 33 + hyper_node = of_find_node_by_path("/hypervisor"); 34 + if (!hyper_node) 35 + return -ENODEV; 36 + 37 + insts = of_get_property(hyper_node, "hcall-instructions", &len); 38 + if (!insts) 39 + return -ENODEV; 40 + 41 + if (len % 4 || len > (4 * 4)) 42 + return -ENODEV; 43 + 44 + for (i = 0; i < (len / 4); i++) 45 + patch_instruction(epapr_hypercall_start + i, insts[i]); 46 + 47 + epapr_paravirt_enabled = true; 48 + 49 + return 0; 50 + } 51 + 52 + early_initcall(epapr_paravirt_init);
+3 -25
arch/powerpc/kernel/kvm.c
··· 31 31 #include <asm/cacheflush.h> 32 32 #include <asm/disassemble.h> 33 33 #include <asm/ppc-opcode.h> 34 + #include <asm/epapr_hcalls.h> 34 35 35 36 #define KVM_MAGIC_PAGE (-4096L) 36 37 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) ··· 727 726 unsigned long register r11 asm("r11") = nr; 728 727 unsigned long register r12 asm("r12"); 729 728 730 - asm volatile("bl kvm_hypercall_start" 729 + asm volatile("bl epapr_hypercall_start" 731 730 : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6), 732 731 "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11), 733 732 "=r"(r12) ··· 747 746 return r3; 748 747 } 749 748 EXPORT_SYMBOL_GPL(kvm_hypercall); 750 - 751 - static int kvm_para_setup(void) 752 - { 753 - extern u32 kvm_hypercall_start; 754 - struct device_node *hyper_node; 755 - u32 *insts; 756 - int len, i; 757 - 758 - hyper_node = of_find_node_by_path("/hypervisor"); 759 - if (!hyper_node) 760 - return -1; 761 - 762 - insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len); 763 - if (len % 4) 764 - return -1; 765 - if (len > (4 * 4)) 766 - return -1; 767 - 768 - for (i = 0; i < (len / 4); i++) 769 - kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]); 770 - 771 - return 0; 772 - } 773 749 774 750 static __init void kvm_free_tmp(void) 775 751 { ··· 769 791 if (!kvm_para_available()) 770 792 goto free_tmp; 771 793 772 - if (kvm_para_setup()) 794 + if (!epapr_paravirt_enabled) 773 795 goto free_tmp; 774 796 775 797 if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
+1 -11
arch/powerpc/kernel/kvm_emul.S
··· 24 24 #include <asm/page.h> 25 25 #include <asm/asm-offsets.h> 26 26 27 - /* Hypercall entry point. Will be patched with device tree instructions. */ 28 - 29 - .global kvm_hypercall_start 30 - kvm_hypercall_start: 31 - li r3, -1 32 - nop 33 - nop 34 - nop 35 - blr 36 - 37 27 #define KVM_MAGIC_PAGE (-4096) 38 28 39 29 #ifdef CONFIG_64BIT ··· 122 132 .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4 123 133 124 134 125 - #define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI) 135 + #define MSR_SAFE_BITS (MSR_EE | MSR_RI) 126 136 #define MSR_CRITICAL_BITS ~MSR_SAFE_BITS 127 137 128 138 .global kvm_emulate_mtmsr
+95 -28
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 37 37 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ 38 38 #define MAX_LPID_970 63 39 39 40 - long kvmppc_alloc_hpt(struct kvm *kvm) 40 + /* Power architecture requires HPT is at least 256kB */ 41 + #define PPC_MIN_HPT_ORDER 18 42 + 43 + long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) 41 44 { 42 45 unsigned long hpt; 43 - long lpid; 44 46 struct revmap_entry *rev; 45 47 struct kvmppc_linear_info *li; 48 + long order = kvm_hpt_order; 46 49 47 - /* Allocate guest's hashed page table */ 48 - li = kvm_alloc_hpt(); 49 - if (li) { 50 - /* using preallocated memory */ 51 - hpt = (ulong)li->base_virt; 52 - kvm->arch.hpt_li = li; 53 - } else { 54 - /* using dynamic memory */ 50 + if (htab_orderp) { 51 + order = *htab_orderp; 52 + if (order < PPC_MIN_HPT_ORDER) 53 + order = PPC_MIN_HPT_ORDER; 54 + } 55 + 56 + /* 57 + * If the user wants a different size from default, 58 + * try first to allocate it from the kernel page allocator. 59 + */ 60 + hpt = 0; 61 + if (order != kvm_hpt_order) { 55 62 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| 56 - __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); 63 + __GFP_NOWARN, order - PAGE_SHIFT); 64 + if (!hpt) 65 + --order; 57 66 } 58 67 68 + /* Next try to allocate from the preallocated pool */ 59 69 if (!hpt) { 60 - pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); 61 - return -ENOMEM; 70 + li = kvm_alloc_hpt(); 71 + if (li) { 72 + hpt = (ulong)li->base_virt; 73 + kvm->arch.hpt_li = li; 74 + order = kvm_hpt_order; 75 + } 62 76 } 77 + 78 + /* Lastly try successively smaller sizes from the page allocator */ 79 + while (!hpt && order > PPC_MIN_HPT_ORDER) { 80 + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| 81 + __GFP_NOWARN, order - PAGE_SHIFT); 82 + if (!hpt) 83 + --order; 84 + } 85 + 86 + if (!hpt) 87 + return -ENOMEM; 88 + 63 89 kvm->arch.hpt_virt = hpt; 90 + kvm->arch.hpt_order = order; 91 + /* HPTEs are 2**4 bytes long */ 92 + kvm->arch.hpt_npte = 1ul << (order - 4); 93 + /* 128 (2**7) bytes in each HPTEG 
*/ 94 + kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; 64 95 65 96 /* Allocate reverse map array */ 66 - rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); 97 + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); 67 98 if (!rev) { 68 99 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); 69 100 goto out_freehpt; 70 101 } 71 102 kvm->arch.revmap = rev; 103 + kvm->arch.sdr1 = __pa(hpt) | (order - 18); 72 104 73 - lpid = kvmppc_alloc_lpid(); 74 - if (lpid < 0) 75 - goto out_freeboth; 105 + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", 106 + hpt, order, kvm->arch.lpid); 76 107 77 - kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); 78 - kvm->arch.lpid = lpid; 79 - 80 - pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); 108 + if (htab_orderp) 109 + *htab_orderp = order; 81 110 return 0; 82 111 83 - out_freeboth: 84 - vfree(rev); 85 112 out_freehpt: 86 - free_pages(hpt, HPT_ORDER - PAGE_SHIFT); 113 + if (kvm->arch.hpt_li) 114 + kvm_release_hpt(kvm->arch.hpt_li); 115 + else 116 + free_pages(hpt, order - PAGE_SHIFT); 87 117 return -ENOMEM; 118 + } 119 + 120 + long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) 121 + { 122 + long err = -EBUSY; 123 + long order; 124 + 125 + mutex_lock(&kvm->lock); 126 + if (kvm->arch.rma_setup_done) { 127 + kvm->arch.rma_setup_done = 0; 128 + /* order rma_setup_done vs. vcpus_running */ 129 + smp_mb(); 130 + if (atomic_read(&kvm->arch.vcpus_running)) { 131 + kvm->arch.rma_setup_done = 1; 132 + goto out; 133 + } 134 + } 135 + if (kvm->arch.hpt_virt) { 136 + order = kvm->arch.hpt_order; 137 + /* Set the entire HPT to 0, i.e. invalid HPTEs */ 138 + memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); 139 + /* 140 + * Set the whole last_vcpu array to an invalid vcpu number. 141 + * This ensures that each vcpu will flush its TLB on next entry. 
142 + */ 143 + memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); 144 + *htab_orderp = order; 145 + err = 0; 146 + } else { 147 + err = kvmppc_alloc_hpt(kvm, htab_orderp); 148 + order = *htab_orderp; 149 + } 150 + out: 151 + mutex_unlock(&kvm->lock); 152 + return err; 88 153 } 89 154 90 155 void kvmppc_free_hpt(struct kvm *kvm) ··· 159 94 if (kvm->arch.hpt_li) 160 95 kvm_release_hpt(kvm->arch.hpt_li); 161 96 else 162 - free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); 97 + free_pages(kvm->arch.hpt_virt, 98 + kvm->arch.hpt_order - PAGE_SHIFT); 163 99 } 164 100 165 101 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ ··· 185 119 unsigned long psize; 186 120 unsigned long hp0, hp1; 187 121 long ret; 122 + struct kvm *kvm = vcpu->kvm; 188 123 189 124 psize = 1ul << porder; 190 125 npages = memslot->npages >> (porder - PAGE_SHIFT); ··· 194 127 if (npages > 1ul << (40 - porder)) 195 128 npages = 1ul << (40 - porder); 196 129 /* Can't use more than 1 HPTE per HPTEG */ 197 - if (npages > HPT_NPTEG) 198 - npages = HPT_NPTEG; 130 + if (npages > kvm->arch.hpt_mask + 1) 131 + npages = kvm->arch.hpt_mask + 1; 199 132 200 133 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 201 134 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); ··· 205 138 for (i = 0; i < npages; ++i) { 206 139 addr = i << porder; 207 140 /* can't use hpt_hash since va > 64 bits */ 208 - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; 141 + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; 209 142 /* 210 143 * We assume that the hash table is empty and no 211 144 * vcpus are using it at this stage. Since we create
+28 -12
arch/powerpc/kvm/book3s_hv.c
··· 56 56 /* #define EXIT_DEBUG_INT */ 57 57 58 58 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 59 - static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); 59 + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 60 60 61 61 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 62 62 { ··· 1104 1104 return -EINTR; 1105 1105 } 1106 1106 1107 - /* On the first time here, set up VRMA or RMA */ 1107 + atomic_inc(&vcpu->kvm->arch.vcpus_running); 1108 + /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ 1109 + smp_mb(); 1110 + 1111 + /* On the first time here, set up HTAB and VRMA or RMA */ 1108 1112 if (!vcpu->kvm->arch.rma_setup_done) { 1109 - r = kvmppc_hv_setup_rma(vcpu); 1113 + r = kvmppc_hv_setup_htab_rma(vcpu); 1110 1114 if (r) 1111 - return r; 1115 + goto out; 1112 1116 } 1113 1117 1114 1118 flush_fp_to_thread(current); ··· 1130 1126 kvmppc_core_prepare_to_enter(vcpu); 1131 1127 } 1132 1128 } while (r == RESUME_GUEST); 1129 + 1130 + out: 1131 + atomic_dec(&vcpu->kvm->arch.vcpus_running); 1133 1132 return r; 1134 1133 } 1135 1134 ··· 1348 1341 { 1349 1342 } 1350 1343 1351 - static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) 1344 + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) 1352 1345 { 1353 1346 int err = 0; 1354 1347 struct kvm *kvm = vcpu->kvm; ··· 1366 1359 mutex_lock(&kvm->lock); 1367 1360 if (kvm->arch.rma_setup_done) 1368 1361 goto out; /* another vcpu beat us to it */ 1362 + 1363 + /* Allocate hashed page table (if not done already) and reset it */ 1364 + if (!kvm->arch.hpt_virt) { 1365 + err = kvmppc_alloc_hpt(kvm, NULL); 1366 + if (err) { 1367 + pr_err("KVM: Couldn't alloc HPT\n"); 1368 + goto out; 1369 + } 1370 + } 1369 1371 1370 1372 /* Look up the memslot for guest physical address 0 */ 1371 1373 memslot = gfn_to_memslot(kvm, 0); ··· 1487 1471 1488 1472 int kvmppc_core_init_vm(struct kvm *kvm) 1489 1473 { 1490 - long r; 1491 - unsigned long lpcr; 1474 + unsigned long lpcr, lpid; 1492 1475 1493 
- /* Allocate hashed page table */ 1494 - r = kvmppc_alloc_hpt(kvm); 1495 - if (r) 1496 - return r; 1476 + /* Allocate the guest's logical partition ID */ 1477 + 1478 + lpid = kvmppc_alloc_lpid(); 1479 + if (lpid < 0) 1480 + return -ENOMEM; 1481 + kvm->arch.lpid = lpid; 1497 1482 1498 1483 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1499 1484 ··· 1504 1487 1505 1488 if (cpu_has_feature(CPU_FTR_ARCH_201)) { 1506 1489 /* PPC970; HID4 is effectively the LPCR */ 1507 - unsigned long lpid = kvm->arch.lpid; 1508 1490 kvm->arch.host_lpid = 0; 1509 1491 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); 1510 1492 lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+4 -1
arch/powerpc/kvm/book3s_hv_builtin.c
··· 25 25 static struct kvmppc_linear_info *kvm_alloc_linear(int type); 26 26 static void kvm_release_linear(struct kvmppc_linear_info *ri); 27 27 28 + int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; 29 + EXPORT_SYMBOL_GPL(kvm_hpt_order); 30 + 28 31 /*************** RMA *************/ 29 32 30 33 /* ··· 212 209 void __init kvm_linear_init(void) 213 210 { 214 211 /* HPT */ 215 - kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); 212 + kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); 216 213 217 214 /* RMA */ 218 215 /* Only do this on PPC970 in HV mode */
+8 -7
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 237 237 238 238 /* Find and lock the HPTEG slot to use */ 239 239 do_insert: 240 - if (pte_index >= HPT_NPTE) 240 + if (pte_index >= kvm->arch.hpt_npte) 241 241 return H_PARAMETER; 242 242 if (likely((flags & H_EXACT) == 0)) { 243 243 pte_index &= ~7UL; ··· 352 352 unsigned long v, r, rb; 353 353 struct revmap_entry *rev; 354 354 355 - if (pte_index >= HPT_NPTE) 355 + if (pte_index >= kvm->arch.hpt_npte) 356 356 return H_PARAMETER; 357 357 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 358 358 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) ··· 419 419 i = 4; 420 420 break; 421 421 } 422 - if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { 422 + if (req != 1 || flags == 3 || 423 + pte_index >= kvm->arch.hpt_npte) { 423 424 /* parameter error */ 424 425 args[j] = ((0xa0 | flags) << 56) + pte_index; 425 426 ret = H_PARAMETER; ··· 522 521 struct revmap_entry *rev; 523 522 unsigned long v, r, rb, mask, bits; 524 523 525 - if (pte_index >= HPT_NPTE) 524 + if (pte_index >= kvm->arch.hpt_npte) 526 525 return H_PARAMETER; 527 526 528 527 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); ··· 584 583 int i, n = 1; 585 584 struct revmap_entry *rev = NULL; 586 585 587 - if (pte_index >= HPT_NPTE) 586 + if (pte_index >= kvm->arch.hpt_npte) 588 587 return H_PARAMETER; 589 588 if (flags & H_READ_4) { 590 589 pte_index &= ~3; ··· 679 678 somask = (1UL << 28) - 1; 680 679 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; 681 680 } 682 - hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; 681 + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; 683 682 avpn = slb_v & ~(somask >> 16); /* also includes B */ 684 683 avpn |= (eaddr & somask) >> 16; 685 684 ··· 724 723 if (val & HPTE_V_SECONDARY) 725 724 break; 726 725 val |= HPTE_V_SECONDARY; 727 - hash = hash ^ HPT_HASH_MASK; 726 + hash = hash ^ kvm->arch.hpt_mask; 728 727 } 729 728 return -1; 730 729 }
+26
arch/powerpc/kvm/booke.c
··· 612 612 regs->link = lr; 613 613 } 614 614 615 + /* 616 + * For interrupts needed to be handled by host interrupt handlers, 617 + * corresponding host handler are called from here in similar way 618 + * (but not exact) as they are called from low level handler 619 + * (such as from arch/powerpc/kernel/head_fsl_booke.S). 620 + */ 615 621 static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, 616 622 unsigned int exit_nr) 617 623 { ··· 644 638 case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: 645 639 kvmppc_fill_pt_regs(&regs); 646 640 performance_monitor_exception(&regs); 641 + break; 642 + case BOOKE_INTERRUPT_WATCHDOG: 643 + kvmppc_fill_pt_regs(&regs); 644 + #ifdef CONFIG_BOOKE_WDT 645 + WatchdogException(&regs); 646 + #else 647 + unknown_exception(&regs); 648 + #endif 649 + break; 650 + case BOOKE_INTERRUPT_CRITICAL: 651 + unknown_exception(&regs); 647 652 break; 648 653 } 649 654 } ··· 697 680 698 681 case BOOKE_INTERRUPT_DECREMENTER: 699 682 kvmppc_account_exit(vcpu, DEC_EXITS); 683 + r = RESUME_GUEST; 684 + break; 685 + 686 + case BOOKE_INTERRUPT_WATCHDOG: 700 687 r = RESUME_GUEST; 701 688 break; 702 689 ··· 1287 1266 void kvmppc_decrementer_func(unsigned long data) 1288 1267 { 1289 1268 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; 1269 + 1270 + if (vcpu->arch.tcr & TCR_ARE) { 1271 + vcpu->arch.dec = vcpu->arch.decar; 1272 + kvmppc_emulate_dec(vcpu); 1273 + } 1290 1274 1291 1275 kvmppc_set_tsr_bits(vcpu, TSR_DIS); 1292 1276 }
+28
arch/powerpc/kvm/booke_emulate.c
··· 24 24 #include "booke.h" 25 25 26 26 #define OP_19_XOP_RFI 50 27 + #define OP_19_XOP_RFCI 51 27 28 28 29 #define OP_31_XOP_MFMSR 83 29 30 #define OP_31_XOP_WRTEE 131 ··· 35 34 { 36 35 vcpu->arch.pc = vcpu->arch.shared->srr0; 37 36 kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); 37 + } 38 + 39 + static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu) 40 + { 41 + vcpu->arch.pc = vcpu->arch.csrr0; 42 + kvmppc_set_msr(vcpu, vcpu->arch.csrr1); 38 43 } 39 44 40 45 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, ··· 56 49 case OP_19_XOP_RFI: 57 50 kvmppc_emul_rfi(vcpu); 58 51 kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS); 52 + *advance = 0; 53 + break; 54 + 55 + case OP_19_XOP_RFCI: 56 + kvmppc_emul_rfci(vcpu); 57 + kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS); 59 58 *advance = 0; 60 59 break; 61 60 ··· 126 113 case SPRN_ESR: 127 114 vcpu->arch.shared->esr = spr_val; 128 115 break; 116 + case SPRN_CSRR0: 117 + vcpu->arch.csrr0 = spr_val; 118 + break; 119 + case SPRN_CSRR1: 120 + vcpu->arch.csrr1 = spr_val; 121 + break; 129 122 case SPRN_DBCR0: 130 123 vcpu->arch.dbcr0 = spr_val; 131 124 break; ··· 148 129 kvmppc_set_tcr(vcpu, spr_val); 149 130 break; 150 131 132 + case SPRN_DECAR: 133 + vcpu->arch.decar = spr_val; 134 + break; 151 135 /* 152 136 * Note: SPRG4-7 are user-readable. 153 137 * These values are loaded into the real SPRGs when resuming the ··· 250 228 break; 251 229 case SPRN_ESR: 252 230 *spr_val = vcpu->arch.shared->esr; 231 + break; 232 + case SPRN_CSRR0: 233 + *spr_val = vcpu->arch.csrr0; 234 + break; 235 + case SPRN_CSRR1: 236 + *spr_val = vcpu->arch.csrr1; 253 237 break; 254 238 case SPRN_DBCR0: 255 239 *spr_val = vcpu->arch.dbcr0;
+27 -28
arch/powerpc/kvm/booke_interrupts.S
··· 52 52 (1<<BOOKE_INTERRUPT_PROGRAM) | \ 53 53 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 54 54 55 - .macro KVM_HANDLER ivor_nr 55 + .macro KVM_HANDLER ivor_nr scratch srr0 56 56 _GLOBAL(kvmppc_handler_\ivor_nr) 57 57 /* Get pointer to vcpu and record exit number. */ 58 - mtspr SPRN_SPRG_WSCRATCH0, r4 58 + mtspr \scratch , r4 59 59 mfspr r4, SPRN_SPRG_RVCPU 60 + stw r3, VCPU_GPR(R3)(r4) 60 61 stw r5, VCPU_GPR(R5)(r4) 61 62 stw r6, VCPU_GPR(R6)(r4) 63 + mfspr r3, \scratch 62 64 mfctr r5 63 - lis r6, kvmppc_resume_host@h 65 + stw r3, VCPU_GPR(R4)(r4) 64 66 stw r5, VCPU_CTR(r4) 67 + mfspr r3, \srr0 68 + lis r6, kvmppc_resume_host@h 69 + stw r3, VCPU_PC(r4) 65 70 li r5, \ivor_nr 66 71 ori r6, r6, kvmppc_resume_host@l 67 72 mtctr r6 ··· 74 69 .endm 75 70 76 71 _GLOBAL(kvmppc_handlers_start) 77 - KVM_HANDLER BOOKE_INTERRUPT_CRITICAL 78 - KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK 79 - KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE 80 - KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE 81 - KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL 82 - KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT 83 - KVM_HANDLER BOOKE_INTERRUPT_PROGRAM 84 - KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL 85 - KVM_HANDLER BOOKE_INTERRUPT_SYSCALL 86 - KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL 87 - KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER 88 - KVM_HANDLER BOOKE_INTERRUPT_FIT 89 - KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG 90 - KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS 91 - KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS 92 - KVM_HANDLER BOOKE_INTERRUPT_DEBUG 93 - KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL 94 - KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA 95 - KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND 72 + KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 73 + KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 74 + KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 75 + KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 76 + KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 77 + KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 78 + KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0 79 + KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 80 + KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 81 + KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 82 + KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0 83 + KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 84 + KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 85 + KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 86 + KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 87 + KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 88 + KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 89 + KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 90 + KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 96 91 97 92 _GLOBAL(kvmppc_handler_len) 98 93 .long kvmppc_handler_1 - kvmppc_handler_0 99 - 100 94 101 95 /* Registers: 102 96 * SPRG_SCRATCH0: guest r4 ··· 103 99 * r5: KVM exit number 104 100 */ 105 101 _GLOBAL(kvmppc_resume_host) 106 - stw r3, VCPU_GPR(R3)(r4) 107 102 mfcr r3 108 103 stw r3, VCPU_CR(r4) 109 104 stw r7, VCPU_GPR(R7)(r4) ··· 183 180 stw r3, VCPU_LR(r4) 184 181 mfxer r3 185 182 stw r3, VCPU_XER(r4) 186 - mfspr r3, SPRN_SPRG_RSCRATCH0 187 - stw r3, VCPU_GPR(R4)(r4) 188 - mfspr r3, SPRN_SRR0 189 - stw r3, VCPU_PC(r4) 190 183 191 184 /* Restore host stack pointer and PID before IVPR, since the host 192 185 * exception handlers use them. */
+1 -1
arch/powerpc/kvm/bookehv_interrupts.S
··· 262 262 kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ 263 263 SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 264 264 kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ 265 - SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR) 265 + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) 266 266 kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR 267 267 kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 268 268 kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
+3
arch/powerpc/kvm/e500_emulate.c
··· 269 269 *spr_val = vcpu->arch.shared->mas7_3 >> 32; 270 270 break; 271 271 #endif 272 + case SPRN_DECAR: 273 + *spr_val = vcpu->arch.decar; 274 + break; 272 275 case SPRN_TLB0CFG: 273 276 *spr_val = vcpu->arch.tlbcfg[0]; 274 277 break;
+6 -2
arch/powerpc/kvm/e500mc.c
··· 1 1 /* 2 - * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved. 2 + * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved. 3 3 * 4 4 * Author: Varun Sethi, <varun.sethi@freescale.com> 5 5 * ··· 57 57 struct kvm_book3e_206_tlb_entry *gtlbe) 58 58 { 59 59 unsigned int tid, ts; 60 - u32 val, eaddr, lpid; 60 + gva_t eaddr; 61 + u32 val, lpid; 61 62 unsigned long flags; 62 63 63 64 ts = get_tlb_ts(gtlbe); ··· 184 183 185 184 vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ 186 185 SPRN_EPCR_DUVD; 186 + #ifdef CONFIG_64BIT 187 + vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM; 188 + #endif 187 189 vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; 188 190 vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); 189 191 vcpu->arch.epsc = vcpu->arch.eplc;
+16
arch/powerpc/kvm/emulate.c
··· 59 59 #define OP_31_XOP_STHBRX 918 60 60 61 61 #define OP_LWZ 32 62 + #define OP_LD 58 62 63 #define OP_LWZU 33 63 64 #define OP_LBZ 34 64 65 #define OP_LBZU 35 65 66 #define OP_STW 36 66 67 #define OP_STWU 37 68 + #define OP_STD 62 67 69 #define OP_STB 38 68 70 #define OP_STBU 39 69 71 #define OP_LHZ 40 ··· 394 392 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); 395 393 break; 396 394 395 + /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */ 396 + case OP_LD: 397 + rt = get_rt(inst); 398 + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); 399 + break; 400 + 397 401 case OP_LWZU: 398 402 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); 399 403 kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); ··· 418 410 emulated = kvmppc_handle_store(run, vcpu, 419 411 kvmppc_get_gpr(vcpu, rs), 420 412 4, 1); 413 + break; 414 + 415 + /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */ 416 + case OP_STD: 417 + rs = get_rs(inst); 418 + emulated = kvmppc_handle_store(run, vcpu, 419 + kvmppc_get_gpr(vcpu, rs), 420 + 8, 1); 421 421 break; 422 422 423 423 case OP_STWU:
+18
arch/powerpc/kvm/powerpc.c
··· 246 246 #endif 247 247 #ifdef CONFIG_PPC_BOOK3S_64 248 248 case KVM_CAP_SPAPR_TCE: 249 + case KVM_CAP_PPC_ALLOC_HTAB: 249 250 r = 1; 250 251 break; 251 252 #endif /* CONFIG_PPC_BOOK3S_64 */ ··· 801 800 r = kvm_vm_ioctl_allocate_rma(kvm, &rma); 802 801 if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) 803 802 r = -EFAULT; 803 + break; 804 + } 805 + 806 + case KVM_PPC_ALLOCATE_HTAB: { 807 + struct kvm *kvm = filp->private_data; 808 + u32 htab_order; 809 + 810 + r = -EFAULT; 811 + if (get_user(htab_order, (u32 __user *)argp)) 812 + break; 813 + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); 814 + if (r) 815 + break; 816 + r = -EFAULT; 817 + if (put_user(htab_order, (u32 __user *)argp)) 818 + break; 819 + r = 0; 804 820 break; 805 821 } 806 822 #endif /* CONFIG_KVM_BOOK3S_64_HV */
+9
arch/powerpc/platforms/Kconfig
··· 25 25 config KVM_GUEST 26 26 bool "KVM Guest support" 27 27 default n 28 + select EPAPR_PARAVIRT 28 29 ---help--- 29 30 This option enables various optimizations for running under the KVM 30 31 hypervisor. Overhead for the kernel when not running inside KVM should 31 32 be minimal. 33 + 34 + In case of doubt, say Y 35 + 36 + config EPAPR_PARAVIRT 37 + bool "ePAPR para-virtualization support" 38 + default n 39 + help 40 + Enables ePAPR para-virtualization support for guests. 32 41 33 42 In case of doubt, say Y 34 43
+2
arch/s390/include/asm/sclp.h
··· 53 53 int sclp_chp_deconfigure(struct chp_id chpid); 54 54 int sclp_chp_read_info(struct sclp_chp_info *info); 55 55 void sclp_get_ipl_info(struct sclp_ipl_info *info); 56 + bool sclp_has_linemode(void); 57 + bool sclp_has_vt220(void); 56 58 57 59 #endif /* _ASM_S390_SCLP_H */
+1
arch/s390/include/asm/sigp.h
··· 24 24 25 25 #define SIGP_STATUS_CHECK_STOP 0x00000010UL 26 26 #define SIGP_STATUS_STOPPED 0x00000040UL 27 + #define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL 27 28 #define SIGP_STATUS_INVALID_PARAMETER 0x00000100UL 28 29 #define SIGP_STATUS_INCORRECT_STATE 0x00000200UL 29 30 #define SIGP_STATUS_NOT_RUNNING 0x00000400UL
+9 -3
arch/s390/kernel/setup.c
··· 61 61 #include <asm/kvm_virtio.h> 62 62 #include <asm/diag.h> 63 63 #include <asm/os_info.h> 64 + #include <asm/sclp.h> 64 65 #include "entry.h" 65 66 66 67 long psw_kernel_bits = PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_ASC_PRIMARY | ··· 137 136 138 137 static void __init set_preferred_console(void) 139 138 { 140 - if (MACHINE_IS_KVM) 141 - add_preferred_console("hvc", 0, NULL); 142 - else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) 139 + if (MACHINE_IS_KVM) { 140 + if (sclp_has_vt220()) 141 + add_preferred_console("ttyS", 1, NULL); 142 + else if (sclp_has_linemode()) 143 + add_preferred_console("ttyS", 0, NULL); 144 + else 145 + add_preferred_console("hvc", 0, NULL); 146 + } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) 143 147 add_preferred_console("ttyS", 0, NULL); 144 148 else if (CONSOLE_IS_3270) 145 149 add_preferred_console("tty3270", 0, NULL);
+1
arch/s390/kvm/kvm-s390.c
··· 347 347 vcpu->arch.guest_fpregs.fpc = 0; 348 348 asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); 349 349 vcpu->arch.sie_block->gbea = 1; 350 + atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 350 351 } 351 352 352 353 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+42 -35
arch/s390/kvm/sigp.c
··· 26 26 int rc; 27 27 28 28 if (cpu_addr >= KVM_MAX_VCPUS) 29 - return 3; /* not operational */ 29 + return SIGP_CC_NOT_OPERATIONAL; 30 30 31 31 spin_lock(&fi->lock); 32 32 if (fi->local_int[cpu_addr] == NULL) 33 - rc = 3; /* not operational */ 33 + rc = SIGP_CC_NOT_OPERATIONAL; 34 34 else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) 35 - & CPUSTAT_STOPPED)) { 35 + & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) 36 + rc = SIGP_CC_ORDER_CODE_ACCEPTED; 37 + else { 36 38 *reg &= 0xffffffff00000000UL; 37 - rc = 1; /* status stored */ 38 - } else { 39 - *reg &= 0xffffffff00000000UL; 40 - *reg |= SIGP_STATUS_STOPPED; 41 - rc = 1; /* status stored */ 39 + if (atomic_read(fi->local_int[cpu_addr]->cpuflags) 40 + & CPUSTAT_ECALL_PEND) 41 + *reg |= SIGP_STATUS_EXT_CALL_PENDING; 42 + if (atomic_read(fi->local_int[cpu_addr]->cpuflags) 43 + & CPUSTAT_STOPPED) 44 + *reg |= SIGP_STATUS_STOPPED; 45 + rc = SIGP_CC_STATUS_STORED; 42 46 } 43 47 spin_unlock(&fi->lock); 44 48 ··· 58 54 int rc; 59 55 60 56 if (cpu_addr >= KVM_MAX_VCPUS) 61 - return 3; /* not operational */ 57 + return SIGP_CC_NOT_OPERATIONAL; 62 58 63 59 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 64 60 if (!inti) ··· 70 66 spin_lock(&fi->lock); 71 67 li = fi->local_int[cpu_addr]; 72 68 if (li == NULL) { 73 - rc = 3; /* not operational */ 69 + rc = SIGP_CC_NOT_OPERATIONAL; 74 70 kfree(inti); 75 71 goto unlock; 76 72 } ··· 81 77 if (waitqueue_active(&li->wq)) 82 78 wake_up_interruptible(&li->wq); 83 79 spin_unlock_bh(&li->lock); 84 - rc = 0; /* order accepted */ 80 + rc = SIGP_CC_ORDER_CODE_ACCEPTED; 85 81 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); 86 82 unlock: 87 83 spin_unlock(&fi->lock); ··· 96 92 int rc; 97 93 98 94 if (cpu_addr >= KVM_MAX_VCPUS) 99 - return 3; /* not operational */ 95 + return SIGP_CC_NOT_OPERATIONAL; 100 96 101 97 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 102 98 if (!inti) ··· 108 104 spin_lock(&fi->lock); 109 105 li = fi->local_int[cpu_addr]; 110 106 if (li == NULL) { 111 - rc = 3; /* not operational */ 107 + rc = SIGP_CC_NOT_OPERATIONAL; 112 108 kfree(inti); 113 109 goto unlock; 114 110 } ··· 119 115 if (waitqueue_active(&li->wq)) 120 116 wake_up_interruptible(&li->wq); 121 117 spin_unlock_bh(&li->lock); 122 - rc = 0; /* order accepted */ 118 + rc = SIGP_CC_ORDER_CODE_ACCEPTED; 123 119 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); 124 120 unlock: 125 121 spin_unlock(&fi->lock); ··· 147 143 out: 148 144 spin_unlock_bh(&li->lock); 149 145 150 - return 0; /* order accepted */ 146 + return SIGP_CC_ORDER_CODE_ACCEPTED; 151 147 } 152 148 153 149 static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) ··· 157 153 int rc; 158 154 159 155 if (cpu_addr >= KVM_MAX_VCPUS) 160 - return 3; /* not operational */ 156 + return SIGP_CC_NOT_OPERATIONAL; 161 157 162 158 spin_lock(&fi->lock); 163 159 li = fi->local_int[cpu_addr]; 164 160 if (li == NULL) { 165 - rc = 3; /* not operational */ 161 + rc = SIGP_CC_NOT_OPERATIONAL; 166 162 goto unlock; 167 163 } ··· 186 182 187 183 switch (parameter & 0xff) { 188 184 case 0: 189 - rc = 3; /* not operational */ 185 + rc = SIGP_CC_NOT_OPERATIONAL; 190 186 break; 191 187 case 1: 192 188 case 2: 193 - rc = 0; /* order accepted */ 189 + rc = SIGP_CC_ORDER_CODE_ACCEPTED; 194 190 break; 195 191 default: 196 192 rc = -EOPNOTSUPP; ··· 211 207 address = address & 0x7fffe000u; 212 208 if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || 213 209 copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) { 210 + *reg &= 0xffffffff00000000UL; 214 211 *reg |= SIGP_STATUS_INVALID_PARAMETER; 215 - return 1; /* invalid parameter */ 212 + return SIGP_CC_STATUS_STORED; 216 213 } 217 214 218 215 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 219 216 if (!inti) 220 - return 2; /* busy */ 217 + return SIGP_CC_BUSY; 221 218 222 219 spin_lock(&fi->lock); 223 220 if (cpu_addr < KVM_MAX_VCPUS) 224 221 li = fi->local_int[cpu_addr]; 225 222 226 223 if (li == NULL) { 227 - rc = 1; /* incorrect state */ 228 - *reg &= SIGP_STATUS_INCORRECT_STATE; 224 + *reg &= 0xffffffff00000000UL; 225 + *reg |= SIGP_STATUS_INCORRECT_STATE; 226 + rc = SIGP_CC_STATUS_STORED; 229 227 kfree(inti); 230 228 goto out_fi; 231 229 } ··· 235 229 spin_lock_bh(&li->lock); 236 230 /* cpu must be in stopped state */ 237 231 if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { 238 - rc = 1; /* incorrect state */ 239 - *reg &= SIGP_STATUS_INCORRECT_STATE; 232 + *reg &= 0xffffffff00000000UL; 233 + *reg |= SIGP_STATUS_INCORRECT_STATE; 234 + rc = SIGP_CC_STATUS_STORED; 240 235 kfree(inti); 241 236 goto out_li; 242 237 } ··· 249 242 atomic_set(&li->active, 1); 250 243 if (waitqueue_active(&li->wq)) 251 244 wake_up_interruptible(&li->wq); 252 - rc = 0; /* order accepted */ 245 + rc = SIGP_CC_ORDER_CODE_ACCEPTED; 253 246 254 247 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); 255 248 out_li: ··· 266 259 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; 267 260 268 261 if (cpu_addr >= KVM_MAX_VCPUS) 269 - return 3; /* not operational */ 262 + return SIGP_CC_NOT_OPERATIONAL; 270 263 271 264 spin_lock(&fi->lock); 272 265 if (fi->local_int[cpu_addr] == NULL) 273 - rc = 3; /* not operational */ 266 + rc = SIGP_CC_NOT_OPERATIONAL; 274 267 else { 275 268 if (atomic_read(fi->local_int[cpu_addr]->cpuflags) 276 269 & CPUSTAT_RUNNING) { 277 270 /* running */ 278 - rc = 1; 271 + rc = SIGP_CC_ORDER_CODE_ACCEPTED; 279 272 } else { 280 273 /* not running */ 281 274 *reg &= 0xffffffff00000000UL; 282 275 *reg |= SIGP_STATUS_NOT_RUNNING; 283 - rc = 0; 276 + rc = SIGP_CC_STATUS_STORED; 284 277 } 285 278 } 286 279 spin_unlock(&fi->lock); ··· 293 286 294 287 static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) 295 288 { 296 - int rc = 0; 297 289 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; 298 290 struct kvm_s390_local_interrupt *li; 291 + int rc = SIGP_CC_ORDER_CODE_ACCEPTED; 299 292 300 293 if (cpu_addr >= KVM_MAX_VCPUS) 301 - return 3; /* not operational */ 294 + return SIGP_CC_NOT_OPERATIONAL; 302 295 303 296 spin_lock(&fi->lock); 304 297 li = fi->local_int[cpu_addr]; 305 298 if (li == NULL) { 306 - rc = 3; /* not operational */ 299 + rc = SIGP_CC_NOT_OPERATIONAL; 307 300 goto out; 308 301 } 309 302 310 303 spin_lock_bh(&li->lock); 311 304 if (li->action_bits & ACTION_STOP_ON_STOP) 312 - rc = 2; /* busy */ 305 + rc = SIGP_CC_BUSY; 313 306 else 314 307 VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", 315 308 cpu_addr); ··· 384 377 case SIGP_RESTART: 385 378 vcpu->stat.instruction_sigp_restart++; 386 379 rc = __sigp_restart(vcpu, cpu_addr); 387 - if (rc == 2) /* busy */ 380 + if (rc == SIGP_CC_BUSY) 388 381 break; 389 382 /* user space must know about restart */ 390 383 default:
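The sigp.c hunk swaps the bare 0..3 condition-code literals for named SIGP_CC_* constants and teaches SENSE to report a pending external call. A standalone C sketch of the reworked sense decision; the SIGP_STATUS_* values come from the sigp.h hunk earlier in this merge, but the CPUSTAT_* bit positions here are placeholders, not the kernel's real cpuflags layout:

```c
/* The condition codes that replace the bare 0..3 literals (the old
 * inline comments give their meanings directly). */
enum sigp_cc {
	SIGP_CC_ORDER_CODE_ACCEPTED = 0,
	SIGP_CC_STATUS_STORED       = 1,
	SIGP_CC_BUSY                = 2,
	SIGP_CC_NOT_OPERATIONAL     = 3,
};

#define SIGP_STATUS_STOPPED          0x00000040UL /* from sigp.h above */
#define SIGP_STATUS_EXT_CALL_PENDING 0x00000080UL

#define CPUSTAT_STOPPED    (1u << 0) /* placeholder bit positions, not */
#define CPUSTAT_ECALL_PEND (1u << 1) /* the kernel's real cpuflags     */

/* Sketch of the reworked __sigp_sense() decision: a CPU with nothing
 * pending accepts the order; otherwise the caller's register receives
 * the status bits and condition code 1 (status stored). */
static enum sigp_cc sigp_sense(unsigned int cpuflags, unsigned long *reg)
{
	if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
		return SIGP_CC_ORDER_CODE_ACCEPTED;

	*reg &= 0xffffffff00000000UL; /* clear low word before storing status */
	if (cpuflags & CPUSTAT_ECALL_PEND)
		*reg |= SIGP_STATUS_EXT_CALL_PENDING;
	if (cpuflags & CPUSTAT_STOPPED)
		*reg |= SIGP_STATUS_STOPPED;
	return SIGP_CC_STATUS_STORED;
}
```

Note how the rework also fixes the old bug visible in the SET_PREFIX hunk, where the status word was ANDed with the status bit instead of ORed into a cleared low word.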
+3
arch/x86/include/asm/apic.h
··· 465 465 return apic->safe_wait_icr_idle(); 466 466 } 467 467 468 + extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); 469 + 468 470 #else /* CONFIG_X86_LOCAL_APIC */ 469 471 470 472 static inline u32 apic_read(u32 reg) { return 0; } ··· 476 474 static inline void apic_icr_write(u32 low, u32 high) { } 477 475 static inline void apic_wait_icr_idle(void) { } 478 476 static inline u32 safe_apic_wait_icr_idle(void) { return 0; } 477 + static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} 479 478 480 479 #endif /* CONFIG_X86_LOCAL_APIC */ 481 480
+7
arch/x86/include/asm/bitops.h
··· 264 264 * This operation is non-atomic and can be reordered. 265 265 * If two examples of this operation race, one can appear to succeed 266 266 * but actually fail. You must protect multiple accesses with a lock. 267 + * 268 + * Note: the operation is performed atomically with respect to 269 + * the local CPU, but not other CPUs. Portable code should not 270 + * rely on this behaviour. 271 + * KVM relies on this behaviour on x86 for modifying memory that is also 272 + * accessed from a hypervisor on the same CPU if running in a VM: don't change 273 + * this without also updating arch/x86/kernel/kvm.c 267 274 */ 268 275 static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 269 276 {
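The new bitops.h comment is load-bearing: PV-EOI (in the kvm.c hunk later in this merge) depends on __test_and_clear_bit being a plain read-modify-write that no other CPU touches concurrently. A userspace model of those semantics, safe only when a single thread of execution owns the word:

```c
#include <stdbool.h>

/* Plain (non-atomic) test-and-clear, mirroring the semantics the new
 * bitops.h comment describes: correct with respect to the local CPU
 * only, since there is no lock prefix and no memory barrier. */
static bool test_and_clear_bit_local(int nr, unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	bool old = (*addr & mask) != 0;

	*addr &= ~mask; /* clear unconditionally; old value already sampled */
	return old;
}
```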
+1
arch/x86/include/asm/hypervisor.h
··· 49 49 extern const struct hypervisor_x86 x86_hyper_vmware; 50 50 extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 51 51 extern const struct hypervisor_x86 x86_hyper_xen_hvm; 52 + extern const struct hypervisor_x86 x86_hyper_kvm; 52 53 53 54 static inline bool hypervisor_x2apic_available(void) 54 55 {
+1
arch/x86/include/asm/kvm.h
··· 12 12 /* Select x86 specific features in <linux/kvm.h> */ 13 13 #define __KVM_HAVE_PIT 14 14 #define __KVM_HAVE_IOAPIC 15 + #define __KVM_HAVE_IRQ_LINE 15 16 #define __KVM_HAVE_DEVICE_ASSIGNMENT 16 17 #define __KVM_HAVE_MSI 17 18 #define __KVM_HAVE_USER_NMI
+3 -3
arch/x86/include/asm/kvm_emulate.h
··· 192 192 struct x86_instruction_info *info, 193 193 enum x86_intercept_stage stage); 194 194 195 - bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, 196 - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 195 + void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, 196 + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 197 197 }; 198 198 199 199 typedef u32 __attribute__((vector_size(16))) sse128_t; ··· 280 280 u8 modrm_seg; 281 281 bool rip_relative; 282 282 unsigned long _eip; 283 + struct operand memop; 283 284 /* Fields above regs are cleared together. */ 284 285 unsigned long regs[NR_VCPU_REGS]; 285 - struct operand memop; 286 286 struct operand *memopp; 287 287 struct fetch_cache fetch; 288 288 struct read_cache io_read;
+29 -2
arch/x86/include/asm/kvm_host.h
··· 48 48 49 49 #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 50 50 #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 51 + #define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL 51 52 #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 52 53 0xFFFFFF0000000000ULL) 53 54 #define CR4_RESERVED_BITS \ 54 55 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 55 56 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 56 - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 57 + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ 57 58 | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ 58 59 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 59 60 ··· 176 175 177 176 /* apic attention bits */ 178 177 #define KVM_APIC_CHECK_VAPIC 0 178 + /* 179 + * The following bit is set with PV-EOI, unset on EOI. 180 + * We detect PV-EOI changes by guest by comparing 181 + * this bit with PV-EOI in guest memory. 182 + * See the implementation in apic_update_pv_eoi. 183 + */ 184 + #define KVM_APIC_PV_EOI_PENDING 1 179 185 180 186 /* 181 187 * We don't want allocation failures within the mmu code, so we preallocate ··· 492 484 u64 length; 493 485 u64 status; 494 486 } osvw; 487 + 488 + struct { 489 + u64 msr_val; 490 + struct gfn_to_hva_cache data; 491 + } pv_eoi; 495 492 }; 496 493 497 494 struct kvm_lpage_info { ··· 674 661 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 675 662 int (*get_lpage_level)(void); 676 663 bool (*rdtscp_supported)(void); 664 + bool (*invpcid_supported)(void); 677 665 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); 678 666 679 667 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); ··· 816 802 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); 817 803 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 818 804 819 - int kvm_pic_set_irq(void *opaque, int irq, int level); 805 + static inline int __kvm_irq_line_state(unsigned long *irq_state, 806 + int irq_source_id, int level) 807 + { 808 + /* Logical OR for level trig interrupt */ 809 + if (level) 810 + __set_bit(irq_source_id, irq_state); 811 + else 812 + __clear_bit(irq_source_id, irq_state); 813 + 814 + return !!(*irq_state); 815 + } 816 + 817 + int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); 818 + void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); 820 819 821 820 void kvm_inject_nmi(struct kvm_vcpu *vcpu); 822 821
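The __kvm_irq_line_state() helper added to kvm_host.h is the fix behind the "KVM: fix race with level interrupts" commit in the shortlog: each interrupt source owns one bit, and a level-triggered line is asserted while any source still drives it. A userspace C sketch of that logic:

```c
/* Sketch of __kvm_irq_line_state(): record each source's level in its
 * own bit, then report the logical OR across all sources. */
static int irq_line_state(unsigned long *irq_state, int irq_source_id, int level)
{
	if (level)
		*irq_state |= 1UL << irq_source_id;
	else
		*irq_state &= ~(1UL << irq_source_id);

	return *irq_state != 0; /* line stays high while any source asserts it */
}
```

The payoff is that one source de-asserting its line no longer masks another source that still holds it high.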
+7
arch/x86/include/asm/kvm_para.h
··· 22 22 #define KVM_FEATURE_CLOCKSOURCE2 3 23 23 #define KVM_FEATURE_ASYNC_PF 4 24 24 #define KVM_FEATURE_STEAL_TIME 5 25 + #define KVM_FEATURE_PV_EOI 6 25 26 26 27 /* The last 8 bits are used to indicate how to interpret the flags field 27 28 * in pvclock structure. If no bits are set, all flags are ignored. ··· 38 37 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 39 38 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 40 39 #define MSR_KVM_STEAL_TIME 0x4b564d03 40 + #define MSR_KVM_PV_EOI_EN 0x4b564d04 41 41 42 42 struct kvm_steal_time { 43 43 __u64 steal; ··· 90 88 __u8 pad[60]; 91 89 __u32 enabled; 92 90 }; 91 + 92 + #define KVM_PV_EOI_BIT 0 93 + #define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) 94 + #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK 95 + #define KVM_PV_EOI_DISABLED 0x0 93 96 94 97 #ifdef __KERNEL__ 95 98 #include <asm/processor.h>
+2
arch/x86/include/asm/processor-flags.h
··· 44 44 */ 45 45 #define X86_CR3_PWT 0x00000008 /* Page Write Through */ 46 46 #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ 47 + #define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ 47 48 48 49 /* 49 50 * Intel CPU features in CR4 ··· 62 61 #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 63 62 #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 64 63 #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ 64 + #define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ 65 65 #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 66 66 #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ 67 67
+6
arch/x86/include/asm/vmx.h
··· 60 60 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 61 61 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 62 62 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 63 + #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 63 64 64 65 65 66 #define PIN_BASED_EXT_INTR_MASK 0x00000001 ··· 282 281 #define EXIT_REASON_EPT_MISCONFIG 49 283 282 #define EXIT_REASON_WBINVD 54 284 283 #define EXIT_REASON_XSETBV 55 284 + #define EXIT_REASON_INVPCID 58 285 285 286 286 /* 287 287 * Interruption-information format ··· 406 404 #define VMX_EPTP_WB_BIT (1ull << 14) 407 405 #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 408 406 #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 407 + #define VMX_EPT_AD_BIT (1ull << 21) 409 408 #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 410 409 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 411 410 #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) ··· 418 415 #define VMX_EPT_MAX_GAW 0x4 419 416 #define VMX_EPT_MT_EPTE_SHIFT 3 420 417 #define VMX_EPT_GAW_EPTP_SHIFT 3 418 + #define VMX_EPT_AD_ENABLE_BIT (1ull << 6) 421 419 #define VMX_EPT_DEFAULT_MT 0x6ull 422 420 #define VMX_EPT_READABLE_MASK 0x1ull 423 421 #define VMX_EPT_WRITABLE_MASK 0x2ull 424 422 #define VMX_EPT_EXECUTABLE_MASK 0x4ull 425 423 #define VMX_EPT_IPAT_BIT (1ull << 6) 424 + #define VMX_EPT_ACCESS_BIT (1ull << 8) 425 + #define VMX_EPT_DIRTY_BIT (1ull << 9) 426 426 427 427 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 428 428
+17
arch/x86/kernel/apic/apic.c
··· 2143 2143 } 2144 2144 2145 2145 /* 2146 + * Override the generic EOI implementation with an optimized version. 2147 + * Only called during early boot when only one CPU is active and with 2148 + * interrupts disabled, so we know this does not race with actual APIC driver 2149 + * use. 2150 + */ 2151 + void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) 2152 + { 2153 + struct apic **drv; 2154 + 2155 + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { 2156 + /* Should happen once for each apic */ 2157 + WARN_ON((*drv)->eoi_write == eoi_write); 2158 + (*drv)->eoi_write = eoi_write; 2159 + } 2160 + } 2161 + 2162 + /* 2146 2163 * Power management 2147 2164 */ 2148 2165 #ifdef CONFIG_PM
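The new apic_set_eoi_write() runs once during early boot and patches every registered APIC driver's eoi_write hook in place. A minimal userspace model of that loop, with a stand-in struct instead of the kernel's struct apic:

```c
/* Model of apic_set_eoi_write(): walk the driver table and install an
 * optimized EOI callback in every entry (the kernel additionally
 * WARN_ONs if a driver already carries the new hook). */
struct apic_drv {
	void (*eoi_write)(unsigned int reg, unsigned int v);
};

static void default_eoi(unsigned int reg, unsigned int v) { (void)reg; (void)v; }
static void pv_eoi(unsigned int reg, unsigned int v) { (void)reg; (void)v; }

static void set_eoi_write(struct apic_drv *drvs, int n,
			  void (*eoi_write)(unsigned int, unsigned int))
{
	int i;

	for (i = 0; i < n; i++)
		drvs[i].eoi_write = eoi_write;
}
```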
+3
arch/x86/kernel/cpu/hypervisor.c
··· 37 37 #endif 38 38 &x86_hyper_vmware, 39 39 &x86_hyper_ms_hyperv, 40 + #ifdef CONFIG_KVM_GUEST 41 + &x86_hyper_kvm, 42 + #endif 40 43 }; 41 44 42 45 const struct hypervisor_x86 *x86_hyper;
+61 -3
arch/x86/kernel/kvm.c
··· 39 39 #include <asm/desc.h> 40 40 #include <asm/tlbflush.h> 41 41 #include <asm/idle.h> 42 + #include <asm/apic.h> 43 + #include <asm/apicdef.h> 44 + #include <asm/hypervisor.h> 42 45 43 46 static int kvmapf = 1; 44 47 ··· 286 283 cpu, __pa(st)); 287 284 } 288 285 286 + static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; 287 + 288 + static void kvm_guest_apic_eoi_write(u32 reg, u32 val) 289 + { 290 + /** 291 + * This relies on __test_and_clear_bit to modify the memory 292 + * in a way that is atomic with respect to the local CPU. 293 + * The hypervisor only accesses this memory from the local CPU so 294 + * there's no need for lock or memory barriers. 295 + * An optimization barrier is implied in apic write. 296 + */ 297 + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) 298 + return; 299 + apic_write(APIC_EOI, APIC_EOI_ACK); 300 + } 301 + 289 302 void __cpuinit kvm_guest_cpu_init(void) 290 303 { 291 304 if (!kvm_para_available()) ··· 319 300 smp_processor_id()); 320 301 } 321 302 303 + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { 304 + unsigned long pa; 305 + /* Size alignment is implied but just to make it explicit. */ 306 + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); 307 + __get_cpu_var(kvm_apic_eoi) = 0; 308 + pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; 309 + wrmsrl(MSR_KVM_PV_EOI_EN, pa); 310 + } 311 + 322 312 if (has_steal_clock) 323 313 kvm_register_steal_time(); 324 314 } 325 315 326 - static void kvm_pv_disable_apf(void *unused) 316 + static void kvm_pv_disable_apf(void) 327 317 { 328 318 if (!__get_cpu_var(apf_reason).enabled) 329 319 return; ··· 344 316 smp_processor_id()); 345 317 } 346 318 319 + static void kvm_pv_guest_cpu_reboot(void *unused) 320 + { 321 + /* 322 + * We disable PV EOI before we load a new kernel by kexec, 323 + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. 324 + * New kernel can re-enable when it boots. 325 + */ 326 + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 327 + wrmsrl(MSR_KVM_PV_EOI_EN, 0); 328 + kvm_pv_disable_apf(); 329 + } 330 + 347 331 static int kvm_pv_reboot_notify(struct notifier_block *nb, 348 332 unsigned long code, void *unused) 349 333 { 350 334 if (code == SYS_RESTART) 351 - on_each_cpu(kvm_pv_disable_apf, NULL, 1); 335 + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); 352 336 return NOTIFY_DONE; 353 337 } 354 338 ··· 411 371 static void kvm_guest_cpu_offline(void *dummy) 412 372 { 413 373 kvm_disable_steal_time(); 414 - kvm_pv_disable_apf(NULL); 374 + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 375 + wrmsrl(MSR_KVM_PV_EOI_EN, 0); 376 + kvm_pv_disable_apf(); 415 377 apf_task_wake_all(); 416 378 } 417 379 ··· 466 424 pv_time_ops.steal_clock = kvm_steal_clock; 467 425 } 468 426 427 + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 428 + apic_set_eoi_write(kvm_guest_apic_eoi_write); 429 + 469 430 #ifdef CONFIG_SMP 470 431 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 471 432 register_cpu_notifier(&kvm_cpu_notifier); ··· 476 431 kvm_guest_cpu_init(); 477 432 #endif 478 433 } 434 + 435 + static bool __init kvm_detect(void) 436 + { 437 + if (!kvm_para_available()) 438 + return false; 439 + return true; 440 + } 441 + 442 + const struct hypervisor_x86 x86_hyper_kvm __refconst = { 443 + .name = "KVM", 444 + .detect = kvm_detect, 445 + }; 446 + EXPORT_SYMBOL_GPL(x86_hyper_kvm); 479 447 480 448 static __init int activate_jump_labels(void) 481 449 {
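The kvm.c hunk is the guest half of the EOI-avoidance feature from the merge summary: the hypervisor sets a pending bit in a per-CPU word, and if the guest can clear that bit it skips the APIC_EOI register write entirely. A userspace model of that fast-path decision, where a counter stands in for the real apic_write():

```c
#define KVM_PV_EOI_BIT 0 /* from the kvm_para.h hunk in this merge */

static int apic_eoi_writes; /* stand-in for the real APIC_EOI register write */

/* Model of kvm_guest_apic_eoi_write(): clearing the pending bit in
 * memory completes the EOI; only when the bit was already clear does
 * the guest fall back to the (vmexit-causing) APIC write. */
static void guest_apic_eoi_write(unsigned long *pv_eoi_word)
{
	unsigned long mask = 1UL << KVM_PV_EOI_BIT;

	if (*pv_eoi_word & mask) {
		*pv_eoi_word &= ~mask; /* fast path: EOI acknowledged in memory */
		return;
	}
	apic_eoi_writes++; /* slow path: real APIC write */
}
```

This is why the earlier bitops.h hunk documents __test_and_clear_bit's local-CPU atomicity: the real implementation uses exactly that primitive on the shared word.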
+26 -20
arch/x86/kvm/cpuid.c
··· 201 201 unsigned f_lm = 0; 202 202 #endif 203 203 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; 204 + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; 204 205 205 206 /* cpuid 1.edx */ 206 207 const u32 kvm_supported_word0_x86_features = ··· 229 228 0 /* DS-CPL, VMX, SMX, EST */ | 230 229 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 231 230 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | 232 - 0 /* Reserved, DCA */ | F(XMM4_1) | 231 + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | 233 232 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 234 233 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | 235 234 F(F16C) | F(RDRAND); ··· 249 248 /* cpuid 7.0.ebx */ 250 249 const u32 kvm_supported_word9_x86_features = 251 250 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 252 - F(BMI2) | F(ERMS) | F(RTM); 251 + F(BMI2) | F(ERMS) | f_invpcid | F(RTM); 253 252 254 253 /* all calls to cpuid_count() should be made on the same cpu */ 255 254 get_cpu(); ··· 410 409 (1 << KVM_FEATURE_NOP_IO_DELAY) | 411 410 (1 << KVM_FEATURE_CLOCKSOURCE2) | 412 411 (1 << KVM_FEATURE_ASYNC_PF) | 412 + (1 << KVM_FEATURE_PV_EOI) | 413 413 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 414 414 415 415 if (sched_info_on()) ··· 641 639 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); 642 640 } 643 641 644 - void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 642 + void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) 645 643 { 646 - u32 function, index; 644 + u32 function = *eax, index = *ecx; 647 645 struct kvm_cpuid_entry2 *best; 648 646 649 - function = kvm_register_read(vcpu, VCPU_REGS_RAX); 650 - index = kvm_register_read(vcpu, VCPU_REGS_RCX); 651 - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 652 - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 653 - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 654 - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 655 647 best = kvm_find_cpuid_entry(vcpu, function, index); 656 648 657 649 if (!best) 658 650 best = check_cpuid_limit(vcpu, function, index); 659 651 660 652 if (best) { 661 - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 662 - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 663 - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 664 - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 665 - } 653 + *eax = best->eax; 654 + *ebx = best->ebx; 655 + *ecx = best->ecx; 656 + *edx = best->edx; 657 + } else 658 + *eax = *ebx = *ecx = *edx = 0; 659 + } 660 + 661 + void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 662 + { 663 + u32 function, eax, ebx, ecx, edx; 664 + 665 + function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 666 + ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 667 + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); 668 + kvm_register_write(vcpu, VCPU_REGS_RAX, eax); 669 + kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); 670 + kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); 671 + kvm_register_write(vcpu, VCPU_REGS_RDX, edx); 672 + kvm_x86_ops->skip_emulated_instruction(vcpu); 673 + trace_kvm_cpuid(function, eax, ebx, ecx, edx); 674 } 675 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
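The cpuid.c refactor splits kvm_emulate_cpuid() into a pure in/out query, kvm_cpuid(), plus a thin wrapper that only moves values between guest registers and the query (so the emulator's new get_cpuid callback can reuse the lookup). A self-contained model of the query side; the two table entries are made-up example leaves, not data from the patch:

```c
#include <stdint.h>
#include <stddef.h>

struct cpuid_entry { uint32_t function, index, eax, ebx, ecx, edx; };

/* Illustrative leaf table; the real code walks the vcpu's cpuid
 * entry array via kvm_find_cpuid_entry(). */
static const struct cpuid_entry cpuid_table[] = {
	{ 0, 0, 0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69 },
	{ 1, 0, 0x000206a7, 0x00000800, 0x80982203, 0x078bfbfd },
};

/* Model of kvm_cpuid(): function/index come in through eax/ecx, the
 * leaf values go out through all four pointers, zeros for a miss. */
static void do_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	uint32_t function = *eax, index = *ecx;
	size_t i;

	for (i = 0; i < sizeof(cpuid_table) / sizeof(cpuid_table[0]); i++) {
		const struct cpuid_entry *e = &cpuid_table[i];

		if (e->function == function && e->index == index) {
			*eax = e->eax;
			*ebx = e->ebx;
			*ecx = e->ecx;
			*edx = e->edx;
			return;
		}
	}
	*eax = *ebx = *ecx = *edx = 0; /* unknown leaf: all zeros, as in the patch */
}
```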
+9
arch/x86/kvm/cpuid.h
··· 17 17 int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 18 18 struct kvm_cpuid2 *cpuid, 19 19 struct kvm_cpuid_entry2 __user *entries); 20 + void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); 20 21 21 22 22 23 static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) ··· 50 49 51 50 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 52 51 return best && (best->ecx & bit(X86_FEATURE_OSVW)); 52 + } 53 + 54 + static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) 55 + { 56 + struct kvm_cpuid_entry2 *best; 57 + 58 + best = kvm_find_cpuid_entry(vcpu, 1, 0); 59 + return best && (best->ecx & bit(X86_FEATURE_PCID)); 53 60 } 54 61 55 62 #endif
+218 -53
arch/x86/kvm/emulate.c
··· 433 433 return ctxt->ops->intercept(ctxt, &info, stage); 434 434 } 435 435 436 + static void assign_masked(ulong *dest, ulong src, ulong mask) 437 + { 438 + *dest = (*dest & ~mask) | (src & mask); 439 + } 440 + 436 441 static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) 437 442 { 438 443 return (1UL << (ctxt->ad_bytes << 3)) - 1; 444 + } 445 + 446 + static ulong stack_mask(struct x86_emulate_ctxt *ctxt) 447 + { 448 + u16 sel; 449 + struct desc_struct ss; 450 + 451 + if (ctxt->mode == X86EMUL_MODE_PROT64) 452 + return ~0UL; 453 + ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); 454 + return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ 455 + } 456 + 457 + static int stack_size(struct x86_emulate_ctxt *ctxt) 458 + { 459 + return (__fls(stack_mask(ctxt)) + 1) >> 3; 439 460 } 440 461 441 462 /* Access/update address held in a register, based on addressing mode. */ ··· 979 958 op->orig_val = op->val; 980 959 } 981 960 961 + static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) 962 + { 963 + if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) 964 + ctxt->modrm_seg = VCPU_SREG_SS; 965 + } 966 + 982 967 static int decode_modrm(struct x86_emulate_ctxt *ctxt, 983 968 struct operand *op) 984 969 { ··· 1088 1061 1089 1062 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 1090 1063 modrm_ea += insn_fetch(s32, ctxt); 1091 - else 1064 + else { 1092 1065 modrm_ea += ctxt->regs[base_reg]; 1066 + adjust_modrm_seg(ctxt, base_reg); 1067 + } 1093 1068 if (index_reg != 4) 1094 1069 modrm_ea += ctxt->regs[index_reg] << scale; 1095 1070 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { 1096 1071 if (ctxt->mode == X86EMUL_MODE_PROT64) 1097 1072 ctxt->rip_relative = 1; 1098 - } else 1099 - modrm_ea += ctxt->regs[ctxt->modrm_rm]; 1073 + } else { 1074 + base_reg = ctxt->modrm_rm; 1075 + modrm_ea += ctxt->regs[base_reg]; 1076 + adjust_modrm_seg(ctxt, base_reg); 1077 + } 1100 1078 switch (ctxt->modrm_mod) { 
1101 1079 case 0: 1102 1080 if (ctxt->modrm_rm == 5) ··· 1296 1264 1297 1265 /* allowed just for 8 bytes segments */ 1298 1266 static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1299 - u16 selector, struct desc_struct *desc) 1267 + u16 selector, struct desc_struct *desc, 1268 + ulong *desc_addr_p) 1300 1269 { 1301 1270 struct desc_ptr dt; 1302 1271 u16 index = selector >> 3; ··· 1308 1275 if (dt.size < index * 8 + 7) 1309 1276 return emulate_gp(ctxt, selector & 0xfffc); 1310 1277 1311 - addr = dt.address + index * 8; 1278 + *desc_addr_p = addr = dt.address + index * 8; 1312 1279 return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, 1313 1280 &ctxt->exception); 1314 1281 } ··· 1335 1302 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1336 1303 u16 selector, int seg) 1337 1304 { 1338 - struct desc_struct seg_desc; 1305 + struct desc_struct seg_desc, old_desc; 1339 1306 u8 dpl, rpl, cpl; 1340 1307 unsigned err_vec = GP_VECTOR; 1341 1308 u32 err_code = 0; 1342 1309 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1310 + ulong desc_addr; 1343 1311 int ret; 1344 1312 1345 1313 memset(&seg_desc, 0, sizeof seg_desc); ··· 1358 1324 goto load; 1359 1325 } 1360 1326 1361 - /* NULL selector is not valid for TR, CS and SS */ 1362 - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 1327 + rpl = selector & 3; 1328 + cpl = ctxt->ops->cpl(ctxt); 1329 + 1330 + /* NULL selector is not valid for TR, CS and SS (except for long mode) */ 1331 + if ((seg == VCPU_SREG_CS 1332 + || (seg == VCPU_SREG_SS 1333 + && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) 1334 + || seg == VCPU_SREG_TR) 1363 1335 && null_selector) 1364 1336 goto exception; 1365 1337 ··· 1376 1336 if (null_selector) /* for NULL selector skip all following checks */ 1377 1337 goto load; 1378 1338 1379 - ret = read_segment_descriptor(ctxt, selector, &seg_desc); 1339 + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); 1380 
1340 if (ret != X86EMUL_CONTINUE) 1381 1341 return ret; 1382 1342 ··· 1392 1352 goto exception; 1393 1353 } 1394 1354 1395 - rpl = selector & 3; 1396 1355 dpl = seg_desc.dpl; 1397 - cpl = ctxt->ops->cpl(ctxt); 1398 1356 1399 1357 switch (seg) { 1400 1358 case VCPU_SREG_SS: ··· 1422 1384 case VCPU_SREG_TR: 1423 1385 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) 1424 1386 goto exception; 1387 + old_desc = seg_desc; 1388 + seg_desc.type |= 2; /* busy */ 1389 + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, 1390 + sizeof(seg_desc), &ctxt->exception); 1391 + if (ret != X86EMUL_CONTINUE) 1392 + return ret; 1425 1393 break; 1426 1394 case VCPU_SREG_LDTR: 1427 1395 if (seg_desc.s || seg_desc.type != 2) ··· 1518 1474 return X86EMUL_CONTINUE; 1519 1475 } 1520 1476 1521 - static int em_push(struct x86_emulate_ctxt *ctxt) 1477 + static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) 1522 1478 { 1523 1479 struct segmented_address addr; 1524 1480 1525 - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); 1481 + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); 1526 1482 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1527 1483 addr.seg = VCPU_SREG_SS; 1528 1484 1485 + return segmented_write(ctxt, addr, data, bytes); 1486 + } 1487 + 1488 + static int em_push(struct x86_emulate_ctxt *ctxt) 1489 + { 1529 1490 /* Disable writeback. 
*/ 1530 1491 ctxt->dst.type = OP_NONE; 1531 - return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); 1492 + return push(ctxt, &ctxt->src.val, ctxt->op_bytes); 1532 1493 } 1533 1494 1534 1495 static int emulate_pop(struct x86_emulate_ctxt *ctxt, ··· 1603 1554 ctxt->dst.addr.reg = &ctxt->eflags; 1604 1555 ctxt->dst.bytes = ctxt->op_bytes; 1605 1556 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); 1557 + } 1558 + 1559 + static int em_enter(struct x86_emulate_ctxt *ctxt) 1560 + { 1561 + int rc; 1562 + unsigned frame_size = ctxt->src.val; 1563 + unsigned nesting_level = ctxt->src2.val & 31; 1564 + 1565 + if (nesting_level) 1566 + return X86EMUL_UNHANDLEABLE; 1567 + 1568 + rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); 1569 + if (rc != X86EMUL_CONTINUE) 1570 + return rc; 1571 + assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], 1572 + stack_mask(ctxt)); 1573 + assign_masked(&ctxt->regs[VCPU_REGS_RSP], 1574 + ctxt->regs[VCPU_REGS_RSP] - frame_size, 1575 + stack_mask(ctxt)); 1576 + return X86EMUL_CONTINUE; 1577 + } 1578 + 1579 + static int em_leave(struct x86_emulate_ctxt *ctxt) 1580 + { 1581 + assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], 1582 + stack_mask(ctxt)); 1583 + return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); 1606 1584 } 1607 1585 1608 1586 static int em_push_sreg(struct x86_emulate_ctxt *ctxt) ··· 2069 1993 u32 eax, ebx, ecx, edx; 2070 1994 2071 1995 eax = ecx = 0; 2072 - return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) 2073 - && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 1996 + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 1997 + return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 2074 1998 && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 2075 1999 && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; 2076 2000 } ··· 2089 2013 2090 2014 eax = 0x00000000; 2091 2015 ecx = 0x00000000; 2092 - if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { 2093 - /* 
2094 - * Intel ("GenuineIntel") 2095 - * remark: Intel CPUs only support "syscall" in 64bit 2096 - * longmode. Also an 64bit guest with a 2097 - * 32bit compat-app running will #UD !! While this 2098 - * behaviour can be fixed (by emulating) into AMD 2099 - * response - CPUs of AMD can't behave like Intel. 2100 - */ 2101 - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && 2102 - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && 2103 - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) 2104 - return false; 2016 + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 2017 + /* 2018 + * Intel ("GenuineIntel") 2019 + * remark: Intel CPUs only support "syscall" in 64bit 2020 + * longmode. Also an 64bit guest with a 2021 + * 32bit compat-app running will #UD !! While this 2022 + * behaviour can be fixed (by emulating) into AMD 2023 + * response - CPUs of AMD can't behave like Intel. 2024 + */ 2025 + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && 2026 + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && 2027 + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) 2028 + return false; 2105 2029 2106 - /* AMD ("AuthenticAMD") */ 2107 - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && 2108 - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && 2109 - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) 2110 - return true; 2030 + /* AMD ("AuthenticAMD") */ 2031 + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && 2032 + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && 2033 + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) 2034 + return true; 2111 2035 2112 - /* AMD ("AMDisbetter!") */ 2113 - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && 2114 - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && 2115 - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) 2116 - return true; 2117 - } 2036 + /* AMD ("AMDisbetter!") */ 2037 + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && 2038 + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && 2039 + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) 2040 + return true; 2118 2041 
2119 2042 /* default: (not Intel, not AMD), apply Intel's stricter rules... */ 2120 2043 return false; ··· 2622 2547 ulong old_tss_base = 2623 2548 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); 2624 2549 u32 desc_limit; 2550 + ulong desc_addr; 2625 2551 2626 2552 /* FIXME: old_tss_base == ~0 ? */ 2627 2553 2628 - ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); 2554 + ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); 2629 2555 if (ret != X86EMUL_CONTINUE) 2630 2556 return ret; 2631 - ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); 2557 + ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); 2632 2558 if (ret != X86EMUL_CONTINUE) 2633 2559 return ret; 2634 2560 ··· 3024 2948 return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); 3025 2949 } 3026 2950 2951 + static int em_lldt(struct x86_emulate_ctxt *ctxt) 2952 + { 2953 + u16 sel = ctxt->src.val; 2954 + 2955 + /* Disable writeback. */ 2956 + ctxt->dst.type = OP_NONE; 2957 + return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); 2958 + } 2959 + 2960 + static int em_ltr(struct x86_emulate_ctxt *ctxt) 2961 + { 2962 + u16 sel = ctxt->src.val; 2963 + 2964 + /* Disable writeback. */ 2965 + ctxt->dst.type = OP_NONE; 2966 + return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); 2967 + } 2968 + 3027 2969 static int em_invlpg(struct x86_emulate_ctxt *ctxt) 3028 2970 { 3029 2971 int rc; ··· 3083 2989 return X86EMUL_CONTINUE; 3084 2990 } 3085 2991 2992 + static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, 2993 + void (*get)(struct x86_emulate_ctxt *ctxt, 2994 + struct desc_ptr *ptr)) 2995 + { 2996 + struct desc_ptr desc_ptr; 2997 + 2998 + if (ctxt->mode == X86EMUL_MODE_PROT64) 2999 + ctxt->op_bytes = 8; 3000 + get(ctxt, &desc_ptr); 3001 + if (ctxt->op_bytes == 2) { 3002 + ctxt->op_bytes = 4; 3003 + desc_ptr.address &= 0x00ffffff; 3004 + } 3005 + /* Disable writeback. 
*/ 3006 + ctxt->dst.type = OP_NONE; 3007 + return segmented_write(ctxt, ctxt->dst.addr.mem, 3008 + &desc_ptr, 2 + ctxt->op_bytes); 3009 + } 3010 + 3011 + static int em_sgdt(struct x86_emulate_ctxt *ctxt) 3012 + { 3013 + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); 3014 + } 3015 + 3016 + static int em_sidt(struct x86_emulate_ctxt *ctxt) 3017 + { 3018 + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); 3019 + } 3020 + 3086 3021 static int em_lgdt(struct x86_emulate_ctxt *ctxt) 3087 3022 { 3088 3023 struct desc_ptr desc_ptr; 3089 3024 int rc; 3090 3025 3026 + if (ctxt->mode == X86EMUL_MODE_PROT64) 3027 + ctxt->op_bytes = 8; 3091 3028 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3092 3029 &desc_ptr.size, &desc_ptr.address, 3093 3030 ctxt->op_bytes); ··· 3146 3021 struct desc_ptr desc_ptr; 3147 3022 int rc; 3148 3023 3024 + if (ctxt->mode == X86EMUL_MODE_PROT64) 3025 + ctxt->op_bytes = 8; 3149 3026 rc = read_descriptor(ctxt, ctxt->src.addr.mem, 3150 3027 &desc_ptr.size, &desc_ptr.address, 3151 3028 ctxt->op_bytes); ··· 3267 3140 static int em_bsr(struct x86_emulate_ctxt *ctxt) 3268 3141 { 3269 3142 emulate_2op_SrcV_nobyte(ctxt, "bsr"); 3143 + return X86EMUL_CONTINUE; 3144 + } 3145 + 3146 + static int em_cpuid(struct x86_emulate_ctxt *ctxt) 3147 + { 3148 + u32 eax, ebx, ecx, edx; 3149 + 3150 + eax = ctxt->regs[VCPU_REGS_RAX]; 3151 + ecx = ctxt->regs[VCPU_REGS_RCX]; 3152 + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3153 + ctxt->regs[VCPU_REGS_RAX] = eax; 3154 + ctxt->regs[VCPU_REGS_RBX] = ebx; 3155 + ctxt->regs[VCPU_REGS_RCX] = ecx; 3156 + ctxt->regs[VCPU_REGS_RDX] = edx; 3157 + return X86EMUL_CONTINUE; 3158 + } 3159 + 3160 + static int em_lahf(struct x86_emulate_ctxt *ctxt) 3161 + { 3162 + ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; 3163 + ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; 3164 + return X86EMUL_CONTINUE; 3165 + } 3166 + 3167 + static int em_bswap(struct x86_emulate_ctxt *ctxt) 3168 + { 3169 + switch (ctxt->op_bytes) { 3170 
+ #ifdef CONFIG_X86_64 3171 + case 8: 3172 + asm("bswap %0" : "+r"(ctxt->dst.val)); 3173 + break; 3174 + #endif 3175 + default: 3176 + asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); 3177 + break; 3178 + } 3270 3179 return X86EMUL_CONTINUE; 3271 3180 } 3272 3181 ··· 3587 3424 static struct opcode group6[] = { 3588 3425 DI(Prot, sldt), 3589 3426 DI(Prot, str), 3590 - DI(Prot | Priv, lldt), 3591 - DI(Prot | Priv, ltr), 3427 + II(Prot | Priv | SrcMem16, em_lldt, lldt), 3428 + II(Prot | Priv | SrcMem16, em_ltr, ltr), 3592 3429 N, N, N, N, 3593 3430 }; 3594 3431 3595 3432 static struct group_dual group7 = { { 3596 - DI(Mov | DstMem | Priv, sgdt), 3597 - DI(Mov | DstMem | Priv, sidt), 3433 + II(Mov | DstMem | Priv, em_sgdt, sgdt), 3434 + II(Mov | DstMem | Priv, em_sidt, sidt), 3598 3435 II(SrcMem | Priv, em_lgdt, lgdt), 3599 3436 II(SrcMem | Priv, em_lidt, lidt), 3600 3437 II(SrcNone | DstMem | Mov, em_smsw, smsw), N, ··· 3701 3538 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3702 3539 I(SrcImmFAddr | No64, em_call_far), N, 3703 3540 II(ImplicitOps | Stack, em_pushf, pushf), 3704 - II(ImplicitOps | Stack, em_popf, popf), N, N, 3541 + II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), 3705 3542 /* 0xA0 - 0xA7 */ 3706 3543 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3707 3544 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), ··· 3724 3561 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), 3725 3562 G(ByteOp, group11), G(0, group11), 3726 3563 /* 0xC8 - 0xCF */ 3727 - N, N, N, I(ImplicitOps | Stack, em_ret_far), 3564 + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), 3565 + N, I(ImplicitOps | Stack, em_ret_far), 3728 3566 D(ImplicitOps), DI(SrcImmByte, intn), 3729 3567 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3730 3568 /* 0xD0 - 0xD7 */ ··· 3799 3635 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3800 3636 /* 0xA0 - 0xA7 */ 3801 3637 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 
3802 - DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3638 + II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3803 3639 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3804 3640 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3805 3641 /* 0xA8 - 0xAF */ ··· 3822 3658 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 3823 3659 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 3824 3660 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3825 - /* 0xC0 - 0xCF */ 3661 + /* 0xC0 - 0xC7 */ 3826 3662 D2bv(DstMem | SrcReg | ModRM | Lock), 3827 3663 N, D(DstMem | SrcReg | ModRM | Mov), 3828 3664 N, N, N, GD(0, &group9), 3829 - N, N, N, N, N, N, N, N, 3665 + /* 0xC8 - 0xCF */ 3666 + X8(I(DstReg, em_bswap)), 3830 3667 /* 0xD0 - 0xDF */ 3831 3668 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3832 3669 /* 0xE0 - 0xEF */ ··· 4591 4426 break; 4592 4427 case 0xb6 ... 0xb7: /* movzx */ 4593 4428 ctxt->dst.bytes = ctxt->op_bytes; 4594 - ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val 4429 + ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val 4595 4430 : (u16) ctxt->src.val; 4596 4431 break; 4597 4432 case 0xbe ... 0xbf: /* movsx */ 4598 4433 ctxt->dst.bytes = ctxt->op_bytes; 4599 - ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : 4434 + ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : 4600 4435 (s16) ctxt->src.val; 4601 4436 break; 4602 4437 case 0xc0 ... 0xc1: /* xadd */
+14 -3
arch/x86/kvm/i8259.c
··· 188 188 pic_unlock(s); 189 189 } 190 190 191 - int kvm_pic_set_irq(void *opaque, int irq, int level) 191 + int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) 192 192 { 193 - struct kvm_pic *s = opaque; 194 193 int ret = -1; 195 194 196 195 pic_lock(s); 197 196 if (irq >= 0 && irq < PIC_NUM_PINS) { 198 - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 197 + int irq_level = __kvm_irq_line_state(&s->irq_states[irq], 198 + irq_source_id, level); 199 + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); 199 200 pic_update_irq(s); 200 201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 201 202 s->pics[irq >> 3].imr, ret == 0); ··· 204 203 pic_unlock(s); 205 204 206 205 return ret; 206 + } 207 + 208 + void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) 209 + { 210 + int i; 211 + 212 + pic_lock(s); 213 + for (i = 0; i < PIC_NUM_PINS; i++) 214 + __clear_bit(irq_source_id, &s->irq_states[i]); 215 + pic_unlock(s); 207 216 } 208 217 209 218 /*
+188 -6
arch/x86/kvm/lapic.c
··· 107 107 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 108 108 } 109 109 110 + static inline int __apic_test_and_set_vector(int vec, void *bitmap) 111 + { 112 + return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 113 + } 114 + 115 + static inline int __apic_test_and_clear_vector(int vec, void *bitmap) 116 + { 117 + return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 118 + } 119 + 110 120 static inline int apic_hw_enabled(struct kvm_lapic *apic) 111 121 { 112 122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; ··· 220 210 return fls(word[word_offset << 2]) - 1 + (word_offset << 5); 221 211 } 222 212 213 + static u8 count_vectors(void *bitmap) 214 + { 215 + u32 *word = bitmap; 216 + int word_offset; 217 + u8 count = 0; 218 + for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) 219 + count += hweight32(word[word_offset << 2]); 220 + return count; 221 + } 222 + 223 223 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 224 224 { 225 225 apic->irr_pending = true; ··· 262 242 apic->irr_pending = true; 263 243 } 264 244 245 + static inline void apic_set_isr(int vec, struct kvm_lapic *apic) 246 + { 247 + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) 248 + ++apic->isr_count; 249 + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); 250 + /* 251 + * ISR (in service register) bit is set when injecting an interrupt. 252 + * The highest vector is injected. Thus the latest bit set matches 253 + * the highest bit in ISR. 
254 + */ 255 + apic->highest_isr_cache = vec; 256 + } 257 + 258 + static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) 259 + { 260 + if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) 261 + --apic->isr_count; 262 + BUG_ON(apic->isr_count < 0); 263 + apic->highest_isr_cache = -1; 264 + } 265 + 265 266 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 266 267 { 267 268 struct kvm_lapic *apic = vcpu->arch.apic; ··· 311 270 irq->level, irq->trig_mode); 312 271 } 313 272 273 + static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) 274 + { 275 + 276 + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, 277 + sizeof(val)); 278 + } 279 + 280 + static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) 281 + { 282 + 283 + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, 284 + sizeof(*val)); 285 + } 286 + 287 + static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) 288 + { 289 + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; 290 + } 291 + 292 + static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) 293 + { 294 + u8 val; 295 + if (pv_eoi_get_user(vcpu, &val) < 0) 296 + apic_debug("Can't read EOI MSR value: 0x%llx\n", 297 + (unsigned long long)vcpi->arch.pv_eoi.msr_val); 298 + return val & 0x1; 299 + } 300 + 301 + static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) 302 + { 303 + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { 304 + apic_debug("Can't set EOI MSR value: 0x%llx\n", 305 + (unsigned long long)vcpi->arch.pv_eoi.msr_val); 306 + return; 307 + } 308 + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 309 + } 310 + 311 + static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) 312 + { 313 + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { 314 + apic_debug("Can't clear EOI MSR value: 0x%llx\n", 315 + (unsigned long long)vcpi->arch.pv_eoi.msr_val); 316 + return; 317 + } 318 + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 319 + } 320 + 314 321 static 
254 + */ 255 + apic->highest_isr_cache = vec; 256 + } 257 + 258 + static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) 259 + { 260 + if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) 261 + --apic->isr_count; 262 + BUG_ON(apic->isr_count < 0); 263 + apic->highest_isr_cache = -1; 264 + } 265 + 265 266 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 266 267 { 267 268 struct kvm_lapic *apic = vcpu->arch.apic; ··· 311 270 irq->level, irq->trig_mode); 312 271 } 313 272 273 + static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) 274 + { 275 + 276 + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, 277 + sizeof(val)); 278 + } 279 + 280 + static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) 281 + { 282 + 283 + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, 284 + sizeof(*val)); 285 + } 286 + 287 + static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) 288 + { 289 + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; 290 + } 291 + 292 + static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) 293 + { 294 + u8 val; 295 + if (pv_eoi_get_user(vcpu, &val) < 0) 296 + apic_debug("Can't read EOI MSR value: 0x%llx\n", 297 + (unsigned long long)vcpu->arch.pv_eoi.msr_val); 298 + return val & 0x1; 299 + } 300 + 301 + static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) 302 + { 303 + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { 304 + apic_debug("Can't set EOI MSR value: 0x%llx\n", 305 + (unsigned long long)vcpu->arch.pv_eoi.msr_val); 306 + return; 307 + } 308 + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 309 + } 310 + 311 + static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) 312 + { 313 + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { 314 + apic_debug("Can't clear EOI MSR value: 0x%llx\n", 315 + (unsigned long long)vcpu->arch.pv_eoi.msr_val); 316 + return; 317 + } 318 + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 319 + } 320 + 314 321 static
inline int apic_find_highest_isr(struct kvm_lapic *apic) 315 322 { 316 323 int result; 324 + if (!apic->isr_count) 325 + return -1; 326 + if (likely(apic->highest_isr_cache != -1)) 327 + return apic->highest_isr_cache; 317 328 318 329 result = find_highest_vector(apic->regs + APIC_ISR); 319 330 ASSERT(result == -1 || result >= 16); ··· 575 482 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 576 483 } 577 484 578 - static void apic_set_eoi(struct kvm_lapic *apic) 485 + static int apic_set_eoi(struct kvm_lapic *apic) 579 486 { 580 487 int vector = apic_find_highest_isr(apic); 488 + 489 + trace_kvm_eoi(apic, vector); 490 + 581 491 /* 582 492 * Not every write EOI will has corresponding ISR, 583 493 * one example is when Kernel check timer on setup_IO_APIC 584 494 */ 585 495 if (vector == -1) 586 - return; 496 + return vector; 587 497 588 - apic_clear_vector(vector, apic->regs + APIC_ISR); 498 + apic_clear_isr(vector, apic); 589 499 apic_update_ppr(apic); 590 500 591 501 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && ··· 601 505 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 602 506 } 603 507 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 508 + return vector; 604 509 } 605 510 606 511 static void apic_send_ipi(struct kvm_lapic *apic) ··· 1178 1081 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1179 1082 } 1180 1083 apic->irr_pending = false; 1084 + apic->isr_count = 0; 1085 + apic->highest_isr_cache = -1; 1181 1086 update_divide_count(apic); 1182 1087 atomic_set(&apic->lapic_timer.pending, 0); 1183 1088 if (kvm_vcpu_is_bsp(vcpu)) 1184 1089 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1090 + vcpu->arch.pv_eoi.msr_val = 0; 1185 1091 apic_update_ppr(apic); 1186 1092 1187 1093 vcpu->arch.apic_arb_prio = 0; ··· 1348 1248 if (vector == -1) 1349 1249 return -1; 1350 1250 1351 - apic_set_vector(vector, apic->regs + APIC_ISR); 1251 + apic_set_isr(vector, apic); 1352 1252 apic_update_ppr(apic); 1353 1253 apic_clear_irr(vector, apic); 
1354 1254 return vector; ··· 1367 1267 update_divide_count(apic); 1368 1268 start_apic_timer(apic); 1369 1269 apic->irr_pending = true; 1270 + apic->isr_count = count_vectors(apic->regs + APIC_ISR); 1271 + apic->highest_isr_cache = -1; 1370 1272 kvm_make_request(KVM_REQ_EVENT, vcpu); 1371 1273 } 1372 1274 ··· 1385 1283 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1386 1284 } 1387 1285 1286 + /* 1287 + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt 1288 + * 1289 + * Detect whether guest triggered PV EOI since the 1290 + * last entry. If yes, set EOI on guests's behalf. 1291 + * Clear PV EOI in guest memory in any case. 1292 + */ 1293 + static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, 1294 + struct kvm_lapic *apic) 1295 + { 1296 + bool pending; 1297 + int vector; 1298 + /* 1299 + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host 1300 + * and KVM_PV_EOI_ENABLED in guest memory as follows: 1301 + * 1302 + * KVM_APIC_PV_EOI_PENDING is unset: 1303 + * -> host disabled PV EOI. 1304 + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: 1305 + * -> host enabled PV EOI, guest did not execute EOI yet. 1306 + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: 1307 + * -> host enabled PV EOI, guest executed EOI. 1308 + */ 1309 + BUG_ON(!pv_eoi_enabled(vcpu)); 1310 + pending = pv_eoi_get_pending(vcpu); 1311 + /* 1312 + * Clear pending bit in any case: it will be set again on vmentry. 1313 + * While this might not be ideal from performance point of view, 1314 + * this makes sure pv eoi is only enabled when we know it's safe. 
1315 + */ 1316 + pv_eoi_clr_pending(vcpu); 1317 + if (pending) 1318 + return; 1319 + vector = apic_set_eoi(apic); 1320 + trace_kvm_pv_eoi(apic, vector); 1321 + } 1322 + 1388 1323 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) 1389 1324 { 1390 1325 u32 data; 1391 1326 void *vapic; 1327 + 1328 + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) 1329 + apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); 1392 1330 1393 1331 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1394 1332 return; ··· 1440 1298 apic_set_tpr(vcpu->arch.apic, data & 0xff); 1441 1299 } 1442 1300 1301 + /* 1302 + * apic_sync_pv_eoi_to_guest - called before vmentry 1303 + * 1304 + * Detect whether it's safe to enable PV EOI and 1305 + * if yes do so. 1306 + */ 1307 + static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, 1308 + struct kvm_lapic *apic) 1309 + { 1310 + if (!pv_eoi_enabled(vcpu) || 1311 + /* IRR set or many bits in ISR: could be nested. */ 1312 + apic->irr_pending || 1313 + /* Cache not set: could be safe but we don't bother. */ 1314 + apic->highest_isr_cache == -1 || 1315 + /* Need EOI to update ioapic. */ 1316 + kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { 1317 + /* 1318 + * PV EOI was disabled by apic_sync_pv_eoi_from_guest 1319 + * so we need not do anything here. 
1320 + */ 1321 + return; 1322 + } 1323 + 1324 + pv_eoi_set_pending(apic->vcpu); 1325 + } 1326 + 1443 1327 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) 1444 1328 { 1445 1329 u32 data, tpr; 1446 1330 int max_irr, max_isr; 1447 - struct kvm_lapic *apic; 1331 + struct kvm_lapic *apic = vcpu->arch.apic; 1448 1332 void *vapic; 1333 + 1334 + apic_sync_pv_eoi_to_guest(vcpu, apic); 1449 1335 1450 1336 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1451 1337 return; 1452 1338 1453 - apic = vcpu->arch.apic; 1454 1339 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1455 1340 max_irr = apic_find_highest_irr(apic); 1456 1341 if (max_irr < 0) ··· 1562 1393 *data = (((u64)high) << 32) | low; 1563 1394 1564 1395 return 0; 1396 + } 1397 + 1398 + int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) 1399 + { 1400 + u64 addr = data & ~KVM_MSR_ENABLED; 1401 + if (!IS_ALIGNED(addr, 4)) 1402 + return 1; 1403 + 1404 + vcpu->arch.pv_eoi.msr_val = data; 1405 + if (!pv_eoi_enabled(vcpu)) 1406 + return 0; 1407 + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, 1408 + addr); 1565 1409 }
+11
arch/x86/kvm/lapic.h
··· 13 13 u32 divide_count; 14 14 struct kvm_vcpu *vcpu; 15 15 bool irr_pending; 16 + /* Number of bits set in ISR. */ 17 + s16 isr_count; 18 + /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ 19 + int highest_isr_cache; 20 + /** 21 + * APIC register page. The layout matches the register layout seen by 22 + * the guest 1:1, because it is accessed by the vmx microcode. 23 + * Note: Only one register, the TPR, is used by the microcode. 24 + */ 16 25 void *regs; 17 26 gpa_t vapic_addr; 18 27 struct page *vapic_page; ··· 69 60 { 70 61 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; 71 62 } 63 + 64 + int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); 72 65 #endif
+261 -98
arch/x86/kvm/mmu.c
··· 90 90 91 91 #define PTE_PREFETCH_NUM 8 92 92 93 - #define PT_FIRST_AVAIL_BITS_SHIFT 9 93 + #define PT_FIRST_AVAIL_BITS_SHIFT 10 94 94 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 95 95 96 96 #define PT64_LEVEL_BITS 9 ··· 145 145 #define CREATE_TRACE_POINTS 146 146 #include "mmutrace.h" 147 147 148 - #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 148 + #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 149 + #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) 149 150 150 151 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 151 152 ··· 189 188 static u64 __read_mostly shadow_mmio_mask; 190 189 191 190 static void mmu_spte_set(u64 *sptep, u64 spte); 191 + static void mmu_free_roots(struct kvm_vcpu *vcpu); 192 192 193 193 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 194 194 { ··· 446 444 } 447 445 #endif 448 446 447 + static bool spte_is_locklessly_modifiable(u64 spte) 448 + { 449 + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); 450 + } 451 + 449 452 static bool spte_has_volatile_bits(u64 spte) 450 453 { 454 + /* 455 + * Always atomicly update spte if it can be updated 456 + * out of mmu-lock, it can ensure dirty bit is not lost, 457 + * also, it can help us to get a stable is_writable_pte() 458 + * to ensure tlb flush is not missed. 459 + */ 460 + if (spte_is_locklessly_modifiable(spte)) 461 + return true; 462 + 451 463 if (!shadow_accessed_mask) 452 464 return false; 453 465 ··· 494 478 495 479 /* Rules for using mmu_spte_update: 496 480 * Update the state bits, it means the mapped pfn is not changged. 481 + * 482 + * Whenever we overwrite a writable spte with a read-only one we 483 + * should flush remote TLBs. Otherwise rmap_write_protect 484 + * will find a read-only spte, even though the writable spte 485 + * might be cached on a CPU's TLB, the return value indicates this 486 + * case. 
497 487 */ 498 - static void mmu_spte_update(u64 *sptep, u64 new_spte) 488 + static bool mmu_spte_update(u64 *sptep, u64 new_spte) 499 489 { 500 - u64 mask, old_spte = *sptep; 490 + u64 old_spte = *sptep; 491 + bool ret = false; 501 492 502 493 WARN_ON(!is_rmap_spte(new_spte)); 503 494 504 - if (!is_shadow_present_pte(old_spte)) 505 - return mmu_spte_set(sptep, new_spte); 495 + if (!is_shadow_present_pte(old_spte)) { 496 + mmu_spte_set(sptep, new_spte); 497 + return ret; 498 + } 506 499 507 - new_spte |= old_spte & shadow_dirty_mask; 508 - 509 - mask = shadow_accessed_mask; 510 - if (is_writable_pte(old_spte)) 511 - mask |= shadow_dirty_mask; 512 - 513 - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 500 + if (!spte_has_volatile_bits(old_spte)) 514 501 __update_clear_spte_fast(sptep, new_spte); 515 502 else 516 503 old_spte = __update_clear_spte_slow(sptep, new_spte); 517 504 505 + /* 506 + * Updating the spte out of mmu-lock is safe, since 507 + * we always atomically update it, see the comments in 508 + * spte_has_volatile_bits(). 
509 + */ 510 + if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) 511 + ret = true; 512 + 518 513 if (!shadow_accessed_mask) 519 - return; 514 + return ret; 520 515 521 516 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 522 517 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 523 518 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 524 519 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 520 + 521 + return ret; 525 522 } 526 523 527 524 /* ··· 681 652 mmu_page_header_cache); 682 653 } 683 654 684 - static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, 685 - size_t size) 655 + static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 686 656 { 687 657 void *p; 688 658 ··· 692 664 693 665 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) 694 666 { 695 - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, 696 - sizeof(struct pte_list_desc)); 667 + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); 697 668 } 698 669 699 670 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) ··· 1078 1051 rmap_remove(kvm, sptep); 1079 1052 } 1080 1053 1081 - static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) 1054 + 1055 + static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) 1056 + { 1057 + if (is_large_pte(*sptep)) { 1058 + WARN_ON(page_header(__pa(sptep))->role.level == 1059 + PT_PAGE_TABLE_LEVEL); 1060 + drop_spte(kvm, sptep); 1061 + --kvm->stat.lpages; 1062 + return true; 1063 + } 1064 + 1065 + return false; 1066 + } 1067 + 1068 + static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1069 + { 1070 + if (__drop_large_spte(vcpu->kvm, sptep)) 1071 + kvm_flush_remote_tlbs(vcpu->kvm); 1072 + } 1073 + 1074 + /* 1075 + * Write-protect the specified @sptep; @pt_protect indicates whether 1076 + * spte write-protection is caused by protecting the shadow page table. 
1077 + * @flush indicates whether the tlb needs to be flushed. 1078 + * 1079 + * Note: write protection is different between dirty logging and spte 1080 + * protection: 1081 + * - for dirty logging, the spte can be set to writable at any time if 1082 + * its dirty bitmap is properly set. 1083 + * - for spte protection, the spte can be writable only after unsync-ing 1084 + * the shadow page. 1085 + * 1086 + * Return true if the spte is dropped. 1087 + */ 1088 + static bool 1089 + spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) 1090 + { 1091 + u64 spte = *sptep; 1092 + 1093 + if (!is_writable_pte(spte) && 1094 + !(pt_protect && spte_is_locklessly_modifiable(spte))) 1095 + return false; 1096 + 1097 + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1098 + 1099 + if (__drop_large_spte(kvm, sptep)) { 1100 + *flush |= true; 1101 + return true; 1102 + } 1103 + 1104 + if (pt_protect) 1105 + spte &= ~SPTE_MMU_WRITEABLE; 1106 + spte = spte & ~PT_WRITABLE_MASK; 1107 + 1108 + *flush |= mmu_spte_update(sptep, spte); 1109 + return false; 1110 + } 1111 + 1112 + static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, 1113 + int level, bool pt_protect) 1082 1114 { 1083 1115 u64 *sptep; 1084 1116 struct rmap_iterator iter; 1085 - int write_protected = 0; 1117 + bool flush = false; 1086 1118 1087 1119 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1088 1120 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1089 - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1090 - 1091 - if (!is_writable_pte(*sptep)) { 1092 - sptep = rmap_get_next(&iter); 1121 + if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { 1122 + sptep = rmap_get_first(*rmapp, &iter); 1093 1123 continue; 1094 1124 } 1095 1125 1096 - if (level == PT_PAGE_TABLE_LEVEL) { 1097 - mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); 1098 - sptep = rmap_get_next(&iter); 1099 - } else { 1100 - BUG_ON(!is_large_pte(*sptep)); 1101 - drop_spte(kvm, sptep); 1102 - 
--kvm->stat.lpages; 1103 - sptep = rmap_get_first(*rmapp, &iter); 1104 - } 1105 - 1106 - write_protected = 1; 1126 + sptep = rmap_get_next(&iter); 1107 1127 } 1108 1128 1109 - return write_protected; 1129 + return flush; 1110 1130 } 1111 1131 1112 1132 /** ··· 1174 1100 1175 1101 while (mask) { 1176 1102 rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; 1177 - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); 1103 + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); 1178 1104 1179 1105 /* clear the first set bit */ 1180 1106 mask &= mask - 1; 1181 1107 } 1182 1108 } 1183 1109 1184 - static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1110 + static bool rmap_write_protect(struct kvm *kvm, u64 gfn) 1185 1111 { 1186 1112 struct kvm_memory_slot *slot; 1187 1113 unsigned long *rmapp; 1188 1114 int i; 1189 - int write_protected = 0; 1115 + bool write_protected = false; 1190 1116 1191 1117 slot = gfn_to_memslot(kvm, gfn); 1192 1118 1193 1119 for (i = PT_PAGE_TABLE_LEVEL; 1194 1120 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1195 1121 rmapp = __gfn_to_rmap(gfn, i, slot); 1196 - write_protected |= __rmap_write_protect(kvm, rmapp, i); 1122 + write_protected |= __rmap_write_protect(kvm, rmapp, i, true); 1197 1123 } 1198 1124 1199 1125 return write_protected; ··· 1312 1238 unsigned long data) 1313 1239 { 1314 1240 u64 *sptep; 1315 - struct rmap_iterator iter; 1241 + struct rmap_iterator uninitialized_var(iter); 1316 1242 int young = 0; 1317 1243 1318 1244 /* 1319 - * Emulate the accessed bit for EPT, by checking if this page has 1245 + * In the absence of EPT Access and Dirty Bits support, 1246 + * emulate the accessed bit for EPT, by checking if this page has 1320 1247 * an EPT mapping, and clearing it if it does. On the next access, 
1322 1249 * This has some overhead, but not as much as the cost of swapping ··· 1328 1253 1329 1254 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1330 1255 sptep = rmap_get_next(&iter)) { 1331 - BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1256 + BUG_ON(!is_shadow_present_pte(*sptep)); 1332 1257 1333 - if (*sptep & PT_ACCESSED_MASK) { 1258 + if (*sptep & shadow_accessed_mask) { 1334 1259 young = 1; 1335 - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); 1260 + clear_bit((ffs(shadow_accessed_mask) - 1), 1261 + (unsigned long *)sptep); 1336 1262 } 1337 1263 } 1338 1264 ··· 1357 1281 1358 1282 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1359 1283 sptep = rmap_get_next(&iter)) { 1360 - BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1284 + BUG_ON(!is_shadow_present_pte(*sptep)); 1361 1285 1362 - if (*sptep & PT_ACCESSED_MASK) { 1286 + if (*sptep & shadow_accessed_mask) { 1363 1287 young = 1; 1364 1288 break; 1365 1289 } ··· 1477 1401 u64 *parent_pte, int direct) 1478 1402 { 1479 1403 struct kvm_mmu_page *sp; 1480 - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, 1481 - sizeof *sp); 1482 - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 1404 + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 1405 + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1483 1406 if (!direct) 1484 - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 1485 - PAGE_SIZE); 1407 + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1486 1408 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1487 1409 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1488 1410 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); ··· 1775 1701 1776 1702 kvm_mmu_pages_init(parent, &parents, &pages); 1777 1703 while (mmu_unsync_walk(parent, &pages)) { 1778 - int protected = 0; 1704 + bool protected = false; 1779 1705 1780 1706 for_each_sp(pages, sp, parents, i) 1781 1707 protected |= rmap_write_protect(vcpu->kvm, 
sp->gfn); ··· 1938 1864 | PT_PRESENT_MASK | PT_ACCESSED_MASK 1939 1865 | PT_WRITABLE_MASK | PT_USER_MASK; 1940 1866 mmu_spte_set(sptep, spte); 1941 - } 1942 - 1943 - static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1944 - { 1945 - if (is_large_pte(*sptep)) { 1946 - drop_spte(vcpu->kvm, sptep); 1947 - --vcpu->kvm->stat.lpages; 1948 - kvm_flush_remote_tlbs(vcpu->kvm); 1949 - } 1950 1867 } 1951 1868 1952 1869 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, ··· 2308 2243 gfn_t gfn, pfn_t pfn, bool speculative, 2309 2244 bool can_unsync, bool host_writable) 2310 2245 { 2311 - u64 spte, entry = *sptep; 2246 + u64 spte; 2312 2247 int ret = 0; 2313 2248 2314 2249 if (set_mmio_spte(sptep, gfn, pfn, pte_access)) ··· 2322 2257 spte |= shadow_x_mask; 2323 2258 else 2324 2259 spte |= shadow_nx_mask; 2260 + 2325 2261 if (pte_access & ACC_USER_MASK) 2326 2262 spte |= shadow_user_mask; 2263 + 2327 2264 if (level > PT_PAGE_TABLE_LEVEL) 2328 2265 spte |= PT_PAGE_SIZE_MASK; 2329 2266 if (tdp_enabled) ··· 2350 2283 goto done; 2351 2284 } 2352 2285 2353 - spte |= PT_WRITABLE_MASK; 2286 + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2354 2287 2355 2288 if (!vcpu->arch.mmu.direct_map 2356 2289 && !(pte_access & ACC_WRITE_MASK)) { ··· 2379 2312 __func__, gfn); 2380 2313 ret = 1; 2381 2314 pte_access &= ~ACC_WRITE_MASK; 2382 - if (is_writable_pte(spte)) 2383 - spte &= ~PT_WRITABLE_MASK; 2315 + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); 2384 2316 } 2385 2317 } 2386 2318 ··· 2387 2321 mark_page_dirty(vcpu->kvm, gfn); 2388 2322 2389 2323 set_pte: 2390 - mmu_spte_update(sptep, spte); 2391 - /* 2392 - * If we overwrite a writable spte with a read-only one we 2393 - * should flush remote TLBs. Otherwise rmap_write_protect 2394 - * will find a read-only spte, even though the writable spte 2395 - * might be cached on a CPU's TLB. 
2396 - */ 2397 - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) 2324 + if (mmu_spte_update(sptep, spte)) 2398 2325 kvm_flush_remote_tlbs(vcpu->kvm); 2399 2326 done: 2400 2327 return ret; ··· 2462 2403 2463 2404 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2464 2405 { 2406 + mmu_free_roots(vcpu); 2465 2407 } 2466 2408 2467 2409 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, ··· 2685 2625 return ret; 2686 2626 } 2687 2627 2628 + static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) 2629 + { 2630 + /* 2631 + * #PF can be fast only if the shadow page table is present and it 2632 + * is caused by write-protect; that means we just need to change the 2633 + * W bit of the spte, which can be done out of mmu-lock. 2634 + */ 2635 + if (!(error_code & PFERR_PRESENT_MASK) || 2636 + !(error_code & PFERR_WRITE_MASK)) 2637 + return false; 2638 + 2639 + return true; 2640 + } 2641 + 2642 + static bool 2643 + fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) 2644 + { 2645 + struct kvm_mmu_page *sp = page_header(__pa(sptep)); 2646 + gfn_t gfn; 2647 + 2648 + WARN_ON(!sp->role.direct); 2649 + 2650 + /* 2651 + * The gfn of a direct spte is stable since it is calculated 2652 + * from sp->gfn. 2653 + */ 2654 + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 2655 + 2656 + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 2657 + mark_page_dirty(vcpu->kvm, gfn); 2658 + 2659 + return true; 2660 + } 2661 + 2662 + /* 2663 + * Return value: 2664 + * - true: let the vcpu access the same address again. 2665 + * - false: let the real page fault path fix it. 
2666 + */ 2667 + static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, 2668 + u32 error_code) 2669 + { 2670 + struct kvm_shadow_walk_iterator iterator; 2671 + bool ret = false; 2672 + u64 spte = 0ull; 2673 + 2674 + if (!page_fault_can_be_fast(vcpu, error_code)) 2675 + return false; 2676 + 2677 + walk_shadow_page_lockless_begin(vcpu); 2678 + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) 2679 + if (!is_shadow_present_pte(spte) || iterator.level < level) 2680 + break; 2681 + 2682 + /* 2683 + * If the mapping has been changed, let the vcpu fault on the 2684 + * same address again. 2685 + */ 2686 + if (!is_rmap_spte(spte)) { 2687 + ret = true; 2688 + goto exit; 2689 + } 2690 + 2691 + if (!is_last_spte(spte, level)) 2692 + goto exit; 2693 + 2694 + /* 2695 + * Check if it is a spurious fault caused by a lazily flushed TLB. 2696 + * 2697 + * Need not check the access of upper level table entries since 2698 + * they are always ACC_ALL. 2699 + */ 2700 + if (is_writable_pte(spte)) { 2701 + ret = true; 2702 + goto exit; 2703 + } 2704 + 2705 + /* 2706 + * Currently, to simplify the code, only a spte write-protected 2707 + * by dirty logging can be fast-fixed. 2708 + */ 2709 + if (!spte_is_locklessly_modifiable(spte)) 2710 + goto exit; 2711 + 2712 + /* 2713 + * Currently, fast page fault only works for direct mapping since 2714 + * the gfn is not stable for an indirect shadow page. 2715 + * See Documentation/virtual/kvm/locking.txt for more detail. 
2716 + */ 2717 + ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); 2718 + exit: 2719 + trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, 2720 + spte, ret); 2721 + walk_shadow_page_lockless_end(vcpu); 2722 + 2723 + return ret; 2724 + } 2725 + 2688 2726 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2689 2727 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2690 2728 2691 - static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, 2692 - bool prefault) 2729 + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 2730 + gfn_t gfn, bool prefault) 2693 2731 { 2694 2732 int r; 2695 2733 int level; 2696 2734 int force_pt_level; 2697 2735 pfn_t pfn; 2698 2736 unsigned long mmu_seq; 2699 - bool map_writable; 2737 + bool map_writable, write = error_code & PFERR_WRITE_MASK; 2700 2738 2701 2739 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); 2702 2740 if (likely(!force_pt_level)) { ··· 2810 2652 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2811 2653 } else 2812 2654 level = PT_PAGE_TABLE_LEVEL; 2655 + 2656 + if (fast_page_fault(vcpu, v, level, error_code)) 2657 + return 0; 2813 2658 2814 2659 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2815 2660 smp_rmb(); ··· 3202 3041 gfn = gva >> PAGE_SHIFT; 3203 3042 3204 3043 return nonpaging_map(vcpu, gva & PAGE_MASK, 3205 - error_code & PFERR_WRITE_MASK, gfn, prefault); 3044 + error_code, gfn, prefault); 3206 3045 } 3207 3046 3208 3047 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) ··· 3281 3120 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3282 3121 } else 3283 3122 level = PT_PAGE_TABLE_LEVEL; 3123 + 3124 + if (fast_page_fault(vcpu, gpa, level, error_code)) 3125 + return 0; 3284 3126 3285 3127 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3286 3128 smp_rmb(); ··· 4049 3885 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4050 3886 { 4051 3887 struct kvm_mmu_page *sp; 3888 + bool flush = false; 4052 3889 4053 
3890 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 4054 3891 int i; ··· 4064 3899 !is_last_spte(pt[i], sp->role.level)) 4065 3900 continue; 4066 3901 4067 - if (is_large_pte(pt[i])) { 4068 - drop_spte(kvm, &pt[i]); 4069 - --kvm->stat.lpages; 4070 - continue; 4071 - } 4072 - 4073 - /* avoid RMW */ 4074 - if (is_writable_pte(pt[i])) 4075 - mmu_spte_update(&pt[i], 4076 - pt[i] & ~PT_WRITABLE_MASK); 3902 + spte_write_protect(kvm, &pt[i], &flush, false); 4077 3903 } 4078 3904 } 4079 3905 kvm_flush_remote_tlbs(kvm); ··· 4101 3945 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4102 3946 { 4103 3947 struct kvm *kvm; 4104 - struct kvm *kvm_freed = NULL; 4105 3948 int nr_to_scan = sc->nr_to_scan; 4106 3949 4107 3950 if (nr_to_scan == 0) ··· 4112 3957 int idx; 4113 3958 LIST_HEAD(invalid_list); 4114 3959 3960 + /* 3961 + * n_used_mmu_pages is accessed without holding kvm->mmu_lock 3962 + * here. We may skip a VM instance erroneously, but we do not 3963 + * want to shrink a VM that only started to populate its MMU 3964 + * anyway. 3965 + */ 3966 + if (kvm->arch.n_used_mmu_pages > 0) { 3967 + if (!nr_to_scan--) 3968 + break; 3969 + continue; 3970 + } 3971 + 4115 3972 idx = srcu_read_lock(&kvm->srcu); 4116 3973 spin_lock(&kvm->mmu_lock); 4117 - if (!kvm_freed && nr_to_scan > 0 && 4118 - kvm->arch.n_used_mmu_pages > 0) { 4119 - kvm_mmu_remove_some_alloc_mmu_pages(kvm, 4120 - &invalid_list); 4121 - kvm_freed = kvm; 4122 - } 4123 - nr_to_scan--; 4124 3974 3975 + kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); 4125 3976 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3977 + 4126 3978 spin_unlock(&kvm->mmu_lock); 4127 3979 srcu_read_unlock(&kvm->srcu, idx); 3980 + 3981 + list_move_tail(&kvm->vm_list, &vm_list); 3982 + break; 4128 3983 } 4129 - if (kvm_freed) 4130 - list_move_tail(&kvm_freed->vm_list, &vm_list); 4131 3984 4132 3985 raw_spin_unlock(&kvm_lock); 4133 3986
+41 -4
arch/x86/kvm/mmutrace.h
··· 54 54 */ 55 55 TRACE_EVENT( 56 56 kvm_mmu_pagetable_walk, 57 - TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), 58 - TP_ARGS(addr, write_fault, user_fault, fetch_fault), 57 + TP_PROTO(u64 addr, u32 pferr), 58 + TP_ARGS(addr, pferr), 59 59 60 60 TP_STRUCT__entry( 61 61 __field(__u64, addr) ··· 64 64 65 65 TP_fast_assign( 66 66 __entry->addr = addr; 67 - __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) 68 - | (!!fetch_fault << 4); 67 + __entry->pferr = pferr; 69 68 ), 70 69 71 70 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, ··· 241 242 242 243 TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, 243 244 __entry->access) 245 + ); 246 + 247 + #define __spte_satisfied(__spte) \ 248 + (__entry->retry && is_writable_pte(__entry->__spte)) 249 + 250 + TRACE_EVENT( 251 + fast_page_fault, 252 + TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, 253 + u64 *sptep, u64 old_spte, bool retry), 254 + TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), 255 + 256 + TP_STRUCT__entry( 257 + __field(int, vcpu_id) 258 + __field(gva_t, gva) 259 + __field(u32, error_code) 260 + __field(u64 *, sptep) 261 + __field(u64, old_spte) 262 + __field(u64, new_spte) 263 + __field(bool, retry) 264 + ), 265 + 266 + TP_fast_assign( 267 + __entry->vcpu_id = vcpu->vcpu_id; 268 + __entry->gva = gva; 269 + __entry->error_code = error_code; 270 + __entry->sptep = sptep; 271 + __entry->old_spte = old_spte; 272 + __entry->new_spte = *sptep; 273 + __entry->retry = retry; 274 + ), 275 + 276 + TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" 277 + " new %llx spurious %d fixed %d", __entry->vcpu_id, 278 + __entry->gva, __print_flags(__entry->error_code, "|", 279 + kvm_mmu_trace_pferr_flags), __entry->sptep, 280 + __entry->old_spte, __entry->new_spte, 281 + __spte_satisfied(old_spte), __spte_satisfied(new_spte) 282 + ) 244 283 ); 245 284 #endif /* _TRACE_KVMMMU_H */ 246 285
+1 -2
arch/x86/kvm/paging_tmpl.h
··· 154 154 const int fetch_fault = access & PFERR_FETCH_MASK; 155 155 u16 errcode = 0; 156 156 157 - trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 158 - fetch_fault); 157 + trace_kvm_mmu_pagetable_walk(addr, access); 159 158 retry_walk: 160 159 eperm = false; 161 160 walker->level = mmu->root_level;
+9 -3
arch/x86/kvm/svm.c
··· 3185 3185 break; 3186 3186 case MSR_IA32_DEBUGCTLMSR: 3187 3187 if (!boot_cpu_has(X86_FEATURE_LBRV)) { 3188 - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3189 - __func__, data); 3188 + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3189 + __func__, data); 3190 3190 break; 3191 3191 } 3192 3192 if (data & DEBUGCTL_RESERVED_BITS) ··· 3205 3205 case MSR_VM_CR: 3206 3206 return svm_set_vm_cr(vcpu, data); 3207 3207 case MSR_VM_IGNNE: 3208 - pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3208 + vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3209 3209 break; 3210 3210 default: 3211 3211 return kvm_set_msr_common(vcpu, ecx, data); ··· 4044 4044 return false; 4045 4045 } 4046 4046 4047 + static bool svm_invpcid_supported(void) 4048 + { 4049 + return false; 4050 + } 4051 + 4047 4052 static bool svm_has_wbinvd_exit(void) 4048 4053 { 4049 4054 return true; ··· 4317 4312 .cpuid_update = svm_cpuid_update, 4318 4313 4319 4314 .rdtscp_supported = svm_rdtscp_supported, 4315 + .invpcid_supported = svm_invpcid_supported, 4320 4316 4321 4317 .set_supported_cpuid = svm_set_supported_cpuid, 4322 4318
+34
arch/x86/kvm/trace.h
··· 517 517 __entry->coalesced ? " (coalesced)" : "") 518 518 ); 519 519 520 + TRACE_EVENT(kvm_eoi, 521 + TP_PROTO(struct kvm_lapic *apic, int vector), 522 + TP_ARGS(apic, vector), 523 + 524 + TP_STRUCT__entry( 525 + __field( __u32, apicid ) 526 + __field( int, vector ) 527 + ), 528 + 529 + TP_fast_assign( 530 + __entry->apicid = apic->vcpu->vcpu_id; 531 + __entry->vector = vector; 532 + ), 533 + 534 + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) 535 + ); 536 + 537 + TRACE_EVENT(kvm_pv_eoi, 538 + TP_PROTO(struct kvm_lapic *apic, int vector), 539 + TP_ARGS(apic, vector), 540 + 541 + TP_STRUCT__entry( 542 + __field( __u32, apicid ) 543 + __field( int, vector ) 544 + ), 545 + 546 + TP_fast_assign( 547 + __entry->apicid = apic->vcpu->vcpu_id; 548 + __entry->vector = vector; 549 + ), 550 + 551 + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) 552 + ); 553 + 520 554 /* 521 555 * Tracepoint for nested VMRUN 522 556 */
+152 -37
arch/x86/kvm/vmx.c
··· 71 71 module_param_named(unrestricted_guest, 72 72 enable_unrestricted_guest, bool, S_IRUGO); 73 73 74 - static bool __read_mostly emulate_invalid_guest_state = 0; 74 + static bool __read_mostly enable_ept_ad_bits = 1; 75 + module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); 76 + 77 + static bool __read_mostly emulate_invalid_guest_state = true; 75 78 module_param(emulate_invalid_guest_state, bool, S_IRUGO); 76 79 77 80 static bool __read_mostly vmm_exclusive = 1; ··· 618 615 static void kvm_cpu_vmxoff(void); 619 616 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 620 617 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 618 + static void vmx_set_segment(struct kvm_vcpu *vcpu, 619 + struct kvm_segment *var, int seg); 620 + static void vmx_get_segment(struct kvm_vcpu *vcpu, 621 + struct kvm_segment *var, int seg); 621 622 622 623 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 623 624 static DEFINE_PER_CPU(struct vmcs *, current_vmcs); ··· 796 789 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 797 790 } 798 791 792 + static inline bool cpu_has_vmx_ept_ad_bits(void) 793 + { 794 + return vmx_capability.ept & VMX_EPT_AD_BIT; 795 + } 796 + 799 797 static inline bool cpu_has_vmx_invept_individual_addr(void) 800 798 { 801 799 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; ··· 859 847 { 860 848 return vmcs_config.cpu_based_2nd_exec_ctrl & 861 849 SECONDARY_EXEC_RDTSCP; 850 + } 851 + 852 + static inline bool cpu_has_vmx_invpcid(void) 853 + { 854 + return vmcs_config.cpu_based_2nd_exec_ctrl & 855 + SECONDARY_EXEC_ENABLE_INVPCID; 862 856 } 863 857 864 858 static inline bool cpu_has_virtual_nmis(void) ··· 1757 1739 return cpu_has_vmx_rdtscp(); 1758 1740 } 1759 1741 1742 + static bool vmx_invpcid_supported(void) 1743 + { 1744 + return cpu_has_vmx_invpcid() && enable_ept; 1745 + } 1746 + 1760 1747 /* 1761 1748 * Swap MSR entry in host/guest MSR entry array. 
1762 1749 */ ··· 2481 2458 SECONDARY_EXEC_ENABLE_EPT | 2482 2459 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2483 2460 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2484 - SECONDARY_EXEC_RDTSCP; 2461 + SECONDARY_EXEC_RDTSCP | 2462 + SECONDARY_EXEC_ENABLE_INVPCID; 2485 2463 if (adjust_vmx_controls(min2, opt2, 2486 2464 MSR_IA32_VMX_PROCBASED_CTLS2, 2487 2465 &_cpu_based_2nd_exec_control) < 0) ··· 2669 2645 !cpu_has_vmx_ept_4levels()) { 2670 2646 enable_ept = 0; 2671 2647 enable_unrestricted_guest = 0; 2648 + enable_ept_ad_bits = 0; 2672 2649 } 2650 + 2651 + if (!cpu_has_vmx_ept_ad_bits()) 2652 + enable_ept_ad_bits = 0; 2673 2653 2674 2654 if (!cpu_has_vmx_unrestricted_guest()) 2675 2655 enable_unrestricted_guest = 0; ··· 2798 2770 { 2799 2771 unsigned long flags; 2800 2772 struct vcpu_vmx *vmx = to_vmx(vcpu); 2773 + struct kvm_segment var; 2801 2774 2802 2775 if (enable_unrestricted_guest) 2803 2776 return; ··· 2842 2813 if (emulate_invalid_guest_state) 2843 2814 goto continue_rmode; 2844 2815 2845 - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 2846 - vmcs_write32(GUEST_SS_LIMIT, 0xffff); 2847 - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 2816 + vmx_get_segment(vcpu, &var, VCPU_SREG_SS); 2817 + vmx_set_segment(vcpu, &var, VCPU_SREG_SS); 2848 2818 2849 - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 2850 - vmcs_write32(GUEST_CS_LIMIT, 0xffff); 2851 - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) 2852 - vmcs_writel(GUEST_CS_BASE, 0xf0000); 2853 - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 2819 + vmx_get_segment(vcpu, &var, VCPU_SREG_CS); 2820 + vmx_set_segment(vcpu, &var, VCPU_SREG_CS); 2854 2821 2855 - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); 2856 - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); 2857 - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); 2858 - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); 2822 + vmx_get_segment(vcpu, &var, VCPU_SREG_ES); 2823 + vmx_set_segment(vcpu, &var, VCPU_SREG_ES); 2824 + 2825 + vmx_get_segment(vcpu, &var, VCPU_SREG_DS); 
2826 + vmx_set_segment(vcpu, &var, VCPU_SREG_DS); 2827 + 2828 + vmx_get_segment(vcpu, &var, VCPU_SREG_GS); 2829 + vmx_set_segment(vcpu, &var, VCPU_SREG_GS); 2830 + 2831 + vmx_get_segment(vcpu, &var, VCPU_SREG_FS); 2832 + vmx_set_segment(vcpu, &var, VCPU_SREG_FS); 2859 2833 2860 2834 continue_rmode: 2861 2835 kvm_mmu_reset_context(vcpu); ··· 3059 3027 /* TODO write the value reading from MSR */ 3060 3028 eptp = VMX_EPT_DEFAULT_MT | 3061 3029 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3030 + if (enable_ept_ad_bits) 3031 + eptp |= VMX_EPT_AD_ENABLE_BIT; 3062 3032 eptp |= (root_hpa & PAGE_MASK); 3063 3033 3064 3034 return eptp; ··· 3187 3153 3188 3154 static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3189 3155 { 3156 + struct vcpu_vmx *vmx = to_vmx(vcpu); 3157 + 3158 + /* 3159 + * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations 3160 + * fail; use the cache instead. 3161 + */ 3162 + if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { 3163 + return vmx->cpl; 3164 + } 3165 + 3190 3166 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3191 3167 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3192 - to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); 3168 + vmx->cpl = __vmx_get_cpl(vcpu); 3193 3169 } 3194 - return to_vmx(vcpu)->cpl; 3170 + 3171 + return vmx->cpl; 3195 3172 } 3196 3173 3197 3174 ··· 3210 3165 { 3211 3166 u32 ar; 3212 3167 3213 - if (var->unusable) 3168 + if (var->unusable || !var->present) 3214 3169 ar = 1 << 16; 3215 3170 else { 3216 3171 ar = var->type & 15; ··· 3222 3177 ar |= (var->db & 1) << 14; 3223 3178 ar |= (var->g & 1) << 15; 3224 3179 } 3225 - if (ar == 0) /* a 0 value means unusable */ 3226 - ar = AR_UNUSABLE_MASK; 3227 3180 3228 3181 return ar; 3229 3182 } ··· 3272 3229 3273 3230 vmcs_write32(sf->ar_bytes, ar); 3274 3231 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3232 + 3233 + /* 3234 + * Fix segments for real mode guest in hosts that don't have 3235 + * 
"unrestricted_mode" or have it disabled. 3236 + * This is done to allow migration of guests from hosts that 3237 + * have unrestricted guest, like Westmere, to older hosts that 3238 + * don't, like Nehalem. 3239 + */ 3240 + if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { 3241 + switch (seg) { 3242 + case VCPU_SREG_CS: 3243 + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 3244 + vmcs_write32(GUEST_CS_LIMIT, 0xffff); 3245 + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) 3246 + vmcs_writel(GUEST_CS_BASE, 0xf0000); 3247 + vmcs_write16(GUEST_CS_SELECTOR, 3248 + vmcs_readl(GUEST_CS_BASE) >> 4); 3249 + break; 3250 + case VCPU_SREG_ES: 3251 + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); 3252 + break; 3253 + case VCPU_SREG_DS: 3254 + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); 3255 + break; 3256 + case VCPU_SREG_GS: 3257 + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); 3258 + break; 3259 + case VCPU_SREG_FS: 3260 + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); 3261 + break; 3262 + case VCPU_SREG_SS: 3263 + vmcs_write16(GUEST_SS_SELECTOR, 3264 + vmcs_readl(GUEST_SS_BASE) >> 4); 3265 + vmcs_write32(GUEST_SS_LIMIT, 0xffff); 3266 + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 3267 + break; 3268 + } 3269 + } 3275 3270 } 3276 3271 3277 3272 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) ··· 3812 3731 if (!enable_ept) { 3813 3732 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 3814 3733 enable_unrestricted_guest = 0; 3734 + /* Enabling INVPCID for non-EPT guests may cause a performance regression. 
*/ 3735 + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 3815 3736 } 3816 3737 if (!enable_unrestricted_guest) 3817 3738 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; ··· 4572 4489 break; 4573 4490 } 4574 4491 vcpu->run->exit_reason = 0; 4575 - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4492 + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4576 4493 (int)(exit_qualification >> 4) & 3, cr); 4577 4494 return 0; 4578 4495 } ··· 4852 4769 { 4853 4770 unsigned long exit_qualification; 4854 4771 gpa_t gpa; 4772 + u32 error_code; 4855 4773 int gla_validity; 4856 4774 4857 4775 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); ··· 4877 4793 4878 4794 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 4879 4795 trace_kvm_page_fault(gpa, exit_qualification); 4880 - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); 4796 + 4797 + /* Is it a write fault? */ 4798 + error_code = exit_qualification & (1U << 1); 4799 + /* is the ept page table present? 
*/ 4800 + error_code |= (exit_qualification >> 3) & 0x1; 4801 + 4802 + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 4881 4803 } 4882 4804 4883 4805 static u64 ept_rsvd_mask(u64 spte, int level) ··· 4998 4908 int ret = 1; 4999 4909 u32 cpu_exec_ctrl; 5000 4910 bool intr_window_requested; 4911 + unsigned count = 130; 5001 4912 5002 4913 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5003 4914 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5004 4915 5005 - while (!guest_state_valid(vcpu)) { 5006 - if (intr_window_requested 5007 - && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) 4916 + while (!guest_state_valid(vcpu) && count-- != 0) { 4917 + if (intr_window_requested && vmx_interrupt_allowed(vcpu)) 5008 4918 return handle_interrupt_window(&vmx->vcpu); 4919 + 4920 + if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 4921 + return 1; 5009 4922 5010 4923 err = emulate_instruction(vcpu, 0); 5011 4924 ··· 5017 4924 goto out; 5018 4925 } 5019 4926 5020 - if (err != EMULATE_DONE) 4927 + if (err != EMULATE_DONE) { 4928 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4929 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4930 + vcpu->run->internal.ndata = 0; 5021 4931 return 0; 4932 + } 5022 4933 5023 4934 if (signal_pending(current)) 5024 4935 goto out; ··· 5030 4933 schedule(); 5031 4934 } 5032 4935 5033 - vmx->emulation_required = 0; 4936 + vmx->emulation_required = !guest_state_valid(vcpu); 5034 4937 out: 5035 4938 return ret; 5036 4939 } ··· 6564 6467 } 6565 6468 } 6566 6469 } 6470 + 6471 + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6472 + /* Exposing INVPCID only when PCID is exposed */ 6473 + best = kvm_find_cpuid_entry(vcpu, 0x7, 0); 6474 + if (vmx_invpcid_supported() && 6475 + best && (best->ecx & bit(X86_FEATURE_INVPCID)) && 6476 + guest_cpuid_has_pcid(vcpu)) { 6477 + exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; 6478 + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 6479 + exec_control); 6480 + } else 
{ 6481 + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 6482 + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 6483 + exec_control); 6484 + if (best) 6485 + best->ecx &= ~bit(X86_FEATURE_INVPCID); 6486 + } 6567 6487 } 6568 6488 6569 6489 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) ··· 7315 7201 .cpuid_update = vmx_cpuid_update, 7316 7202 7317 7203 .rdtscp_supported = vmx_rdtscp_supported, 7204 + .invpcid_supported = vmx_invpcid_supported, 7318 7205 7319 7206 .set_supported_cpuid = vmx_set_supported_cpuid, 7320 7207 ··· 7345 7230 if (!vmx_io_bitmap_a) 7346 7231 return -ENOMEM; 7347 7232 7233 + r = -ENOMEM; 7234 + 7348 7235 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 7349 - if (!vmx_io_bitmap_b) { 7350 - r = -ENOMEM; 7236 + if (!vmx_io_bitmap_b) 7351 7237 goto out; 7352 - } 7353 7238 7354 7239 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 7355 - if (!vmx_msr_bitmap_legacy) { 7356 - r = -ENOMEM; 7240 + if (!vmx_msr_bitmap_legacy) 7357 7241 goto out1; 7358 - } 7242 + 7359 7243 7360 7244 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7361 - if (!vmx_msr_bitmap_longmode) { 7362 - r = -ENOMEM; 7245 + if (!vmx_msr_bitmap_longmode) 7363 7246 goto out2; 7364 - } 7247 + 7365 7248 7366 7249 /* 7367 7250 * Allow direct access to the PC debug port (it is often used for I/O ··· 7388 7275 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7389 7276 7390 7277 if (enable_ept) { 7391 - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 7392 - VMX_EPT_EXECUTABLE_MASK); 7278 + kvm_mmu_set_mask_ptes(0ull, 7279 + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, 7280 + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, 7281 + 0ull, VMX_EPT_EXECUTABLE_MASK); 7393 7282 ept_set_mmio_spte_mask(); 7394 7283 kvm_enable_tdp(); 7395 7284 } else
+67 -56
arch/x86/kvm/x86.c
···
 		return 1;
 	}
 
+	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+		return 1;
+
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
 	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
···
 				   kvm_read_cr3(vcpu)))
 		return 1;
 
+	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+		if (!guest_cpuid_has_pcid(vcpu))
+			return 1;
+
+		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+			return 1;
+	}
+
 	if (kvm_x86_ops->set_cr4(vcpu, cr4))
 		return 1;
 
-	if ((cr4 ^ old_cr4) & pdptr_bits)
+	if (((cr4 ^ old_cr4) & pdptr_bits) ||
+	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
 		kvm_mmu_reset_context(vcpu);
 
 	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
···
 	}
 
 	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS)
-			return 1;
+		if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
+				return 1;
+		} else
+			if (cr3 & CR3_L_MODE_RESERVED_BITS)
+				return 1;
 	} else {
 		if (is_pae(vcpu)) {
 			if (cr3 & CR3_PAE_RESERVED_BITS)
···
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+	MSR_KVM_PV_EOI_EN,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
···
 		break;
 	}
 	default:
-		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
-			  "data 0x%llx\n", msr, data);
+		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+			    "data 0x%llx\n", msr, data);
 		return 1;
 	}
 	return 0;
···
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
 	default:
-		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
-			  "data 0x%llx\n", msr, data);
+		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+			    "data 0x%llx\n", msr, data);
 		return 1;
 	}
 
···
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 		if (data != 0) {
-			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
-				data);
+			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+				    data);
 			return 1;
 		}
 		break;
 	case MSR_FAM10H_MMIO_CONF_BASE:
 		if (data != 0) {
-			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
-				"0x%llx\n", data);
+			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+				    "0x%llx\n", data);
 			return 1;
 		}
 		break;
···
 			   thus reserved and should throw a #GP */
 			return 1;
 		}
-		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
-			__func__, data);
+		vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+			    __func__, data);
 		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
···
 		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 
 		break;
+	case MSR_KVM_PV_EOI_EN:
+		if (kvm_lapic_enable_pv_eoi(vcpu, data))
+			return 1;
+		break;
 
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
···
 	case MSR_K7_EVNTSEL2:
 	case MSR_K7_EVNTSEL3:
 		if (data != 0)
-			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-				"0x%x data 0x%llx\n", msr, data);
+			vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+				    "0x%x data 0x%llx\n", msr, data);
 		break;
 	/* at least RHEL 4 unconditionally writes to the perfctr registers,
 	 * so we ignore writes to make it happy.
···
 	case MSR_K7_PERFCTR1:
 	case MSR_K7_PERFCTR2:
 	case MSR_K7_PERFCTR3:
-		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-			"0x%x data 0x%llx\n", msr, data);
+		vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+			    "0x%x data 0x%llx\n", msr, data);
 		break;
 	case MSR_P6_PERFCTR0:
 	case MSR_P6_PERFCTR1:
···
 			return kvm_pmu_set_msr(vcpu, msr, data);
 
 		if (pr || data != 0)
-			pr_unimpl(vcpu, "disabled perfctr wrmsr: "
-				"0x%x data 0x%llx\n", msr, data);
+			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+				    "0x%x data 0x%llx\n", msr, data);
 		break;
 	case MSR_K7_CLK_CTL:
 		/*
···
 		/* Drop writes to this legacy MSR -- see rdmsr
 		 * counterpart for further detail.
 		 */
-		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
+		vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 		break;
 	case MSR_AMD64_OSVW_ID_LENGTH:
 		if (!guest_cpuid_has_osvw(vcpu))
···
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_set_msr(vcpu, msr, data);
 		if (!ignore_msrs) {
-			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
-				msr, data);
+			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+				    msr, data);
 			return 1;
 		} else {
-			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
-				msr, data);
+			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+				    msr, data);
 			break;
 		}
 	}
···
 		data = kvm->arch.hv_hypercall;
 		break;
 	default:
-		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
 	}
 
···
 		data = vcpu->arch.hv_vapic;
 		break;
 	default:
-		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
 	}
 	*pdata = data;
···
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr, pdata);
 		if (!ignore_msrs) {
-			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+			vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 			return 1;
 		} else {
-			pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
 			data = 0;
 		}
 		break;
···
 		value = kvm_get_cr8(vcpu);
 		break;
 	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		kvm_err("%s: unexpected cr %u\n", __func__, cr);
 		return 0;
 	}
 
···
 		res = kvm_set_cr8(vcpu, val);
 		break;
 	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		kvm_err("%s: unexpected cr %u\n", __func__, cr);
 		res = -1;
 	}
 
···
 	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 }
 
-static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
 			       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 {
-	struct kvm_cpuid_entry2 *cpuid = NULL;
-
-	if (eax && ecx)
-		cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
-					    *eax, *ecx);
-
-	if (cpuid) {
-		*eax = cpuid->eax;
-		*ecx = cpuid->ecx;
-		if (ebx)
-			*ebx = cpuid->ebx;
-		if (edx)
-			*edx = cpuid->edx;
-		return true;
-	}
-
-	return false;
+	kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
 }
 
 static struct x86_emulate_ops emulate_ops = {
···
 
 	r = kvm_mmu_reload(vcpu);
 	if (unlikely(r)) {
-		kvm_x86_ops->cancel_injection(vcpu);
-		goto out;
+		goto cancel_injection;
 	}
 
 	preempt_disable();
···
 		smp_wmb();
 		local_irq_enable();
 		preempt_enable();
-		kvm_x86_ops->cancel_injection(vcpu);
 		r = 1;
-		goto out;
+		goto cancel_injection;
 	}
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
···
 	if (unlikely(vcpu->arch.tsc_always_catchup))
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
-	kvm_lapic_sync_from_vapic(vcpu);
+	if (vcpu->arch.apic_attention)
+		kvm_lapic_sync_from_vapic(vcpu);
 
 	r = kvm_x86_ops->handle_exit(vcpu);
+	return r;
+
+cancel_injection:
+	kvm_x86_ops->cancel_injection(vcpu);
+	if (unlikely(vcpu->arch.apic_attention))
+		kvm_lapic_sync_from_vapic(vcpu);
 out:
 	return r;
 }
···
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
 		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
-			vfree(free->arch.lpage_info[i]);
+			kvm_kvfree(free->arch.lpage_info[i]);
 			free->arch.lpage_info[i] = NULL;
 		}
 	}
···
 				      slot->base_gfn, level) + 1;
 
 		slot->arch.lpage_info[i] =
-			vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+			kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
 		if (!slot->arch.lpage_info[i])
 			goto out_free;
 
···
 
 out_free:
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		vfree(slot->arch.lpage_info[i]);
+		kvm_kvfree(slot->arch.lpage_info[i]);
 		slot->arch.lpage_info[i] = NULL;
 	}
 	return -ENOMEM;
-10
drivers/s390/char/sclp.c
···
 
 EXPORT_SYMBOL(sclp_remove_processed);
 
-struct init_sccb {
-	struct sccb_header header;
-	u16 _reserved;
-	u16 mask_length;
-	sccb_mask_t receive_mask;
-	sccb_mask_t send_mask;
-	sccb_mask_t sclp_receive_mask;
-	sccb_mask_t sclp_send_mask;
-} __attribute__((packed));
-
 /* Prepare init mask request. Called while sclp_lock is locked. */
 static inline void
 __sclp_make_init_req(u32 receive_mask, u32 send_mask)
+10
drivers/s390/char/sclp.h
···
 	u16 response_code;
 } __attribute__((packed));
 
+struct init_sccb {
+	struct sccb_header header;
+	u16 _reserved;
+	u16 mask_length;
+	sccb_mask_t receive_mask;
+	sccb_mask_t send_mask;
+	sccb_mask_t sclp_receive_mask;
+	sccb_mask_t sclp_send_mask;
+} __attribute__((packed));
+
 extern u64 sclp_facilities;
 #define SCLP_HAS_CHP_INFO	(sclp_facilities & 0x8000000000000000ULL)
 #define SCLP_HAS_CHP_RECONFIG	(sclp_facilities & 0x2000000000000000ULL)
+38
drivers/s390/char/sclp_cmd.c
···
 	u8	_reserved5[4096 - 112];	/* 112-4095 */
 } __attribute__((packed, aligned(PAGE_SIZE)));
 
+static struct init_sccb __initdata early_event_mask_sccb __aligned(PAGE_SIZE);
 static struct read_info_sccb __initdata early_read_info_sccb;
 static int __initdata early_read_info_sccb_valid;
 
···
 	}
 }
 
+static void __init sclp_event_mask_early(void)
+{
+	struct init_sccb *sccb = &early_event_mask_sccb;
+	int rc;
+
+	do {
+		memset(sccb, 0, sizeof(*sccb));
+		sccb->header.length = sizeof(*sccb);
+		sccb->mask_length = sizeof(sccb_mask_t);
+		rc = sclp_cmd_sync_early(SCLP_CMDW_WRITE_EVENT_MASK, sccb);
+	} while (rc == -EBUSY);
+}
+
 void __init sclp_facilities_detect(void)
 {
 	struct read_info_sccb *sccb;
···
 	rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2;
 	rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2;
 	rzm <<= 20;
+
+	sclp_event_mask_early();
+}
+
+bool __init sclp_has_linemode(void)
+{
+	struct init_sccb *sccb = &early_event_mask_sccb;
+
+	if (sccb->header.response_code != 0x20)
+		return 0;
+	if (sccb->sclp_send_mask & (EVTYP_MSG_MASK | EVTYP_PMSGCMD_MASK))
+		return 1;
+	return 0;
+}
+
+bool __init sclp_has_vt220(void)
+{
+	struct init_sccb *sccb = &early_event_mask_sccb;
+
+	if (sccb->header.response_code != 0x20)
+		return 0;
+	if (sccb->sclp_send_mask & EVTYP_VT220MSG_MASK)
+		return 1;
+	return 0;
+}
 }
 
 unsigned long long sclp_get_rnmax(void)
+2 -1
drivers/s390/kvm/kvm_virtio.c
···
 #include <asm/io.h>
 #include <asm/kvm_para.h>
 #include <asm/kvm_virtio.h>
+#include <asm/sclp.h>
 #include <asm/setup.h>
 #include <asm/irq.h>
 
···
 
 static int __init s390_virtio_console_init(void)
 {
-	if (!MACHINE_IS_KVM)
+	if (sclp_has_vt220() || sclp_has_linemode())
 		return -ENODEV;
 	return virtio_cons_early_init(early_put_chars);
 }
+3
include/linux/kvm.h
···
 #define KVM_CAP_SIGNAL_MSI 77
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
+#define KVM_CAP_PPC_ALLOC_HTAB 80
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
···
 #define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
 /* Available with KVM_CAP_PPC_GET_SMMU_INFO */
 #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
+/* Available with KVM_CAP_PPC_ALLOC_HTAB */
+#define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
 
 /*
  * ioctls for vcpu fds
+18 -9
include/linux/kvm_host.h
···
 	struct hlist_head irq_ack_notifier_list;
 #endif
 
-#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
···
 	long tlbs_dirty;
 };
 
-/* The guest did something we don't support. */
-#define pr_unimpl(vcpu, fmt, ...)					\
-	pr_err_ratelimited("kvm: %i: cpu%i " fmt,			\
-			   current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__)
+#define kvm_err(fmt, ...) \
+	pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
+#define kvm_info(fmt, ...) \
+	pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
+#define kvm_debug(fmt, ...) \
+	pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
+#define kvm_pr_unimpl(fmt, ...) \
+	pr_err_ratelimited("kvm [%i]: " fmt, \
+			   task_tgid_nr(current), ## __VA_ARGS__)
 
-#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
-#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+/* The guest did something we don't support. */
+#define vcpu_unimpl(vcpu, fmt, ...)					\
+	kvm_pr_unimpl("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 {
···
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
+
+void *kvm_kvzalloc(unsigned long size);
+void kvm_kvfree(const void *addr);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
···
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
-#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
 {
 	if (unlikely(vcpu->kvm->mmu_notifier_count))
···
 }
 #endif
 
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
+#ifdef KVM_CAP_IRQ_ROUTING
 
 #define KVM_MAX_IRQ_ROUTES 1024
 
+5 -2
include/trace/events/kvm.h
···
 	ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \
 	ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
 	ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
-	ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI)
+	ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \
+	ERSN(S390_UCONTROL)
 
 TRACE_EVENT(kvm_userspace_exit,
 	    TP_PROTO(__u32 reason, int errno),
···
 		  __entry->errno < 0 ? -__entry->errno : __entry->reason)
 );
 
-#if defined(__KVM_HAVE_IOAPIC)
+#if defined(__KVM_HAVE_IRQ_LINE)
 TRACE_EVENT(kvm_set_irq,
 	TP_PROTO(unsigned int gsi, int level, int irq_source_id),
 	TP_ARGS(gsi, level, irq_source_id),
···
 	TP_printk("gsi %u level %d source %d",
 		  __entry->gsi, __entry->level, __entry->irq_source_id)
 );
+#endif
 
+#if defined(__KVM_HAVE_IOAPIC)
 #define kvm_deliver_mode		\
 	{0x0, "Fixed"},			\
 	{0x1, "LowPrio"},		\
+16 -3
virt/kvm/ioapic.c
···
 	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
 }
 
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+		       int level)
 {
 	u32 old_irr;
 	u32 mask = 1 << irq;
···
 	spin_lock(&ioapic->lock);
 	old_irr = ioapic->irr;
 	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
+		int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+						     irq_source_id, level);
 		entry = ioapic->redirtbl[irq];
-		level ^= entry.fields.polarity;
-		if (!level)
+		irq_level ^= entry.fields.polarity;
+		if (!irq_level)
 			ioapic->irr &= ~mask;
 		else {
 			int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
···
 	spin_unlock(&ioapic->lock);
 
 	return ret;
+}
+
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+	int i;
+
+	spin_lock(&ioapic->lock);
+	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+		__clear_bit(irq_source_id, &ioapic->irq_states[i]);
+	spin_unlock(&ioapic->lock);
 }
 
 static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
+3 -1
virt/kvm/ioapic.h
···
 bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+		       int level);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq);
+4 -27
virt/kvm/irq_comm.c
···
 
 #include "ioapic.h"
 
-static inline int kvm_irq_line_state(unsigned long *irq_state,
-				     int irq_source_id, int level)
-{
-	/* Logical OR for level trig interrupt */
-	if (level)
-		set_bit(irq_source_id, irq_state);
-	else
-		clear_bit(irq_source_id, irq_state);
-
-	return !!(*irq_state);
-}
-
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level)
 {
 #ifdef CONFIG_X86
 	struct kvm_pic *pic = pic_irqchip(kvm);
-	level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
-				   irq_source_id, level);
-	return kvm_pic_set_irq(pic, e->irqchip.pin, level);
+	return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
 #else
 	return -1;
 #endif
···
 			  struct kvm *kvm, int irq_source_id, int level)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-	level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
-				   irq_source_id, level);
-
-	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
+	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level);
 }
 
 inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
···
 
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 {
-	int i;
-
 	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
 
 	mutex_lock(&kvm->irq_lock);
···
 	if (!irqchip_in_kernel(kvm))
 		goto unlock;
 
-	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
-		clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
-		if (i >= 16)
-			continue;
+	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
 #ifdef CONFIG_X86
-		clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+	kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
 #endif
-	}
 unlock:
 	mutex_unlock(&kvm->irq_lock);
 }
+24 -12
virt/kvm/kvm_main.c
···
 	return ERR_PTR(r);
 }
 
+/*
+ * Avoid using vmalloc for a small buffer.
+ * Should not be used when the size is statically known.
+ */
+void *kvm_kvzalloc(unsigned long size)
+{
+	if (size > PAGE_SIZE)
+		return vzalloc(size);
+	else
+		return kzalloc(size, GFP_KERNEL);
+}
+
+void kvm_kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
 	if (!memslot->dirty_bitmap)
 		return;
 
-	if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
-		vfree(memslot->dirty_bitmap);
-	else
-		kfree(memslot->dirty_bitmap);
-
+	kvm_kvfree(memslot->dirty_bitmap);
 	memslot->dirty_bitmap = NULL;
 }
 
···
 #ifndef CONFIG_S390
 	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
-	if (dirty_bytes > PAGE_SIZE)
-		memslot->dirty_bitmap = vzalloc(dirty_bytes);
-	else
-		memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
-
+	memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
 	if (!memslot->dirty_bitmap)
 		return -ENOMEM;
 
···
 	 */
 	for (pass = 0; pass < 2 && !yielded; pass++) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (!pass && i < last_boosted_vcpu) {
+			if (!pass && i <= last_boosted_vcpu) {
 				i = last_boosted_vcpu;
 				continue;
 			} else if (pass && i > last_boosted_vcpu)
···
 	case KVM_CAP_SIGNAL_MSI:
 #endif
 		return 1;
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
+#ifdef KVM_CAP_IRQ_ROUTING
 	case KVM_CAP_IRQ_ROUTING:
 		return KVM_MAX_IRQ_ROUTES;
 #endif