Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Gleb Natapov:
"Highlights of the updates are:

general:
- new emulated device API
- legacy device assignment is now optional
- irqfd interface is more generic and can be shared between arches

x86:
- VMCS shadow support and other nested VMX improvements
- APIC virtualization and Posted Interrupt hardware support
- Optimize mmio spte zapping

ppc:
- BookE: in-kernel MPIC emulation with irqfd support
- Book3S: in-kernel XICS emulation (incomplete)
- Book3S: HV: migration fixes
- BookE: more debug support preparation
- BookE: e6500 support

ARM:
- reworking of Hyp idmaps

s390:
- ioeventfd for virtio-ccw

And many other bug fixes, cleanups and improvements"

* tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
kvm: Add compat_ioctl for device control API
KVM: x86: Account for failing enable_irq_window for NMI window request
KVM: PPC: Book3S: Add API for in-kernel XICS emulation
kvm/ppc/mpic: fix missing unlock in set_base_addr()
kvm/ppc: Hold srcu lock when calling kvm_io_bus_read/write
kvm/ppc/mpic: remove users
kvm/ppc/mpic: fix mmio region lists when multiple guests used
kvm/ppc/mpic: remove default routes from documentation
kvm: KVM_CAP_IOMMU only available with device assignment
ARM: KVM: iterate over all CPUs for CPU compatibility check
KVM: ARM: Fix spelling in error message
ARM: KVM: define KVM_ARM_MAX_VCPUS unconditionally
KVM: ARM: Fix API documentation for ONE_REG encoding
ARM: KVM: promote vfp_host pointer to generic host cpu context
ARM: KVM: add architecture specific hook for capabilities
ARM: KVM: perform HYP initialization for hotplugged CPUs
ARM: KVM: switch to a dual-step HYP init code
ARM: KVM: rework HYP page table freeing
ARM: KVM: enforce maximum size for identity mapped code
ARM: KVM: move to a KVM provided HYP idmap
...

+8607 -2355
+140 -6
Documentation/virtual/kvm/api.txt
··· 1486 1486 __u8 pad[36]; 1487 1487 }; 1488 1488 1489 + For the special case of virtio-ccw devices on s390, the ioevent is matched 1490 + to a subchannel/virtqueue tuple instead. 1491 + 1489 1492 The following flags are defined: 1490 1493 1491 1494 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) 1492 1495 #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) 1493 1496 #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) 1497 + #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ 1498 + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) 1494 1499 1495 1500 If datamatch flag is set, the event will be signaled only if the written value 1496 1501 to the registered address is equal to datamatch in struct kvm_ioeventfd. 1502 + 1503 + For virtio-ccw devices, addr contains the subchannel id and datamatch the 1504 + virtqueue index. 1497 1505 1498 1506 1499 1507 4.60 KVM_DIRTY_TLB ··· 1788 1780 PPC | KVM_REG_PPC_VPA_DTL | 128 1789 1781 PPC | KVM_REG_PPC_EPCR | 32 1790 1782 PPC | KVM_REG_PPC_EPR | 32 1783 + PPC | KVM_REG_PPC_TCR | 32 1784 + PPC | KVM_REG_PPC_TSR | 32 1785 + PPC | KVM_REG_PPC_OR_TSR | 32 1786 + PPC | KVM_REG_PPC_CLEAR_TSR | 32 1787 + PPC | KVM_REG_PPC_MAS0 | 32 1788 + PPC | KVM_REG_PPC_MAS1 | 32 1789 + PPC | KVM_REG_PPC_MAS2 | 64 1790 + PPC | KVM_REG_PPC_MAS7_3 | 64 1791 + PPC | KVM_REG_PPC_MAS4 | 32 1792 + PPC | KVM_REG_PPC_MAS6 | 32 1793 + PPC | KVM_REG_PPC_MMUCFG | 32 1794 + PPC | KVM_REG_PPC_TLB0CFG | 32 1795 + PPC | KVM_REG_PPC_TLB1CFG | 32 1796 + PPC | KVM_REG_PPC_TLB2CFG | 32 1797 + PPC | KVM_REG_PPC_TLB3CFG | 32 1798 + PPC | KVM_REG_PPC_TLB0PS | 32 1799 + PPC | KVM_REG_PPC_TLB1PS | 32 1800 + PPC | KVM_REG_PPC_TLB2PS | 32 1801 + PPC | KVM_REG_PPC_TLB3PS | 32 1802 + PPC | KVM_REG_PPC_EPTCFG | 32 1803 + PPC | KVM_REG_PPC_ICP_STATE | 64 1791 1804 1792 1805 ARM registers are mapped using the lower 32 bits. 
The upper 16 of that 1793 1806 is the register group type, or coprocessor number: 1794 1807 1795 1808 ARM core registers have the following id bit patterns: 1796 - 0x4002 0000 0010 <index into the kvm_regs struct:16> 1809 + 0x4020 0000 0010 <index into the kvm_regs struct:16> 1797 1810 1798 1811 ARM 32-bit CP15 registers have the following id bit patterns: 1799 - 0x4002 0000 000F <zero:1> <crn:4> <crm:4> <opc1:4> <opc2:3> 1812 + 0x4020 0000 000F <zero:1> <crn:4> <crm:4> <opc1:4> <opc2:3> 1800 1813 1801 1814 ARM 64-bit CP15 registers have the following id bit patterns: 1802 - 0x4003 0000 000F <zero:1> <zero:4> <crm:4> <opc1:4> <zero:3> 1815 + 0x4030 0000 000F <zero:1> <zero:4> <crm:4> <opc1:4> <zero:3> 1803 1816 1804 1817 ARM CCSIDR registers are demultiplexed by CSSELR value: 1805 - 0x4002 0000 0011 00 <csselr:8> 1818 + 0x4020 0000 0011 00 <csselr:8> 1806 1819 1807 1820 ARM 32-bit VFP control registers have the following id bit patterns: 1808 - 0x4002 0000 0012 1 <regno:12> 1821 + 0x4020 0000 0012 1 <regno:12> 1809 1822 1810 1823 ARM 64-bit FP registers have the following id bit patterns: 1811 - 0x4002 0000 0012 0 <regno:12> 1824 + 0x4030 0000 0012 0 <regno:12> 1812 1825 1813 1826 4.69 KVM_GET_ONE_REG 1814 1827 ··· 2190 2161 written, then `n_invalid' invalid entries, invalidating any previously 2191 2162 valid entries found. 2192 2163 2164 + 4.79 KVM_CREATE_DEVICE 2165 + 2166 + Capability: KVM_CAP_DEVICE_CTRL 2167 + Type: vm ioctl 2168 + Parameters: struct kvm_create_device (in/out) 2169 + Returns: 0 on success, -1 on error 2170 + Errors: 2171 + ENODEV: The device type is unknown or unsupported 2172 + EEXIST: Device already created, and this type of device may not 2173 + be instantiated multiple times 2174 + 2175 + Other error conditions may be defined by individual device types or 2176 + have their standard meanings. 2177 + 2178 + Creates an emulated device in the kernel. The file descriptor returned 2179 + in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. 
2180 + 2181 + If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the 2182 + device type is supported (not necessarily whether it can be created 2183 + in the current vm). 2184 + 2185 + Individual devices should not define flags. Attributes should be used 2186 + for specifying any behavior that is not implied by the device type 2187 + number. 2188 + 2189 + struct kvm_create_device { 2190 + __u32 type; /* in: KVM_DEV_TYPE_xxx */ 2191 + __u32 fd; /* out: device handle */ 2192 + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ 2193 + }; 2194 + 2195 + 4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR 2196 + 2197 + Capability: KVM_CAP_DEVICE_CTRL 2198 + Type: device ioctl 2199 + Parameters: struct kvm_device_attr 2200 + Returns: 0 on success, -1 on error 2201 + Errors: 2202 + ENXIO: The group or attribute is unknown/unsupported for this device 2203 + EPERM: The attribute cannot (currently) be accessed this way 2204 + (e.g. read-only attribute, or attribute that only makes 2205 + sense when the device is in a different state) 2206 + 2207 + Other error conditions may be defined by individual device types. 2208 + 2209 + Gets/sets a specified piece of device configuration and/or state. The 2210 + semantics are device-specific. See individual device documentation in 2211 + the "devices" directory. As with ONE_REG, the size of the data 2212 + transferred is defined by the particular attribute. 2213 + 2214 + struct kvm_device_attr { 2215 + __u32 flags; /* no flags currently defined */ 2216 + __u32 group; /* device-defined */ 2217 + __u64 attr; /* group-defined */ 2218 + __u64 addr; /* userspace address of attr data */ 2219 + }; 2220 + 2221 + 4.81 KVM_HAS_DEVICE_ATTR 2222 + 2223 + Capability: KVM_CAP_DEVICE_CTRL 2224 + Type: device ioctl 2225 + Parameters: struct kvm_device_attr 2226 + Returns: 0 on success, -1 on error 2227 + Errors: 2228 + ENXIO: The group or attribute is unknown/unsupported for this device 2229 + 2230 + Tests whether a device supports a particular attribute. 
A successful 2231 + return indicates the attribute is implemented. It does not necessarily 2232 + indicate that the attribute can be read or written in the device's 2233 + current state. "addr" is ignored. 2193 2234 2194 2235 4.77 KVM_ARM_VCPU_INIT 2195 2236 ··· 2341 2242 and distributor interface, the ioctl must be called after calling 2342 2243 KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling 2343 2244 this ioctl twice for any of the base addresses will return -EEXIST. 2245 + 2246 + 4.82 KVM_PPC_RTAS_DEFINE_TOKEN 2247 + 2248 + Capability: KVM_CAP_PPC_RTAS 2249 + Architectures: ppc 2250 + Type: vm ioctl 2251 + Parameters: struct kvm_rtas_token_args 2252 + Returns: 0 on success, -1 on error 2253 + 2254 + Defines a token value for a RTAS (Run Time Abstraction Services) 2255 + service in order to allow it to be handled in the kernel. The 2256 + argument struct gives the name of the service, which must be the name 2257 + of a service that has a kernel-side implementation. If the token 2258 + value is non-zero, it will be associated with that service, and 2259 + subsequent RTAS calls by the guest specifying that token will be 2260 + handled by the kernel. If the token value is 0, then any token 2261 + associated with the service will be forgotten, and subsequent RTAS 2262 + calls by the guest for that service will be passed to userspace to be 2263 + handled. 2344 2264 2345 2265 2346 2266 5. The kvm_run structure ··· 2764 2646 When disabled (args[0] == 0), behavior is as if this facility is unsupported. 2765 2647 2766 2648 When this capability is enabled, KVM_EXIT_EPR can occur. 2649 + 2650 + 6.6 KVM_CAP_IRQ_MPIC 2651 + 2652 + Architectures: ppc 2653 + Parameters: args[0] is the MPIC device fd 2654 + args[1] is the MPIC CPU number for this vcpu 2655 + 2656 + This capability connects the vcpu to an in-kernel MPIC device. 
2657 + 2658 + 6.7 KVM_CAP_IRQ_XICS 2659 + 2660 + Architectures: ppc 2661 + Parameters: args[0] is the XICS device fd 2662 + args[1] is the XICS CPU number (server ID) for this vcpu 2663 + 2664 + This capability connects the vcpu to an in-kernel XICS device.
+1
Documentation/virtual/kvm/devices/README
··· 1 + This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
+53
Documentation/virtual/kvm/devices/mpic.txt
···
 1 + MPIC interrupt controller
 2 + =========================
 3 + 
 4 + Device types supported:
 5 +   KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0
 6 +   KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2
 7 + 
 8 + Only one MPIC instance, of any type, may be instantiated. The created
 9 + MPIC will act as the system interrupt controller, connecting to each
10 + vcpu's interrupt inputs.
11 + 
12 + Groups:
13 +   KVM_DEV_MPIC_GRP_MISC
14 +   Attributes:
15 +     KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit)
16 +       Base address of the 256 KiB MPIC register space. Must be
17 +       naturally aligned. A value of zero disables the mapping.
18 +       Reset value is zero.
19 + 
20 +   KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit)
21 +     Access an MPIC register, as if the access were made from the guest.
22 +     "attr" is the byte offset into the MPIC register space. Accesses
23 +     must be 4-byte aligned.
24 + 
25 +     MSIs may be signaled by using this attribute group to write
26 +     to the relevant MSIIR.
27 + 
28 +   KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit)
29 +     IRQ input line for each standard openpic source. 0 is inactive and 1
30 +     is active, regardless of interrupt sense.
31 + 
32 +     For edge-triggered interrupts: Writing 1 is considered an activating
33 +     edge, and writing 0 is ignored. Reading returns 1 if a previously
34 +     signaled edge has not been acknowledged, and 0 otherwise.
35 + 
36 +     "attr" is the IRQ number. IRQ numbers for standard sources are the
37 +     byte offset of the relevant IVPR from EIVPR0, divided by 32.
38 + 
39 + IRQ Routing:
40 + 
41 + The MPIC emulation supports IRQ routing. Only a single MPIC device can
42 + be instantiated. Once that device has been created, it's available as
43 + irqchip id 0.
44 + 
45 + This irqchip 0 has 256 interrupt pins, which expose the interrupts in
46 + the main array of interrupt sources (a.k.a. "SRC" interrupts).
47 + 
48 + The numbering is the same as the MPIC device tree binding -- based on
49 + the register offset from the beginning of the sources array, without
50 + regard to any subdivisions in chip documentation such as "internal"
51 + or "external" interrupts.
52 + 
53 + Access to non-SRC interrupts is not implemented through IRQ routing mechanisms.
+66
Documentation/virtual/kvm/devices/xics.txt
···
 1 + XICS interrupt controller
 2 + 
 3 + Device type supported: KVM_DEV_TYPE_XICS
 4 + 
 5 + Groups:
 6 +   KVM_DEV_XICS_SOURCES
 7 +   Attributes: One per interrupt source, indexed by the source number.
 8 + 
 9 + This device emulates the XICS (eXternal Interrupt Controller
10 + Specification) defined in PAPR. The XICS has a set of interrupt
11 + sources, each identified by a 20-bit source number, and a set of
12 + Interrupt Control Presentation (ICP) entities, also called "servers",
13 + each associated with a virtual CPU.
14 + 
15 + The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH
16 + capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and
17 + the interrupt server number (i.e. the vcpu number from the XICS's
18 + point of view) in args[1] of the kvm_enable_cap struct. Each ICP has
19 + 64 bits of state which can be read and written using the
20 + KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit
21 + state word has the following bitfields, starting at the
22 + least-significant end of the word:
23 + 
24 + * Unused, 16 bits
25 + 
26 + * Pending interrupt priority, 8 bits
27 +   Zero is the highest priority, 255 means no interrupt is pending.
28 + 
29 + * Pending IPI (inter-processor interrupt) priority, 8 bits
30 +   Zero is the highest priority, 255 means no IPI is pending.
31 + 
32 + * Pending interrupt source number, 24 bits
33 +   Zero means no interrupt pending, 2 means an IPI is pending
34 + 
35 + * Current processor priority, 8 bits
36 +   Zero is the highest priority, meaning no interrupts can be
37 +   delivered, and 255 is the lowest priority.
38 + 
39 + Each source has 64 bits of state that can be read and written using
40 + the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the
41 + KVM_DEV_XICS_SOURCES attribute group, with the attribute number being
42 + the interrupt source number. The 64 bit state word has the following
43 + bitfields, starting from the least-significant end of the word:
44 + 
45 + * Destination (server number), 32 bits
46 +   This specifies where the interrupt should be sent, and is the
47 +   interrupt server number specified for the destination vcpu.
48 + 
49 + * Priority, 8 bits
50 +   This is the priority specified for this interrupt source, where 0 is
51 +   the highest priority and 255 is the lowest. An interrupt with a
52 +   priority of 255 will never be delivered.
53 + 
54 + * Level sensitive flag, 1 bit
55 +   This bit is 1 for a level-sensitive interrupt source, or 0 for
56 +   edge-sensitive (or MSI).
57 + 
58 + * Masked flag, 1 bit
59 +   This bit is set to 1 if the interrupt is masked (cannot be delivered
60 +   regardless of its priority), for example by the ibm,int-off RTAS
61 +   call, or 0 if it is not masked.
62 + 
63 + * Pending flag, 1 bit
64 +   This bit is 1 if the source has a pending interrupt, otherwise 0.
65 + 
66 + Only one XICS instance may be created per VM.
-1
arch/arm/include/asm/idmap.h
··· 8 8 #define __idmap __section(.idmap.text) noinline notrace 9 9 10 10 extern pgd_t *idmap_pgd; 11 - extern pgd_t *hyp_pgd; 12 11 13 12 void setup_mm_for_reboot(void); 14 13
+32 -15
arch/arm/include/asm/kvm_host.h
··· 87 87 u32 hyp_pc; /* PC when exception was taken from Hyp mode */ 88 88 }; 89 89 90 - typedef struct vfp_hard_struct kvm_kernel_vfp_t; 90 + typedef struct vfp_hard_struct kvm_cpu_context_t; 91 91 92 92 struct kvm_vcpu_arch { 93 93 struct kvm_regs regs; ··· 105 105 struct kvm_vcpu_fault_info fault; 106 106 107 107 /* Floating point registers (VFP and Advanced SIMD/NEON) */ 108 - kvm_kernel_vfp_t vfp_guest; 109 - kvm_kernel_vfp_t *vfp_host; 108 + struct vfp_hard_struct vfp_guest; 109 + 110 + /* Host FP context */ 111 + kvm_cpu_context_t *host_cpu_context; 110 112 111 113 /* VGIC state */ 112 114 struct vgic_cpu vgic_cpu; ··· 190 188 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, 191 189 int exception_index); 192 190 193 - static inline void __cpu_init_hyp_mode(unsigned long long pgd_ptr, 191 + static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr, 192 + unsigned long long pgd_ptr, 194 193 unsigned long hyp_stack_ptr, 195 194 unsigned long vector_ptr) 196 195 { 197 - unsigned long pgd_low, pgd_high; 198 - 199 - pgd_low = (pgd_ptr & ((1ULL << 32) - 1)); 200 - pgd_high = (pgd_ptr >> 32ULL); 201 - 202 196 /* 203 - * Call initialization code, and switch to the full blown 204 - * HYP code. The init code doesn't need to preserve these registers as 205 - * r1-r3 and r12 are already callee save according to the AAPCS. 206 - * Note that we slightly misuse the prototype by casing the pgd_low to 207 - * a void *. 197 + * Call initialization code, and switch to the full blown HYP 198 + * code. The init code doesn't need to preserve these 199 + * registers as r0-r3 are already callee saved according to 200 + * the AAPCS. 201 + * Note that we slightly misuse the prototype by casing the 202 + * stack pointer to a void *. 203 + * 204 + * We don't have enough registers to perform the full init in 205 + * one go. Install the boot PGD first, and then install the 206 + * runtime PGD, stack pointer and vectors. 
The PGDs are always 207 + * passed as the third argument, in order to be passed into 208 + * r2-r3 to the init code (yes, this is compliant with the 209 + * PCS!). 208 210 */ 209 - kvm_call_hyp((void *)pgd_low, pgd_high, hyp_stack_ptr, vector_ptr); 211 + 212 + kvm_call_hyp(NULL, 0, boot_pgd_ptr); 213 + 214 + kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); 210 215 } 216 + 217 + static inline int kvm_arch_dev_ioctl_check_extension(long ext) 218 + { 219 + return 0; 220 + } 221 + 222 + int kvm_perf_init(void); 223 + int kvm_perf_teardown(void); 211 224 212 225 #endif /* __ARM_KVM_HOST_H__ */
+23 -5
arch/arm/include/asm/kvm_mmu.h
··· 19 19 #ifndef __ARM_KVM_MMU_H__ 20 20 #define __ARM_KVM_MMU_H__ 21 21 22 - #include <asm/cacheflush.h> 23 - #include <asm/pgalloc.h> 24 - #include <asm/idmap.h> 22 + #include <asm/memory.h> 23 + #include <asm/page.h> 25 24 26 25 /* 27 26 * We directly use the kernel VA for the HYP, as we can directly share 28 27 * the mapping (HTTBR "covers" TTBR1). 29 28 */ 30 - #define HYP_PAGE_OFFSET_MASK (~0UL) 29 + #define HYP_PAGE_OFFSET_MASK UL(~0) 31 30 #define HYP_PAGE_OFFSET PAGE_OFFSET 32 31 #define KERN_TO_HYP(kva) (kva) 33 32 33 + /* 34 + * Our virtual mapping for the boot-time MMU-enable code. Must be 35 + * shared across all the page-tables. Conveniently, we use the vectors 36 + * page, where no kernel data will ever be shared with HYP. 37 + */ 38 + #define TRAMPOLINE_VA UL(CONFIG_VECTORS_BASE) 39 + 40 + #ifndef __ASSEMBLY__ 41 + 42 + #include <asm/cacheflush.h> 43 + #include <asm/pgalloc.h> 44 + 34 45 int create_hyp_mappings(void *from, void *to); 35 46 int create_hyp_io_mappings(void *from, void *to, phys_addr_t); 36 - void free_hyp_pmds(void); 47 + void free_boot_hyp_pgd(void); 48 + void free_hyp_pgds(void); 37 49 38 50 int kvm_alloc_stage2_pgd(struct kvm *kvm); 39 51 void kvm_free_stage2_pgd(struct kvm *kvm); ··· 57 45 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); 58 46 59 47 phys_addr_t kvm_mmu_get_httbr(void); 48 + phys_addr_t kvm_mmu_get_boot_httbr(void); 49 + phys_addr_t kvm_get_idmap_vector(void); 60 50 int kvm_mmu_init(void); 61 51 void kvm_clear_hyp_idmap(void); 62 52 ··· 127 113 __flush_icache_all(); 128 114 } 129 115 } 116 + 117 + #define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) 118 + 119 + #endif /* !__ASSEMBLY__ */ 130 120 131 121 #endif /* __ARM_KVM_MMU_H__ */
+1 -1
arch/arm/kernel/asm-offsets.c
··· 158 158 DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.midr)); 159 159 DEFINE(VCPU_CP15, offsetof(struct kvm_vcpu, arch.cp15)); 160 160 DEFINE(VCPU_VFP_GUEST, offsetof(struct kvm_vcpu, arch.vfp_guest)); 161 - DEFINE(VCPU_VFP_HOST, offsetof(struct kvm_vcpu, arch.vfp_host)); 161 + DEFINE(VCPU_VFP_HOST, offsetof(struct kvm_vcpu, arch.host_cpu_context)); 162 162 DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs)); 163 163 DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs)); 164 164 DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs));
+6 -1
arch/arm/kernel/vmlinux.lds.S
··· 20 20 VMLINUX_SYMBOL(__idmap_text_start) = .; \ 21 21 *(.idmap.text) \ 22 22 VMLINUX_SYMBOL(__idmap_text_end) = .; \ 23 - ALIGN_FUNCTION(); \ 23 + . = ALIGN(32); \ 24 24 VMLINUX_SYMBOL(__hyp_idmap_text_start) = .; \ 25 25 *(.hyp.idmap.text) \ 26 26 VMLINUX_SYMBOL(__hyp_idmap_text_end) = .; ··· 315 315 */ 316 316 ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support") 317 317 ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined") 318 + /* 319 + * The HYP init code can't be more than a page long. 320 + * The above comment applies as well. 321 + */ 322 + ASSERT(((__hyp_idmap_text_end - __hyp_idmap_text_start) <= PAGE_SIZE), "HYP init code too big")
+3 -3
arch/arm/kvm/Kconfig
··· 41 41 Provides host support for ARM processors. 42 42 43 43 config KVM_ARM_MAX_VCPUS 44 - int "Number maximum supported virtual CPUs per VM" 45 - depends on KVM_ARM_HOST 46 - default 4 44 + int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST 45 + default 4 if KVM_ARM_HOST 46 + default 0 47 47 help 48 48 Static number of max supported virtual CPUs per VM. 49 49
+1 -1
arch/arm/kvm/Makefile
··· 18 18 19 19 obj-y += kvm-arm.o init.o interrupts.o 20 20 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o 21 - obj-y += coproc.o coproc_a15.o mmio.o psci.o 21 + obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o 22 22 obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o 23 23 obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o
+4 -3
arch/arm/kvm/arch_timer.c
··· 22 22 #include <linux/kvm_host.h> 23 23 #include <linux/interrupt.h> 24 24 25 + #include <clocksource/arm_arch_timer.h> 25 26 #include <asm/arch_timer.h> 26 27 27 28 #include <asm/kvm_vgic.h> ··· 65 64 { 66 65 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 67 66 68 - timer->cntv_ctl |= 1 << 1; /* Mask the interrupt in the guest */ 67 + timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; 69 68 kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, 70 69 vcpu->arch.timer_cpu.irq->irq, 71 70 vcpu->arch.timer_cpu.irq->level); ··· 134 133 cycle_t cval, now; 135 134 u64 ns; 136 135 137 - /* Check if the timer is enabled and unmasked first */ 138 - if ((timer->cntv_ctl & 3) != 1) 136 + if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || 137 + !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE)) 139 138 return; 140 139 141 140 cval = timer->cntv_cval;
+75 -54
arch/arm/kvm/arm.c
··· 16 16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 17 17 */ 18 18 19 + #include <linux/cpu.h> 19 20 #include <linux/errno.h> 20 21 #include <linux/err.h> 21 22 #include <linux/kvm_host.h> ··· 49 48 #endif 50 49 51 50 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); 52 - static kvm_kernel_vfp_t __percpu *kvm_host_vfp_state; 51 + static kvm_cpu_context_t __percpu *kvm_host_cpu_state; 53 52 static unsigned long hyp_default_vectors; 54 53 55 54 /* Per-CPU variable containing the currently running vcpu. */ ··· 207 206 r = KVM_MAX_VCPUS; 208 207 break; 209 208 default: 210 - r = 0; 209 + r = kvm_arch_dev_ioctl_check_extension(ext); 211 210 break; 212 211 } 213 212 return r; ··· 219 218 return -EINVAL; 220 219 } 221 220 222 - int kvm_arch_set_memory_region(struct kvm *kvm, 223 - struct kvm_userspace_memory_region *mem, 224 - struct kvm_memory_slot old, 225 - int user_alloc) 226 - { 227 - return 0; 228 - } 229 - 230 221 int kvm_arch_prepare_memory_region(struct kvm *kvm, 231 222 struct kvm_memory_slot *memslot, 232 - struct kvm_memory_slot old, 233 223 struct kvm_userspace_memory_region *mem, 234 - bool user_alloc) 224 + enum kvm_mr_change change) 235 225 { 236 226 return 0; 237 227 } 238 228 239 229 void kvm_arch_commit_memory_region(struct kvm *kvm, 240 230 struct kvm_userspace_memory_region *mem, 241 - struct kvm_memory_slot old, 242 - bool user_alloc) 231 + const struct kvm_memory_slot *old, 232 + enum kvm_mr_change change) 243 233 { 244 234 } 245 235 ··· 318 326 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 319 327 { 320 328 vcpu->cpu = cpu; 321 - vcpu->arch.vfp_host = this_cpu_ptr(kvm_host_vfp_state); 329 + vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); 322 330 323 331 /* 324 332 * Check whether this vcpu requires the cache to be flushed on ··· 631 639 return 0; 632 640 } 633 641 634 - int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level) 642 + int 
kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, 643 + bool line_status) 635 644 { 636 645 u32 irq = irq_level->irq; 637 646 unsigned int irq_type, vcpu_idx, irq_num; ··· 787 794 } 788 795 } 789 796 790 - static void cpu_init_hyp_mode(void *vector) 797 + static void cpu_init_hyp_mode(void *dummy) 791 798 { 799 + unsigned long long boot_pgd_ptr; 792 800 unsigned long long pgd_ptr; 793 801 unsigned long hyp_stack_ptr; 794 802 unsigned long stack_page; 795 803 unsigned long vector_ptr; 796 804 797 805 /* Switch from the HYP stub to our own HYP init vector */ 798 - __hyp_set_vectors((unsigned long)vector); 806 + __hyp_set_vectors(kvm_get_idmap_vector()); 799 807 808 + boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr(); 800 809 pgd_ptr = (unsigned long long)kvm_mmu_get_httbr(); 801 810 stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); 802 811 hyp_stack_ptr = stack_page + PAGE_SIZE; 803 812 vector_ptr = (unsigned long)__kvm_hyp_vector; 804 813 805 - __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); 814 + __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); 806 815 } 816 + 817 + static int hyp_init_cpu_notify(struct notifier_block *self, 818 + unsigned long action, void *cpu) 819 + { 820 + switch (action) { 821 + case CPU_STARTING: 822 + case CPU_STARTING_FROZEN: 823 + cpu_init_hyp_mode(NULL); 824 + break; 825 + } 826 + 827 + return NOTIFY_OK; 828 + } 829 + 830 + static struct notifier_block hyp_init_cpu_nb = { 831 + .notifier_call = hyp_init_cpu_notify, 832 + }; 807 833 808 834 /** 809 835 * Inits Hyp-mode on all online CPUs 810 836 */ 811 837 static int init_hyp_mode(void) 812 838 { 813 - phys_addr_t init_phys_addr; 814 839 int cpu; 815 840 int err = 0; 816 841 ··· 861 850 } 862 851 863 852 /* 864 - * Execute the init code on each CPU. 
865 - * 866 - * Note: The stack is not mapped yet, so don't do anything else than 867 - * initializing the hypervisor mode on each CPU using a local stack 868 - * space for temporary storage. 869 - */ 870 - init_phys_addr = virt_to_phys(__kvm_hyp_init); 871 - for_each_online_cpu(cpu) { 872 - smp_call_function_single(cpu, cpu_init_hyp_mode, 873 - (void *)(long)init_phys_addr, 1); 874 - } 875 - 876 - /* 877 - * Unmap the identity mapping 878 - */ 879 - kvm_clear_hyp_idmap(); 880 - 881 - /* 882 853 * Map the Hyp-code called directly from the host 883 854 */ 884 855 err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end); ··· 883 890 } 884 891 885 892 /* 886 - * Map the host VFP structures 893 + * Map the host CPU structures 887 894 */ 888 - kvm_host_vfp_state = alloc_percpu(kvm_kernel_vfp_t); 889 - if (!kvm_host_vfp_state) { 895 + kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t); 896 + if (!kvm_host_cpu_state) { 890 897 err = -ENOMEM; 891 - kvm_err("Cannot allocate host VFP state\n"); 898 + kvm_err("Cannot allocate host CPU state\n"); 892 899 goto out_free_mappings; 893 900 } 894 901 895 902 for_each_possible_cpu(cpu) { 896 - kvm_kernel_vfp_t *vfp; 903 + kvm_cpu_context_t *cpu_ctxt; 897 904 898 - vfp = per_cpu_ptr(kvm_host_vfp_state, cpu); 899 - err = create_hyp_mappings(vfp, vfp + 1); 905 + cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu); 906 + err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1); 900 907 901 908 if (err) { 902 - kvm_err("Cannot map host VFP state: %d\n", err); 903 - goto out_free_vfp; 909 + kvm_err("Cannot map host CPU state: %d\n", err); 910 + goto out_free_context; 904 911 } 905 912 } 913 + 914 + /* 915 + * Execute the init code on each CPU. 
916 + */ 917 + on_each_cpu(cpu_init_hyp_mode, NULL, 1); 906 918 907 919 /* 908 920 * Init HYP view of VGIC 909 921 */ 910 922 err = kvm_vgic_hyp_init(); 911 923 if (err) 912 - goto out_free_vfp; 924 + goto out_free_context; 913 925 914 926 #ifdef CONFIG_KVM_ARM_VGIC 915 927 vgic_present = true; ··· 927 929 if (err) 928 930 goto out_free_mappings; 929 931 932 + #ifndef CONFIG_HOTPLUG_CPU 933 + free_boot_hyp_pgd(); 934 + #endif 935 + 936 + kvm_perf_init(); 937 + 930 938 kvm_info("Hyp mode initialized successfully\n"); 939 + 931 940 return 0; 932 - out_free_vfp: 933 - free_percpu(kvm_host_vfp_state); 941 + out_free_context: 942 + free_percpu(kvm_host_cpu_state); 934 943 out_free_mappings: 935 - free_hyp_pmds(); 944 + free_hyp_pgds(); 936 945 out_free_stack_pages: 937 946 for_each_possible_cpu(cpu) 938 947 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); ··· 948 943 return err; 949 944 } 950 945 946 + static void check_kvm_target_cpu(void *ret) 947 + { 948 + *(int *)ret = kvm_target_cpu(); 949 + } 950 + 951 951 /** 952 952 * Initialize Hyp-mode and memory mappings on all CPUs. 
953 953 */ 954 954 int kvm_arch_init(void *opaque) 955 955 { 956 956 int err; 957 + int ret, cpu; 957 958 958 959 if (!is_hyp_mode_available()) { 959 960 kvm_err("HYP mode not available\n"); 960 961 return -ENODEV; 961 962 } 962 963 963 - if (kvm_target_cpu() < 0) { 964 - kvm_err("Target CPU not supported!\n"); 965 - return -ENODEV; 964 + for_each_online_cpu(cpu) { 965 + smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); 966 + if (ret < 0) { 967 + kvm_err("Error, CPU %d not supported!\n", cpu); 968 + return -ENODEV; 969 + } 966 970 } 967 971 968 972 err = init_hyp_mode(); 969 973 if (err) 970 974 goto out_err; 975 + 976 + err = register_cpu_notifier(&hyp_init_cpu_nb); 977 + if (err) { 978 + kvm_err("Cannot register HYP init CPU notifier (%d)\n", err); 979 + goto out_err; 980 + } 971 981 972 982 kvm_coproc_table_init(); 973 983 return 0; ··· 993 973 /* NOP: Compiling as a module not supported */ 994 974 void kvm_arch_exit(void) 995 975 { 976 + kvm_perf_teardown(); 996 977 } 997 978 998 979 static int arm_init(void)
+59 -19
arch/arm/kvm/init.S
··· 21 21 #include <asm/asm-offsets.h> 22 22 #include <asm/kvm_asm.h> 23 23 #include <asm/kvm_arm.h> 24 + #include <asm/kvm_mmu.h> 24 25 25 26 /******************************************************************** 26 27 * Hypervisor initialization 27 28 * - should be called with: 28 - * r0,r1 = Hypervisor pgd pointer 29 - * r2 = top of Hyp stack (kernel VA) 30 - * r3 = pointer to hyp vectors 29 + * r0 = top of Hyp stack (kernel VA) 30 + * r1 = pointer to hyp vectors 31 + * r2,r3 = Hypervisor pgd pointer 32 + * 33 + * The init scenario is: 34 + * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd, 35 + * runtime stack, runtime vectors 36 + * - Enable the MMU with the boot pgd 37 + * - Jump to a target into the trampoline page (remember, this is the same 38 + * physical page!) 39 + * - Now switch to the runtime pgd (same VA, and still the same physical 40 + * page!) 41 + * - Invalidate TLBs 42 + * - Set stack and vectors 43 + * - Profit! (or eret, if you only care about the code). 44 + * 45 + * As we only have four registers available to pass parameters (and we 46 + * need six), we split the init in two phases: 47 + * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD. 48 + * Provides the basic HYP init, and enable the MMU. 49 + * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD. 50 + * Switches to the runtime PGD, set stack and vectors. 31 51 */ 32 52 33 53 .text ··· 67 47 W(b) . 68 48 69 49 __do_hyp_init: 50 + cmp r0, #0 @ We have a SP? 51 + bne phase2 @ Yes, second stage init 52 + 70 53 @ Set the HTTBR to point to the hypervisor PGD pointer passed 71 - mcrr p15, 4, r0, r1, c2 54 + mcrr p15, 4, r2, r3, c2 72 55 73 56 @ Set the HTCR and VTCR to the same shareability and cacheability 74 57 @ settings as the non-secure TTBCR and with T0SZ == 0. 
75 58 mrc p15, 4, r0, c2, c0, 2 @ HTCR 76 - ldr r12, =HTCR_MASK 77 - bic r0, r0, r12 59 + ldr r2, =HTCR_MASK 60 + bic r0, r0, r2 78 61 mrc p15, 0, r1, c2, c0, 2 @ TTBCR 79 62 and r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ) 80 63 orr r0, r0, r1 81 64 mcr p15, 4, r0, c2, c0, 2 @ HTCR 82 65 83 66 mrc p15, 4, r1, c2, c1, 2 @ VTCR 84 - ldr r12, =VTCR_MASK 85 - bic r1, r1, r12 67 + ldr r2, =VTCR_MASK 68 + bic r1, r1, r2 86 69 bic r0, r0, #(~VTCR_HTCR_SH) @ clear non-reusable HTCR bits 87 70 orr r1, r0, r1 88 71 orr r1, r1, #(KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S) ··· 108 85 @ - Memory alignment checks: enabled 109 86 @ - MMU: enabled (this code must be run from an identity mapping) 110 87 mrc p15, 4, r0, c1, c0, 0 @ HSCR 111 - ldr r12, =HSCTLR_MASK 112 - bic r0, r0, r12 88 + ldr r2, =HSCTLR_MASK 89 + bic r0, r0, r2 113 90 mrc p15, 0, r1, c1, c0, 0 @ SCTLR 114 - ldr r12, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) 115 - and r1, r1, r12 116 - ARM( ldr r12, =(HSCTLR_M | HSCTLR_A) ) 117 - THUMB( ldr r12, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) 118 - orr r1, r1, r12 91 + ldr r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) 92 + and r1, r1, r2 93 + ARM( ldr r2, =(HSCTLR_M | HSCTLR_A) ) 94 + THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) 95 + orr r1, r1, r2 119 96 orr r0, r0, r1 120 97 isb 121 98 mcr p15, 4, r0, c1, c0, 0 @ HSCR 122 - isb 123 99 124 - @ Set stack pointer and return to the kernel 125 - mov sp, r2 100 + @ End of init phase-1 101 + eret 102 + 103 + phase2: 104 + @ Set stack pointer 105 + mov sp, r0 126 106 127 107 @ Set HVBAR to point to the HYP vectors 128 - mcr p15, 4, r3, c12, c0, 0 @ HVBAR 108 + mcr p15, 4, r1, c12, c0, 0 @ HVBAR 109 + 110 + @ Jump to the trampoline page 111 + ldr r0, =TRAMPOLINE_VA 112 + adr r1, target 113 + bfi r0, r1, #0, #PAGE_SHIFT 114 + mov pc, r0 115 + 116 + target: @ We're now in the trampoline code, switch page tables 117 + mcrr p15, 4, r2, r3, c2 118 + isb 119 + 120 + @ Invalidate the old TLBs 121 + mcr p15, 4, r0, c8, c7, 0 @ 
TLBIALLH 122 + dsb 129 123 130 124 eret 131 125
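The two-phase handshake described in the init.S comment block can be modeled in a few lines of C. This is only a sketch of the dispatch logic under stated assumptions (the struct, field names, and register values are illustrative; the real code is ARM assembly operating on CP15 registers):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Rough C model of __do_hyp_init. Phase 1 (r0 == 0) installs the boot
 * PGD and turns the MMU on; phase 2 (r0 = top of stack) switches to the
 * runtime PGD and sets stack and vectors. The hyp_state fields are
 * stand-ins for HTTBR, SP and HVBAR, not real register accessors.
 */
struct hyp_state {
    uint64_t httbr;     /* current HYP translation table base */
    uint32_t sp;        /* HYP stack pointer */
    uint32_t hvbar;     /* HYP vector base address */
    int mmu_on;
};

static void do_hyp_init(struct hyp_state *s, uint32_t r0, uint32_t r1,
                        uint32_t r2, uint32_t r3)
{
    /* r2,r3 carry a 64-bit PGD pointer, as in the mcrr to HTTBR */
    uint64_t pgd = (uint64_t)r3 << 32 | r2;

    if (r0 == 0) {              /* phase 1: boot PGD, enable the MMU */
        s->httbr = pgd;
        s->mmu_on = 1;
    } else {                    /* phase 2: runtime PGD, stack, vectors */
        s->sp = r0;
        s->hvbar = r1;
        s->httbr = pgd;         /* page-table switch done in the trampoline */
    }
}
```

Splitting the six parameters over two calls is what lets the code fit them into the four registers available at HYP entry.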
+343 -284
arch/arm/kvm/mmu.c
··· 32 32 33 33 extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; 34 34 35 + static pgd_t *boot_hyp_pgd; 36 + static pgd_t *hyp_pgd; 35 37 static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 38 + 39 + static void *init_bounce_page; 40 + static unsigned long hyp_idmap_start; 41 + static unsigned long hyp_idmap_end; 42 + static phys_addr_t hyp_idmap_vector; 36 43 37 44 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 38 45 { ··· 78 71 return p; 79 72 } 80 73 81 - static void free_ptes(pmd_t *pmd, unsigned long addr) 82 - { 83 - pte_t *pte; 84 - unsigned int i; 85 - 86 - for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { 87 - if (!pmd_none(*pmd) && pmd_table(*pmd)) { 88 - pte = pte_offset_kernel(pmd, addr); 89 - pte_free_kernel(NULL, pte); 90 - } 91 - pmd++; 92 - } 93 - } 94 - 95 - static void free_hyp_pgd_entry(unsigned long addr) 96 - { 97 - pgd_t *pgd; 98 - pud_t *pud; 99 - pmd_t *pmd; 100 - unsigned long hyp_addr = KERN_TO_HYP(addr); 101 - 102 - pgd = hyp_pgd + pgd_index(hyp_addr); 103 - pud = pud_offset(pgd, hyp_addr); 104 - 105 - if (pud_none(*pud)) 106 - return; 107 - BUG_ON(pud_bad(*pud)); 108 - 109 - pmd = pmd_offset(pud, hyp_addr); 110 - free_ptes(pmd, addr); 111 - pmd_free(NULL, pmd); 112 - pud_clear(pud); 113 - } 114 - 115 - /** 116 - * free_hyp_pmds - free a Hyp-mode level-2 tables and child level-3 tables 117 - * 118 - * Assumes this is a page table used strictly in Hyp-mode and therefore contains 119 - * either mappings in the kernel memory area (above PAGE_OFFSET), or 120 - * device mappings in the vmalloc range (from VMALLOC_START to VMALLOC_END). 
121 - */ 122 - void free_hyp_pmds(void) 123 - { 124 - unsigned long addr; 125 - 126 - mutex_lock(&kvm_hyp_pgd_mutex); 127 - for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) 128 - free_hyp_pgd_entry(addr); 129 - for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) 130 - free_hyp_pgd_entry(addr); 131 - mutex_unlock(&kvm_hyp_pgd_mutex); 132 - } 133 - 134 - static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, 135 - unsigned long end) 136 - { 137 - pte_t *pte; 138 - unsigned long addr; 139 - struct page *page; 140 - 141 - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 142 - unsigned long hyp_addr = KERN_TO_HYP(addr); 143 - 144 - pte = pte_offset_kernel(pmd, hyp_addr); 145 - BUG_ON(!virt_addr_valid(addr)); 146 - page = virt_to_page(addr); 147 - kvm_set_pte(pte, mk_pte(page, PAGE_HYP)); 148 - } 149 - } 150 - 151 - static void create_hyp_io_pte_mappings(pmd_t *pmd, unsigned long start, 152 - unsigned long end, 153 - unsigned long *pfn_base) 154 - { 155 - pte_t *pte; 156 - unsigned long addr; 157 - 158 - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 159 - unsigned long hyp_addr = KERN_TO_HYP(addr); 160 - 161 - pte = pte_offset_kernel(pmd, hyp_addr); 162 - BUG_ON(pfn_valid(*pfn_base)); 163 - kvm_set_pte(pte, pfn_pte(*pfn_base, PAGE_HYP_DEVICE)); 164 - (*pfn_base)++; 165 - } 166 - } 167 - 168 - static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, 169 - unsigned long end, unsigned long *pfn_base) 170 - { 171 - pmd_t *pmd; 172 - pte_t *pte; 173 - unsigned long addr, next; 174 - 175 - for (addr = start; addr < end; addr = next) { 176 - unsigned long hyp_addr = KERN_TO_HYP(addr); 177 - pmd = pmd_offset(pud, hyp_addr); 178 - 179 - BUG_ON(pmd_sect(*pmd)); 180 - 181 - if (pmd_none(*pmd)) { 182 - pte = pte_alloc_one_kernel(NULL, hyp_addr); 183 - if (!pte) { 184 - kvm_err("Cannot allocate Hyp pte\n"); 185 - return -ENOMEM; 186 - } 187 - pmd_populate_kernel(NULL, pmd, pte); 188 
- } 189 - 190 - next = pmd_addr_end(addr, end); 191 - 192 - /* 193 - * If pfn_base is NULL, we map kernel pages into HYP with the 194 - * virtual address. Otherwise, this is considered an I/O 195 - * mapping and we map the physical region starting at 196 - * *pfn_base to [start, end[. 197 - */ 198 - if (!pfn_base) 199 - create_hyp_pte_mappings(pmd, addr, next); 200 - else 201 - create_hyp_io_pte_mappings(pmd, addr, next, pfn_base); 202 - } 203 - 204 - return 0; 205 - } 206 - 207 - static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base) 208 - { 209 - unsigned long start = (unsigned long)from; 210 - unsigned long end = (unsigned long)to; 211 - pgd_t *pgd; 212 - pud_t *pud; 213 - pmd_t *pmd; 214 - unsigned long addr, next; 215 - int err = 0; 216 - 217 - if (start >= end) 218 - return -EINVAL; 219 - /* Check for a valid kernel memory mapping */ 220 - if (!pfn_base && (!virt_addr_valid(from) || !virt_addr_valid(to - 1))) 221 - return -EINVAL; 222 - /* Check for a valid kernel IO mapping */ 223 - if (pfn_base && (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))) 224 - return -EINVAL; 225 - 226 - mutex_lock(&kvm_hyp_pgd_mutex); 227 - for (addr = start; addr < end; addr = next) { 228 - unsigned long hyp_addr = KERN_TO_HYP(addr); 229 - pgd = hyp_pgd + pgd_index(hyp_addr); 230 - pud = pud_offset(pgd, hyp_addr); 231 - 232 - if (pud_none_or_clear_bad(pud)) { 233 - pmd = pmd_alloc_one(NULL, hyp_addr); 234 - if (!pmd) { 235 - kvm_err("Cannot allocate Hyp pmd\n"); 236 - err = -ENOMEM; 237 - goto out; 238 - } 239 - pud_populate(NULL, pud, pmd); 240 - } 241 - 242 - next = pgd_addr_end(addr, end); 243 - err = create_hyp_pmd_mappings(pud, addr, next, pfn_base); 244 - if (err) 245 - goto out; 246 - } 247 - out: 248 - mutex_unlock(&kvm_hyp_pgd_mutex); 249 - return err; 250 - } 251 - 252 - /** 253 - * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 254 - * @from: The virtual kernel start address of the range 255 - * @to: The virtual 
kernel end address of the range (exclusive) 256 - * 257 - * The same virtual address as the kernel virtual address is also used 258 - * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 259 - * physical pages. 260 - * 261 - * Note: Wrapping around zero in the "to" address is not supported. 262 - */ 263 - int create_hyp_mappings(void *from, void *to) 264 - { 265 - return __create_hyp_mappings(from, to, NULL); 266 - } 267 - 268 - /** 269 - * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode 270 - * @from: The kernel start VA of the range 271 - * @to: The kernel end VA of the range (exclusive) 272 - * @addr: The physical start address which gets mapped 273 - * 274 - * The resulting HYP VA is the same as the kernel VA, modulo 275 - * HYP_PAGE_OFFSET. 276 - */ 277 - int create_hyp_io_mappings(void *from, void *to, phys_addr_t addr) 278 - { 279 - unsigned long pfn = __phys_to_pfn(addr); 280 - return __create_hyp_mappings(from, to, &pfn); 281 - } 282 - 283 - /** 284 - * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. 285 - * @kvm: The KVM struct pointer for the VM. 286 - * 287 - * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can 288 - * support either full 40-bit input addresses or limited to 32-bit input 289 - * addresses). Clears the allocated pages. 290 - * 291 - * Note we don't need locking here as this is only called when the VM is 292 - * created, which can only be done once. 
293 - */ 294 - int kvm_alloc_stage2_pgd(struct kvm *kvm) 295 - { 296 - pgd_t *pgd; 297 - 298 - if (kvm->arch.pgd != NULL) { 299 - kvm_err("kvm_arch already initialized?\n"); 300 - return -EINVAL; 301 - } 302 - 303 - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER); 304 - if (!pgd) 305 - return -ENOMEM; 306 - 307 - /* stage-2 pgd must be aligned to its size */ 308 - VM_BUG_ON((unsigned long)pgd & (S2_PGD_SIZE - 1)); 309 - 310 - memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t)); 311 - kvm_clean_pgd(pgd); 312 - kvm->arch.pgd = pgd; 313 - 314 - return 0; 315 - } 316 - 317 74 static void clear_pud_entry(pud_t *pud) 318 75 { 319 76 pmd_t *pmd_table = pmd_offset(pud, 0); ··· 114 343 return page_count(pte_page) == 1; 115 344 } 116 345 117 - /** 118 - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range 119 - * @kvm: The VM pointer 120 - * @start: The intermediate physical base address of the range to unmap 121 - * @size: The size of the area to unmap 122 - * 123 - * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 124 - * be called while holding mmu_lock (unless for freeing the stage2 pgd before 125 - * destroying the VM), otherwise another faulting VCPU may come in and mess 126 - * with things behind our backs. 
127 - */ 128 - static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) 346 + static void unmap_range(pgd_t *pgdp, unsigned long long start, u64 size) 129 347 { 130 348 pgd_t *pgd; 131 349 pud_t *pud; 132 350 pmd_t *pmd; 133 351 pte_t *pte; 134 - phys_addr_t addr = start, end = start + size; 352 + unsigned long long addr = start, end = start + size; 135 353 u64 range; 136 354 137 355 while (addr < end) { 138 - pgd = kvm->arch.pgd + pgd_index(addr); 356 + pgd = pgdp + pgd_index(addr); 139 357 pud = pud_offset(pgd, addr); 140 358 if (pud_none(*pud)) { 141 359 addr += PUD_SIZE; ··· 153 393 154 394 addr += range; 155 395 } 396 + } 397 + 398 + /** 399 + * free_boot_hyp_pgd - free HYP boot page tables 400 + * 401 + * Free the HYP boot page tables. The bounce page is also freed. 402 + */ 403 + void free_boot_hyp_pgd(void) 404 + { 405 + mutex_lock(&kvm_hyp_pgd_mutex); 406 + 407 + if (boot_hyp_pgd) { 408 + unmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); 409 + unmap_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); 410 + kfree(boot_hyp_pgd); 411 + boot_hyp_pgd = NULL; 412 + } 413 + 414 + if (hyp_pgd) 415 + unmap_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); 416 + 417 + kfree(init_bounce_page); 418 + init_bounce_page = NULL; 419 + 420 + mutex_unlock(&kvm_hyp_pgd_mutex); 421 + } 422 + 423 + /** 424 + * free_hyp_pgds - free Hyp-mode page tables 425 + * 426 + * Assumes hyp_pgd is a page table used strictly in Hyp-mode and 427 + * therefore contains either mappings in the kernel memory area (above 428 + * PAGE_OFFSET), or device mappings in the vmalloc range (from 429 + * VMALLOC_START to VMALLOC_END). 430 + * 431 + * boot_hyp_pgd should only map two pages for the init code. 
432 + */ 433 + void free_hyp_pgds(void) 434 + { 435 + unsigned long addr; 436 + 437 + free_boot_hyp_pgd(); 438 + 439 + mutex_lock(&kvm_hyp_pgd_mutex); 440 + 441 + if (hyp_pgd) { 442 + for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) 443 + unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); 444 + for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) 445 + unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); 446 + kfree(hyp_pgd); 447 + hyp_pgd = NULL; 448 + } 449 + 450 + mutex_unlock(&kvm_hyp_pgd_mutex); 451 + } 452 + 453 + static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, 454 + unsigned long end, unsigned long pfn, 455 + pgprot_t prot) 456 + { 457 + pte_t *pte; 458 + unsigned long addr; 459 + 460 + addr = start; 461 + do { 462 + pte = pte_offset_kernel(pmd, addr); 463 + kvm_set_pte(pte, pfn_pte(pfn, prot)); 464 + get_page(virt_to_page(pte)); 465 + kvm_flush_dcache_to_poc(pte, sizeof(*pte)); 466 + pfn++; 467 + } while (addr += PAGE_SIZE, addr != end); 468 + } 469 + 470 + static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, 471 + unsigned long end, unsigned long pfn, 472 + pgprot_t prot) 473 + { 474 + pmd_t *pmd; 475 + pte_t *pte; 476 + unsigned long addr, next; 477 + 478 + addr = start; 479 + do { 480 + pmd = pmd_offset(pud, addr); 481 + 482 + BUG_ON(pmd_sect(*pmd)); 483 + 484 + if (pmd_none(*pmd)) { 485 + pte = pte_alloc_one_kernel(NULL, addr); 486 + if (!pte) { 487 + kvm_err("Cannot allocate Hyp pte\n"); 488 + return -ENOMEM; 489 + } 490 + pmd_populate_kernel(NULL, pmd, pte); 491 + get_page(virt_to_page(pmd)); 492 + kvm_flush_dcache_to_poc(pmd, sizeof(*pmd)); 493 + } 494 + 495 + next = pmd_addr_end(addr, end); 496 + 497 + create_hyp_pte_mappings(pmd, addr, next, pfn, prot); 498 + pfn += (next - addr) >> PAGE_SHIFT; 499 + } while (addr = next, addr != end); 500 + 501 + return 0; 502 + } 503 + 504 + static int __create_hyp_mappings(pgd_t *pgdp, 505 + unsigned long start, unsigned long end, 
506 + unsigned long pfn, pgprot_t prot) 507 + { 508 + pgd_t *pgd; 509 + pud_t *pud; 510 + pmd_t *pmd; 511 + unsigned long addr, next; 512 + int err = 0; 513 + 514 + mutex_lock(&kvm_hyp_pgd_mutex); 515 + addr = start & PAGE_MASK; 516 + end = PAGE_ALIGN(end); 517 + do { 518 + pgd = pgdp + pgd_index(addr); 519 + pud = pud_offset(pgd, addr); 520 + 521 + if (pud_none_or_clear_bad(pud)) { 522 + pmd = pmd_alloc_one(NULL, addr); 523 + if (!pmd) { 524 + kvm_err("Cannot allocate Hyp pmd\n"); 525 + err = -ENOMEM; 526 + goto out; 527 + } 528 + pud_populate(NULL, pud, pmd); 529 + get_page(virt_to_page(pud)); 530 + kvm_flush_dcache_to_poc(pud, sizeof(*pud)); 531 + } 532 + 533 + next = pgd_addr_end(addr, end); 534 + err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); 535 + if (err) 536 + goto out; 537 + pfn += (next - addr) >> PAGE_SHIFT; 538 + } while (addr = next, addr != end); 539 + out: 540 + mutex_unlock(&kvm_hyp_pgd_mutex); 541 + return err; 542 + } 543 + 544 + /** 545 + * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode 546 + * @from: The virtual kernel start address of the range 547 + * @to: The virtual kernel end address of the range (exclusive) 548 + * 549 + * The same virtual address as the kernel virtual address is also used 550 + * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 551 + * physical pages. 
552 + */ 553 + int create_hyp_mappings(void *from, void *to) 554 + { 555 + unsigned long phys_addr = virt_to_phys(from); 556 + unsigned long start = KERN_TO_HYP((unsigned long)from); 557 + unsigned long end = KERN_TO_HYP((unsigned long)to); 558 + 559 + /* Check for a valid kernel memory mapping */ 560 + if (!virt_addr_valid(from) || !virt_addr_valid(to - 1)) 561 + return -EINVAL; 562 + 563 + return __create_hyp_mappings(hyp_pgd, start, end, 564 + __phys_to_pfn(phys_addr), PAGE_HYP); 565 + } 566 + 567 + /** 568 + * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode 569 + * @from: The kernel start VA of the range 570 + * @to: The kernel end VA of the range (exclusive) 571 + * @phys_addr: The physical start address which gets mapped 572 + * 573 + * The resulting HYP VA is the same as the kernel VA, modulo 574 + * HYP_PAGE_OFFSET. 575 + */ 576 + int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) 577 + { 578 + unsigned long start = KERN_TO_HYP((unsigned long)from); 579 + unsigned long end = KERN_TO_HYP((unsigned long)to); 580 + 581 + /* Check for a valid kernel IO mapping */ 582 + if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)) 583 + return -EINVAL; 584 + 585 + return __create_hyp_mappings(hyp_pgd, start, end, 586 + __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); 587 + } 588 + 589 + /** 590 + * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. 591 + * @kvm: The KVM struct pointer for the VM. 592 + * 593 + * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can 594 + * support either full 40-bit input addresses or limited to 32-bit input 595 + * addresses). Clears the allocated pages. 596 + * 597 + * Note we don't need locking here as this is only called when the VM is 598 + * created, which can only be done once. 
599 + */ 600 + int kvm_alloc_stage2_pgd(struct kvm *kvm) 601 + { 602 + pgd_t *pgd; 603 + 604 + if (kvm->arch.pgd != NULL) { 605 + kvm_err("kvm_arch already initialized?\n"); 606 + return -EINVAL; 607 + } 608 + 609 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER); 610 + if (!pgd) 611 + return -ENOMEM; 612 + 613 + /* stage-2 pgd must be aligned to its size */ 614 + VM_BUG_ON((unsigned long)pgd & (S2_PGD_SIZE - 1)); 615 + 616 + memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t)); 617 + kvm_clean_pgd(pgd); 618 + kvm->arch.pgd = pgd; 619 + 620 + return 0; 621 + } 622 + 623 + /** 624 + * unmap_stage2_range -- Clear stage2 page table entries to unmap a range 625 + * @kvm: The VM pointer 626 + * @start: The intermediate physical base address of the range to unmap 627 + * @size: The size of the area to unmap 628 + * 629 + * Clear a range of stage-2 mappings, lowering the various ref-counts. Must 630 + * be called while holding mmu_lock (unless for freeing the stage2 pgd before 631 + * destroying the VM), otherwise another faulting VCPU may come in and mess 632 + * with things behind our backs. 
633 + */ 634 + static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) 635 + { 636 + unmap_range(kvm->arch.pgd, start, size); 156 637 } 157 638 158 639 /** ··· 729 728 730 729 phys_addr_t kvm_mmu_get_httbr(void) 731 730 { 732 - VM_BUG_ON(!virt_addr_valid(hyp_pgd)); 733 731 return virt_to_phys(hyp_pgd); 732 + } 733 + 734 + phys_addr_t kvm_mmu_get_boot_httbr(void) 735 + { 736 + return virt_to_phys(boot_hyp_pgd); 737 + } 738 + 739 + phys_addr_t kvm_get_idmap_vector(void) 740 + { 741 + return hyp_idmap_vector; 734 742 } 735 743 736 744 int kvm_mmu_init(void) 737 745 { 738 - if (!hyp_pgd) { 746 + int err; 747 + 748 + hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start); 749 + hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end); 750 + hyp_idmap_vector = virt_to_phys(__kvm_hyp_init); 751 + 752 + if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) { 753 + /* 754 + * Our init code is crossing a page boundary. Allocate 755 + * a bounce page, copy the code over and use that. 756 + */ 757 + size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start; 758 + phys_addr_t phys_base; 759 + 760 + init_bounce_page = kmalloc(PAGE_SIZE, GFP_KERNEL); 761 + if (!init_bounce_page) { 762 + kvm_err("Couldn't allocate HYP init bounce page\n"); 763 + err = -ENOMEM; 764 + goto out; 765 + } 766 + 767 + memcpy(init_bounce_page, __hyp_idmap_text_start, len); 768 + /* 769 + * Warning: the code we just copied to the bounce page 770 + * must be flushed to the point of coherency. 771 + * Otherwise, the data may be sitting in L2, and HYP 772 + * mode won't be able to observe it as it runs with 773 + * caches off at that point. 
774 + */ 775 + kvm_flush_dcache_to_poc(init_bounce_page, len); 776 + 777 + phys_base = virt_to_phys(init_bounce_page); 778 + hyp_idmap_vector += phys_base - hyp_idmap_start; 779 + hyp_idmap_start = phys_base; 780 + hyp_idmap_end = phys_base + len; 781 + 782 + kvm_info("Using HYP init bounce page @%lx\n", 783 + (unsigned long)phys_base); 784 + } 785 + 786 + hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); 787 + boot_hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); 788 + if (!hyp_pgd || !boot_hyp_pgd) { 739 789 kvm_err("Hyp mode PGD not allocated\n"); 740 - return -ENOMEM; 790 + err = -ENOMEM; 791 + goto out; 792 + } 793 + 794 + /* Create the idmap in the boot page tables */ 795 + err = __create_hyp_mappings(boot_hyp_pgd, 796 + hyp_idmap_start, hyp_idmap_end, 797 + __phys_to_pfn(hyp_idmap_start), 798 + PAGE_HYP); 799 + 800 + if (err) { 801 + kvm_err("Failed to idmap %lx-%lx\n", 802 + hyp_idmap_start, hyp_idmap_end); 803 + goto out; 804 + } 805 + 806 + /* Map the very same page at the trampoline VA */ 807 + err = __create_hyp_mappings(boot_hyp_pgd, 808 + TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, 809 + __phys_to_pfn(hyp_idmap_start), 810 + PAGE_HYP); 811 + if (err) { 812 + kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n", 813 + TRAMPOLINE_VA); 814 + goto out; 815 + } 816 + 817 + /* Map the same page again into the runtime page tables */ 818 + err = __create_hyp_mappings(hyp_pgd, 819 + TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, 820 + __phys_to_pfn(hyp_idmap_start), 821 + PAGE_HYP); 822 + if (err) { 823 + kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n", 824 + TRAMPOLINE_VA); 825 + goto out; 741 826 } 742 827 743 828 return 0; 744 - } 745 - 746 - /** 747 - * kvm_clear_idmap - remove all idmaps from the hyp pgd 748 - * 749 - * Free the underlying pmds for all pgds in range and clear the pgds (but 750 - * don't free them) afterwards. 
751 - */ 752 - void kvm_clear_hyp_idmap(void) 753 - { 754 - unsigned long addr, end; 755 - unsigned long next; 756 - pgd_t *pgd = hyp_pgd; 757 - pud_t *pud; 758 - pmd_t *pmd; 759 - 760 - addr = virt_to_phys(__hyp_idmap_text_start); 761 - end = virt_to_phys(__hyp_idmap_text_end); 762 - 763 - pgd += pgd_index(addr); 764 - do { 765 - next = pgd_addr_end(addr, end); 766 - if (pgd_none_or_clear_bad(pgd)) 767 - continue; 768 - pud = pud_offset(pgd, addr); 769 - pmd = pmd_offset(pud, addr); 770 - 771 - pud_clear(pud); 772 - kvm_clean_pmd_entry(pmd); 773 - pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK)); 774 - } while (pgd++, addr = next, addr < end); 829 + out: 830 + free_hyp_pgds(); 831 + return err; 775 832 }
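The bounce-page path added to kvm_mmu_init() triggers on a single test: `(hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK`, i.e. the start and end of the init code differ somewhere above the page-offset bits, so the code straddles a page boundary. A standalone version of that check (the 4K PAGE_SHIFT is an assumption, matching ARM's default page size):

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12                     /* assumed: 4K pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/*
 * True if [start, end) spans more than one page: any difference in the
 * bits above the page offset means the two addresses live in
 * different pages. This is the same XOR trick kvm_mmu_init() uses to
 * decide whether the HYP init code needs the bounce page.
 */
static int crosses_page_boundary(uintptr_t start, uintptr_t end)
{
    return ((start ^ end) & PAGE_MASK) != 0;
}
```

When the check fires, the init code is copied into a freshly allocated page (which by construction cannot cross a boundary) and flushed to the point of coherency, since HYP mode runs with caches off at that point.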
+68
arch/arm/kvm/perf.c
··· 1 + /* 2 + * Based on the x86 implementation. 3 + * 4 + * Copyright (C) 2012 ARM Ltd. 5 + * Author: Marc Zyngier <marc.zyngier@arm.com> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + * 11 + * This program is distributed in the hope that it will be useful, 12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 + * GNU General Public License for more details. 15 + * 16 + * You should have received a copy of the GNU General Public License 17 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 18 + */ 19 + 20 + #include <linux/perf_event.h> 21 + #include <linux/kvm_host.h> 22 + 23 + #include <asm/kvm_emulate.h> 24 + 25 + static int kvm_is_in_guest(void) 26 + { 27 + return kvm_arm_get_running_vcpu() != NULL; 28 + } 29 + 30 + static int kvm_is_user_mode(void) 31 + { 32 + struct kvm_vcpu *vcpu; 33 + 34 + vcpu = kvm_arm_get_running_vcpu(); 35 + 36 + if (vcpu) 37 + return !vcpu_mode_priv(vcpu); 38 + 39 + return 0; 40 + } 41 + 42 + static unsigned long kvm_get_guest_ip(void) 43 + { 44 + struct kvm_vcpu *vcpu; 45 + 46 + vcpu = kvm_arm_get_running_vcpu(); 47 + 48 + if (vcpu) 49 + return *vcpu_pc(vcpu); 50 + 51 + return 0; 52 + } 53 + 54 + static struct perf_guest_info_callbacks kvm_guest_cbs = { 55 + .is_in_guest = kvm_is_in_guest, 56 + .is_user_mode = kvm_is_user_mode, 57 + .get_guest_ip = kvm_get_guest_ip, 58 + }; 59 + 60 + int kvm_perf_init(void) 61 + { 62 + return perf_register_guest_info_callbacks(&kvm_guest_cbs); 63 + } 64 + 65 + int kvm_perf_teardown(void) 66 + { 67 + return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); 68 + }
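perf.c above only fills in and registers the callback table; the perf core consults it at sample time to decide whether a PMU interrupt hit guest or host context. A sketch of how such a table might be consumed (the consumer-side names and enum here are illustrative, not the perf core's):

```c
#include <assert.h>
#include <stddef.h>

/* Mirrors the shape of the perf_guest_info_callbacks registered above. */
struct guest_cbs {
    int (*is_in_guest)(void);
    int (*is_user_mode)(void);
};

enum sample_origin { HOST, GUEST_KERNEL, GUEST_USER };

/*
 * Classify a PMU sample: with no callbacks registered, or when no
 * vcpu is running, attribute it to the host; otherwise split on the
 * guest's privilege level, as kvm_is_user_mode() reports it.
 */
static enum sample_origin classify(const struct guest_cbs *cbs)
{
    if (cbs == NULL || !cbs->is_in_guest())
        return HOST;
    return cbs->is_user_mode() ? GUEST_USER : GUEST_KERNEL;
}

/* Trivial callback implementations for demonstration. */
static int yes(void) { return 1; }
static int no(void)  { return 0; }
```

This is why kvm_is_in_guest() keys off kvm_arm_get_running_vcpu(): a non-NULL running vcpu at interrupt time is what marks the sample as guest-side.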
+1 -31
arch/arm/mm/idmap.c
··· 8 8 #include <asm/pgtable.h> 9 9 #include <asm/sections.h> 10 10 #include <asm/system_info.h> 11 - #include <asm/virt.h> 12 11 13 12 pgd_t *idmap_pgd; 14 13 ··· 82 83 } while (pgd++, addr = next, addr != end); 83 84 } 84 85 85 - #if defined(CONFIG_ARM_VIRT_EXT) && defined(CONFIG_ARM_LPAE) 86 - pgd_t *hyp_pgd; 87 - 88 - extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; 89 - 90 - static int __init init_static_idmap_hyp(void) 91 - { 92 - hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); 93 - if (!hyp_pgd) 94 - return -ENOMEM; 95 - 96 - pr_info("Setting up static HYP identity map for 0x%p - 0x%p\n", 97 - __hyp_idmap_text_start, __hyp_idmap_text_end); 98 - identity_mapping_add(hyp_pgd, __hyp_idmap_text_start, 99 - __hyp_idmap_text_end, PMD_SECT_AP1); 100 - 101 - return 0; 102 - } 103 - #else 104 - static int __init init_static_idmap_hyp(void) 105 - { 106 - return 0; 107 - } 108 - #endif 109 - 110 86 extern char __idmap_text_start[], __idmap_text_end[]; 111 87 112 88 static int __init init_static_idmap(void) 113 89 { 114 - int ret; 115 - 116 90 idmap_pgd = pgd_alloc(&init_mm); 117 91 if (!idmap_pgd) 118 92 return -ENOMEM; ··· 95 123 identity_mapping_add(idmap_pgd, __idmap_text_start, 96 124 __idmap_text_end, 0); 97 125 98 - ret = init_static_idmap_hyp(); 99 - 100 126 /* Flush L1 for the hardware to see this page table content */ 101 127 flush_cache_louis(); 102 128 103 - return ret; 129 + return 0; 104 130 } 105 131 early_initcall(init_static_idmap); 106 132
+1
arch/ia64/include/asm/kvm_host.h
··· 26 26 #define KVM_USER_MEM_SLOTS 32 27 27 28 28 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 29 + #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS 29 30 30 31 /* define exit reasons from vmm to kvm*/ 31 32 #define EXIT_REASON_VM_PANIC 0
-1
arch/ia64/include/uapi/asm/kvm.h
··· 27 27 /* Select x86 specific features in <linux/kvm.h> */ 28 28 #define __KVM_HAVE_IOAPIC 29 29 #define __KVM_HAVE_IRQ_LINE 30 - #define __KVM_HAVE_DEVICE_ASSIGNMENT 31 30 32 31 /* Architectural interrupt line count. */ 33 32 #define KVM_NR_INTERRUPTS 256
+12 -2
arch/ia64/kvm/Kconfig
··· 21 21 tristate "Kernel-based Virtual Machine (KVM) support" 22 22 depends on BROKEN 23 23 depends on HAVE_KVM && MODULES 24 - # for device assignment: 25 - depends on PCI 26 24 depends on BROKEN 27 25 select PREEMPT_NOTIFIERS 28 26 select ANON_INODES 29 27 select HAVE_KVM_IRQCHIP 28 + select HAVE_KVM_IRQ_ROUTING 30 29 select KVM_APIC_ARCHITECTURE 31 30 select KVM_MMIO 32 31 ---help--- ··· 48 49 ---help--- 49 50 Provides support for KVM on Itanium 2 processors equipped with the VT 50 51 extensions. 52 + 53 + config KVM_DEVICE_ASSIGNMENT 54 + bool "KVM legacy PCI device assignment support" 55 + depends on KVM && PCI && IOMMU_API 56 + default y 57 + ---help--- 58 + Provide support for legacy PCI device assignment through KVM. The 59 + kernel now also supports a full featured userspace device driver 60 + framework through VFIO, which supersedes much of this support. 61 + 62 + If unsure, say Y. 51 63 52 64 source drivers/vhost/Kconfig 53 65
+3 -3
arch/ia64/kvm/Makefile
··· 49 49 asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ 50 50 51 51 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 52 - coalesced_mmio.o irq_comm.o assigned-dev.o) 52 + coalesced_mmio.o irq_comm.o) 53 53 54 - ifeq ($(CONFIG_IOMMU_API),y) 55 - common-objs += $(addprefix ../../../virt/kvm/, iommu.o) 54 + ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y) 55 + common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o) 56 56 endif 57 57 58 58 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
+9 -26
arch/ia64/kvm/kvm-ia64.c
··· 204 204 case KVM_CAP_COALESCED_MMIO: 205 205 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 206 206 break; 207 + #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 207 208 case KVM_CAP_IOMMU: 208 209 r = iommu_present(&pci_bus_type); 209 210 break; 211 + #endif 210 212 default: 211 213 r = 0; 212 214 } ··· 926 924 return 0; 927 925 } 928 926 929 - int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) 927 + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, 928 + bool line_status) 930 929 { 931 930 if (!irqchip_in_kernel(kvm)) 932 931 return -ENXIO; 933 932 934 933 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 935 - irq_event->irq, irq_event->level); 934 + irq_event->irq, irq_event->level, 935 + line_status); 936 936 return 0; 937 937 } 938 938 ··· 946 942 int r = -ENOTTY; 947 943 948 944 switch (ioctl) { 949 - case KVM_SET_MEMORY_REGION: { 950 - struct kvm_memory_region kvm_mem; 951 - struct kvm_userspace_memory_region kvm_userspace_mem; 952 - 953 - r = -EFAULT; 954 - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 955 - goto out; 956 - kvm_userspace_mem.slot = kvm_mem.slot; 957 - kvm_userspace_mem.flags = kvm_mem.flags; 958 - kvm_userspace_mem.guest_phys_addr = 959 - kvm_mem.guest_phys_addr; 960 - kvm_userspace_mem.memory_size = kvm_mem.memory_size; 961 - r = kvm_vm_ioctl_set_memory_region(kvm, 962 - &kvm_userspace_mem, false); 963 - if (r) 964 - goto out; 965 - break; 966 - } 967 945 case KVM_CREATE_IRQCHIP: 968 946 r = -EFAULT; 969 947 r = kvm_ioapic_init(kvm); ··· 1370 1384 void kvm_arch_destroy_vm(struct kvm *kvm) 1371 1385 { 1372 1386 kvm_iommu_unmap_guest(kvm); 1373 - #ifdef KVM_CAP_DEVICE_ASSIGNMENT 1374 1387 kvm_free_all_assigned_devices(kvm); 1375 - #endif 1376 1388 kfree(kvm->arch.vioapic); 1377 1389 kvm_release_vm_pages(kvm); 1378 1390 } ··· 1562 1578 1563 1579 int kvm_arch_prepare_memory_region(struct kvm *kvm, 1564 1580 struct kvm_memory_slot *memslot, 1565 - struct kvm_memory_slot old, 1566 1581 struct 
kvm_userspace_memory_region *mem, 1567 - bool user_alloc) 1582 + enum kvm_mr_change change) 1568 1583 { 1569 1584 unsigned long i; 1570 1585 unsigned long pfn; ··· 1593 1610 1594 1611 void kvm_arch_commit_memory_region(struct kvm *kvm, 1595 1612 struct kvm_userspace_memory_region *mem, 1596 - struct kvm_memory_slot old, 1597 - bool user_alloc) 1613 + const struct kvm_memory_slot *old, 1614 + enum kvm_mr_change change) 1598 1615 { 1599 1616 return; 1600 1617 }
-6
arch/ia64/kvm/lapic.h
··· 27 27 #define kvm_apic_present(x) (true) 28 28 #define kvm_lapic_enabled(x) (true) 29 29 30 - static inline bool kvm_apic_vid_enabled(void) 31 - { 32 - /* IA64 has no apicv supporting, do nothing here */ 33 - return false; 34 - } 35 - 36 30 #endif
+3
arch/powerpc/include/asm/hvcall.h
··· 270 270 #define H_SET_MODE 0x31C 271 271 #define MAX_HCALL_OPCODE H_SET_MODE 272 272 273 + /* Platform specific hcalls, used by KVM */ 274 + #define H_RTAS 0xf000 275 + 273 276 #ifndef __ASSEMBLY__ 274 277 275 278 /**
+6 -1
arch/powerpc/include/asm/kvm_book3s.h
··· 142 142 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 143 143 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 144 144 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); 145 + extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, 146 + unsigned int vec); 145 147 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags); 146 148 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, 147 149 bool upper, u32 val); ··· 158 156 unsigned long pte_index); 159 157 extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, 160 158 unsigned long *nb_ret); 161 - extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); 159 + extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr, 160 + unsigned long gpa, bool dirty); 162 161 extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 163 162 long pte_index, unsigned long pteh, unsigned long ptel); 164 163 extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, ··· 461 458 #define OSI_SC_MAGIC_R4 0x77810F9B 462 459 463 460 #define INS_DCBZ 0x7c0007ec 461 + /* TO = 31 for unconditional trap */ 462 + #define INS_TW 0x7fe00008 464 463 465 464 /* LPIDs we support with this build -- runtime limit may be lower */ 466 465 #define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
+13
arch/powerpc/include/asm/kvm_book3s_64.h
···
		(HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
 }

+#ifdef CONFIG_KVM_BOOK3S_64_HV
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+					  struct revmap_entry *rev)
+{
+	if (atomic_read(&kvm->arch.hpte_mod_interest))
+		rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
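The `note_hpte_modification()` helper added above only records a modification when some consumer (the HPT save/restore interface) has declared interest via an atomic counter, so the common fault path pays for nothing more than one flag test. A minimal sketch of that pattern, using plain types as stand-ins for the kernel's `atomic_t` and real bit values (the names and `HPTE_GR_MODIFIED` value here are illustrative, not the kernel's):

```c
#include <assert.h>

#define HPTE_GR_MODIFIED (1UL << 62)	/* stand-in for the kernel define */

struct revmap_entry { unsigned long guest_rpte; };
struct kvm_arch { int hpte_mod_interest; };	/* atomic_t in the kernel */
struct kvm { struct kvm_arch arch; };

/* Record a modification only if a consumer has registered interest,
 * keeping the hot path to a single flag test. */
static void note_hpte_modification(struct kvm *kvm, struct revmap_entry *rev)
{
	if (kvm->arch.hpte_mod_interest)
		rev->guest_rpte |= HPTE_GR_MODIFIED;
}
```

The kernel version reads the counter with `atomic_read()`; the counter is raised while a migration pass holds the HPT fd open, and dropped afterwards.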
+7 -1
arch/powerpc/include/asm/kvm_book3s_asm.h
···
 #ifndef __ASM_KVM_BOOK3S_ASM_H__
 #define __ASM_KVM_BOOK3S_ASM_H__

+/* XICS ICP register offsets */
+#define XICS_XIRR		4
+#define XICS_MFRR		0xc
+#define XICS_IPI		2	/* interrupt source # for IPIs */
+
 #ifdef __ASSEMBLY__

 #ifdef CONFIG_KVM_BOOK3S_HANDLER
···
 #ifdef CONFIG_KVM_BOOK3S_64_HV
	u8 hwthread_req;
	u8 hwthread_state;
-
+	u8 host_ipi;
	struct kvm_vcpu *kvm_vcpu;
	struct kvmppc_vcore *kvm_vcore;
	unsigned long xics_phys;
+	u32 saved_xirr;
	u64 dabr;
	u64 host_mmcr[3];
	u32 host_pmc[8];
+2
arch/powerpc/include/asm/kvm_booke.h
···
 /* LPIDs we support with this build -- runtime limit may be lower */
 #define KVMPPC_NR_LPIDS			64

+#define KVMPPC_INST_EHPRIV	0x7c00021c
+
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
	vcpu->arch.gpr[num] = val;
+40 -1
arch/powerpc/include/asm/kvm_host.h
···
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif

+/* These values are internal and can be increased later */
+#define KVM_NR_IRQCHIPS		1
+#define KVM_IRQCHIP_NUM_PINS	256
+
 #if !defined(CONFIG_KVM_440)
 #include <linux/mmu_notifier.h>

···
	int type;
 };

+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
···
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_PPC_BOOK3S_64
	struct list_head spapr_tce_tables;
+	struct list_head rtas_tokens;
+#endif
+#ifdef CONFIG_KVM_MPIC
+	struct openpic *mpic;
+#endif
+#ifdef CONFIG_KVM_XICS
+	struct kvmppc_xics *xics;
 #endif
 };

···
  * that a guest can register.
  */
 struct kvmppc_vpa {
+	unsigned long gpa;	/* Current guest phys addr */
	void *pinned_addr;	/* Address in kernel linear mapping */
	void *pinned_end;	/* End of region */
	unsigned long next_gpa;	/* Guest phys addr for update */
	unsigned long len;	/* Number of bytes required */
	u8 update_pending;	/* 1 => update pinned_addr from next_gpa */
+	bool dirty;		/* true => area has been modified by kernel */
 };

 struct kvmppc_pte {
···
 #define KVMPPC_BOOKE_MAX_IAC 4
 #define KVMPPC_BOOKE_MAX_DAC 2

+/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */
+#define KVMPPC_EPR_NONE		0 /* EPR not supported */
+#define KVMPPC_EPR_USER		1 /* exit to userspace to fill EPR */
+#define KVMPPC_EPR_KERNEL	2 /* in-kernel irqchip */
+
 struct kvmppc_booke_debug_reg {
	u32 dbcr0;
	u32 dbcr1;
···
	u64 iac[KVMPPC_BOOKE_MAX_IAC];
	u64 dac[KVMPPC_BOOKE_MAX_DAC];
 };
+
+#define KVMPPC_IRQ_DEFAULT	0
+#define KVMPPC_IRQ_MPIC		1
+#define KVMPPC_IRQ_XICS		2
+
+struct openpic;

 struct kvm_vcpu_arch {
	ulong host_stack;
···
	spinlock_t wdt_lock;
	struct timer_list wdt_timer;
	u32 tlbcfg[4];
+	u32 tlbps[4];
	u32 mmucfg;
+	u32 eptcfg;
	u32 epr;
+	u32 crit_save;
	struct kvmppc_booke_debug_reg dbg_reg;
 #endif
	gpa_t paddr_accessed;
···
	u8 sane;
	u8 cpu_type;
	u8 hcall_needed;
-	u8 epr_enabled;
+	u8 epr_flags; /* KVMPPC_EPR_xxx */
	u8 epr_needed;

	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
···
	struct kvm_vcpu_arch_shared *shared;
	unsigned long magic_page_pa; /* phys addr to map the magic page to */
	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
+
+	int irq_type;		/* one of KVM_IRQ_* */
+	int irq_cpu_id;
+	struct openpic *mpic;	/* KVM_IRQ_MPIC */
+#ifdef CONFIG_KVM_XICS
+	struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif

 #ifdef CONFIG_KVM_BOOK3S_64_HV
	struct kvm_vcpu_arch_shared shregs;
···
 #define KVM_MMIO_REG_FQPR	0x0060

 #define __KVM_HAVE_ARCH_WQP
+#define __KVM_HAVE_CREATE_DEVICE

 #endif /* __POWERPC_KVM_HOST_H__ */
+109 -5
arch/powerpc/include/asm/kvm_ppc.h
···
	EMULATE_DO_DCR,       /* kvm_run filled with DCR request */
	EMULATE_FAIL,         /* can't emulate this instruction */
	EMULATE_AGAIN,        /* something went wrong. go again */
-	EMULATE_DO_PAPR,      /* kvm_run filled with PAPR request */
+	EMULATE_EXIT_USER,    /* emulation requires exit to user-space */
 };

 extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
···
 extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
				       struct kvm_interrupt *irq);
-extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-					 struct kvm_interrupt *irq);
+extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);

 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
···
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
				struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
···
				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old);
+				const struct kvm_memory_slot *old);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
				      struct kvm_ppc_smmu_info *info);
 extern void kvmppc_core_flush_memslot(struct kvm *kvm,
···
 extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);

 extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
+
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
+
+extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
+extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
+extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
+				u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+				u32 *priority);
+extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
+extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);

 /*
  * Cuts out inst bits with ordering according to spec.
···

 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);

+struct openpic;
+
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
	paca[cpu].kvm_hstate.xics_phys = addr;
 }

+static inline u32 kvmppc_get_xics_latch(void)
+{
+	u32 xirr = get_paca()->kvm_hstate.saved_xirr;
+
+	get_paca()->kvm_hstate.saved_xirr = 0;
+
+	return xirr;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{
+	paca[cpu].kvm_hstate.host_ipi = host_ipi;
+}
+
+extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
 extern void kvm_linear_init(void);

 #else
···

 static inline void kvm_linear_init(void)
 {}
+
+static inline u32 kvmppc_get_xics_latch(void)
+{
+	return 0;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{}
+
+static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_kick(vcpu);
+}
+#endif
+
+#ifdef CONFIG_KVM_XICS
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
+}
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
+extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
+extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
+				    struct kvm_vcpu *vcpu, u32 cpu);
+#else
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+	{ return 0; }
+static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
+static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
+					 unsigned long server)
+	{ return -EINVAL; }
+static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
+					struct kvm_irq_level *args)
+	{ return -ENOTTY; }
+static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+	{ return 0; }
 #endif

 static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
···
	vcpu->arch.epr = epr;
 #endif
 }
+
+#ifdef CONFIG_KVM_MPIC
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 cpu);
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu);
+
+#else
+
+static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int kvmppc_mpic_connect_vcpu(struct kvm_device *dev,
+					   struct kvm_vcpu *vcpu, u32 cpu)
+{
+	return -EINVAL;
+}
+
+static inline void kvmppc_mpic_disconnect_vcpu(struct openpic *opp,
+					       struct kvm_vcpu *vcpu)
+{
+}
+
+#endif /* CONFIG_KVM_MPIC */

 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
			      struct kvm_config_tlb *cfg);
···

 static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
 {
-	/* Clear i-cache for new pages */
	struct page *page;
+	/*
+	 * We can only access pages that the kernel maps
+	 * as memory. Bail out for unmapped ones.
+	 */
+	if (!pfn_valid(pfn))
+		return;
+
+	/* Clear i-cache for new pages */
	page = pfn_to_page(pfn);
	if (!test_bit(PG_arch_1, &page->flags)) {
		flush_dcache_icache_page(page);
···

	return ea;
 }
+
+extern void xics_wake_cpu(int cpu);

 #endif /* __POWERPC_KVM_PPC_H__ */
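The new `kvmppc_get_xics_latch()` helper above has consume-once semantics: the real-mode interrupt handler latches the XIRR value in per-CPU host state, and the first reader both returns it and clears the latch so the interrupt is delivered exactly once. A minimal user-space sketch of that contract (the `struct kvm_hstate` here is a hypothetical stand-in for the kernel's per-CPU paca field):

```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical stand-in for the per-CPU kvm_hstate in the paca. */
struct kvm_hstate { uint32_t saved_xirr; };
static struct kvm_hstate hstate;

/* Consume-once read: return the latched XIRR and clear the latch,
 * so a second reader sees 0 (no pending interrupt). */
static uint32_t get_xics_latch(void)
{
	uint32_t xirr = hstate.saved_xirr;

	hstate.saved_xirr = 0;
	return xirr;
}
```

In the kernel the same pattern runs per-CPU with preemption implicitly disabled by context, so no extra locking is needed around the read-then-clear pair.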
+1
arch/powerpc/include/asm/reg.h
···
 #define   LPCR_PECE1	0x00002000	/* decrementer can cause exit */
 #define   LPCR_PECE2	0x00001000	/* machine check etc can cause exit */
 #define   LPCR_MER	0x00000800	/* Mediated External Exception */
+#define   LPCR_MER_SH	11
 #define   LPCR_LPES	0x0000000c
 #define   LPCR_LPES0	0x00000008	/* LPAR Env selector 0 */
 #define   LPCR_LPES1	0x00000004	/* LPAR Env selector 1 */
+94
arch/powerpc/include/uapi/asm/kvm.h
···
 /* Select powerpc specific features in <linux/kvm.h> */
 #define __KVM_HAVE_SPAPR_TCE
 #define __KVM_HAVE_PPC_SMT
+#define __KVM_HAVE_IRQCHIP
+#define __KVM_HAVE_IRQ_LINE

 struct kvm_regs {
	__u64 pc;
···

 /* for KVM_SET_GUEST_DEBUG */
 struct kvm_guest_debug_arch {
+	struct {
+		/* H/W breakpoint/watchpoint address */
+		__u64 addr;
+		/*
+		 * Type denotes h/w breakpoint, read watchpoint, write
+		 * watchpoint or watchpoint (both read and write).
+		 */
+#define KVMPPC_DEBUG_NONE		0x0
+#define KVMPPC_DEBUG_BREAKPOINT		(1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE	(1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ		(1UL << 3)
+		__u32 type;
+		__u32 reserved;
+	} bp[16];
 };
+
+/* Debug related defines */
+/*
+ * kvm_guest_debug->control is a 32 bit field. The lower 16 bits are generic
+ * and upper 16 bits are architecture specific. Architecture specific defines
+ * that ioctl is for setting hardware breakpoint or software breakpoint.
+ */
+#define KVM_GUESTDBG_USE_SW_BP		0x00010000
+#define KVM_GUESTDBG_USE_HW_BP		0x00020000

 /* definition of registers in kvm_run */
 struct kvm_sync_regs {
···
 /* for KVM_ALLOCATE_RMA */
 struct kvm_allocate_rma {
	__u64 rma_size;
+};
+
+/* for KVM_CAP_PPC_RTAS */
+struct kvm_rtas_token_args {
+	char name[120];
+	__u64 token;	/* Use a token of 0 to undefine a mapping */
 };

 struct kvm_book3e_206_tlb_entry {
···
	__u16 n_invalid;
 };

+/* Per-vcpu XICS interrupt controller state */
+#define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
+
+#define  KVM_REG_PPC_ICP_CPPR_SHIFT	56	/* current proc priority */
+#define  KVM_REG_PPC_ICP_CPPR_MASK	0xff
+#define  KVM_REG_PPC_ICP_XISR_SHIFT	32	/* interrupt status field */
+#define  KVM_REG_PPC_ICP_XISR_MASK	0xffffff
+#define  KVM_REG_PPC_ICP_MFRR_SHIFT	24	/* pending IPI priority */
+#define  KVM_REG_PPC_ICP_MFRR_MASK	0xff
+#define  KVM_REG_PPC_ICP_PPRI_SHIFT	16	/* pending irq priority */
+#define  KVM_REG_PPC_ICP_PPRI_MASK	0xff
+
+/* Device control API: PPC-specific devices */
+#define KVM_DEV_MPIC_GRP_MISC		1
+#define   KVM_DEV_MPIC_BASE_ADDR	0	/* 64-bit */
+
+#define KVM_DEV_MPIC_GRP_REGISTER	2	/* 32-bit */
+#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE	3	/* 32-bit */
+
+/* One-Reg API: PPC-specific registers */
 #define KVM_REG_PPC_HIOR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
 #define KVM_REG_PPC_IAC1	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
 #define KVM_REG_PPC_IAC2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
···

 #define KVM_REG_PPC_EPCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
 #define KVM_REG_PPC_EPR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
+
+/* Timer Status Register OR/CLEAR interface */
+#define KVM_REG_PPC_OR_TSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87)
+#define KVM_REG_PPC_CLEAR_TSR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88)
+#define KVM_REG_PPC_TCR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89)
+#define KVM_REG_PPC_TSR		(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a)
+
+/* Debugging: Special instruction for software breakpoint */
+#define KVM_REG_PPC_DEBUG_INST	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b)
+
+/* MMU registers */
+#define KVM_REG_PPC_MAS0	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8c)
+#define KVM_REG_PPC_MAS1	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8d)
+#define KVM_REG_PPC_MAS2	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8e)
+#define KVM_REG_PPC_MAS7_3	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8f)
+#define KVM_REG_PPC_MAS4	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x90)
+#define KVM_REG_PPC_MAS6	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x91)
+#define KVM_REG_PPC_MMUCFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x92)
+/*
+ * TLBnCFG fields TLBnCFG_N_ENTRY and TLBnCFG_ASSOC can be changed only using
+ * KVM_CAP_SW_TLB ioctl
+ */
+#define KVM_REG_PPC_TLB0CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x93)
+#define KVM_REG_PPC_TLB1CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94)
+#define KVM_REG_PPC_TLB2CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95)
+#define KVM_REG_PPC_TLB3CFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96)
+#define KVM_REG_PPC_TLB0PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x97)
+#define KVM_REG_PPC_TLB1PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98)
+#define KVM_REG_PPC_TLB2PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99)
+#define KVM_REG_PPC_TLB3PS	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
+#define KVM_REG_PPC_EPTCFG	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)
+
+/* PPC64 eXternal Interrupt Controller Specification */
+#define KVM_DEV_XICS_GRP_SOURCES	1	/* 64-bit source attributes */
+
+/* Layout of 64-bit source attribute values */
+#define  KVM_XICS_DESTINATION_SHIFT	0
+#define  KVM_XICS_DESTINATION_MASK	0xffffffffULL
+#define  KVM_XICS_PRIORITY_SHIFT	32
+#define  KVM_XICS_PRIORITY_MASK		0xff
+#define  KVM_XICS_LEVEL_SENSITIVE	(1ULL << 40)
+#define  KVM_XICS_MASKED		(1ULL << 41)
+#define  KVM_XICS_PENDING		(1ULL << 42)

 #endif /* __LINUX_KVM_POWERPC_H */
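The new `KVM_REG_PPC_ICP_STATE` one-reg packs the presentation-controller state (CPPR, XISR, MFRR, pending priority) into a single 64-bit value using the shift/mask defines above. A small sketch of how userspace might pack and unpack such a value (helper names are illustrative; the shift/mask constants mirror the defines in the diff):

```c
#include <assert.h>
#include <stdint.h>

/* Field layout of the KVM_REG_PPC_ICP_STATE value, per the defines above. */
#define ICP_CPPR_SHIFT	56
#define ICP_CPPR_MASK	0xffULL
#define ICP_XISR_SHIFT	32
#define ICP_XISR_MASK	0xffffffULL
#define ICP_MFRR_SHIFT	24
#define ICP_MFRR_MASK	0xffULL

/* Pack current proc priority, interrupt status and IPI priority. */
static uint64_t icp_pack(uint8_t cppr, uint32_t xisr, uint8_t mfrr)
{
	return ((uint64_t)cppr << ICP_CPPR_SHIFT) |
	       (((uint64_t)xisr & ICP_XISR_MASK) << ICP_XISR_SHIFT) |
	       ((uint64_t)mfrr << ICP_MFRR_SHIFT);
}

static uint8_t icp_cppr(uint64_t s) { return (s >> ICP_CPPR_SHIFT) & ICP_CPPR_MASK; }
static uint32_t icp_xisr(uint64_t s) { return (s >> ICP_XISR_SHIFT) & ICP_XISR_MASK; }
static uint8_t icp_mfrr(uint64_t s) { return (s >> ICP_MFRR_SHIFT) & ICP_MFRR_MASK; }
```

A migration tool would read this value with `KVM_GET_ONE_REG` on the source and write it back with `KVM_SET_ONE_REG` on the destination.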
+4
arch/powerpc/kernel/asm-offsets.c
···
	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
+	DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
···
	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
+	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
	HSTATE_FIELD(HSTATE_PMC, host_pmc);
	HSTATE_FIELD(HSTATE_PURR, host_purr);
···
	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+	DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save));
 #endif /* CONFIG_PPC_BOOK3S */
 #endif /* CONFIG_KVM */
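Each `DEFINE()` in asm-offsets.c boils down to `offsetof()`: the file is compiled once so the assembler sources (here, the HV entry/exit code that stores `arch.vpa.dirty` and reads `saved_xirr`/`host_ipi`) can address C struct members by a named constant instead of a hand-counted byte offset. A toy illustration of the idea, with a hypothetical miniature struct standing in for `kvm_vcpu`:

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical miniature of a struct addressed from assembly. */
struct vpa_shadow {
	void *pinned_addr;
	void *pinned_end;
	unsigned long next_gpa;
	unsigned char dirty;
};

/* Assembly like "stb r0, VPA_DIRTY(r3)" relies on this constant
 * matching the C layout exactly; asm-offsets emits it at build time. */
#define VPA_DIRTY_OFFSET offsetof(struct vpa_shadow, dirty)
```

Because the constant is regenerated on every build, the assembly stays correct even when fields are inserted, as this commit does with `saved_xirr` and `host_ipi` in the host state.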
+12
arch/powerpc/kvm/44x.c
···
	return kvmppc_set_sregs_ivor(vcpu, sregs);
 }

+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+		       union kvmppc_one_reg *val)
+{
+	return -EINVAL;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+		       union kvmppc_one_reg *val)
+{
+	return -EINVAL;
+}
+
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
	struct kvmppc_vcpu_44x *vcpu_44x;
+23 -3
arch/powerpc/kvm/Kconfig
···
	  If unsure, say N.

 config KVM_E500MC
-	bool "KVM support for PowerPC E500MC/E5500 processors"
+	bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
	depends on PPC_E500MC
	select KVM
	select KVM_MMIO
	select KVM_BOOKE_HV
	select MMU_NOTIFIER
	---help---
-	  Support running unmodified E500MC/E5500 (32-bit) guest kernels in
-	  virtual machines on E500MC/E5500 host processors.
+	  Support running unmodified E500MC/E5500/E6500 guest kernels in
+	  virtual machines on E500MC/E5500/E6500 host processors.

	  This module provides access to the hardware capabilities through
	  a character device node named /dev/kvm.

	  If unsure, say N.
+
+config KVM_MPIC
+	bool "KVM in-kernel MPIC emulation"
+	depends on KVM && E500
+	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQ_ROUTING
+	select HAVE_KVM_MSI
+	help
+	  Enable support for emulating MPIC devices inside the
+	  host kernel, rather than relying on userspace to emulate.
+	  Currently, support is limited to certain versions of
+	  Freescale's MPIC implementation.
+
+config KVM_XICS
+	bool "KVM in-kernel XICS emulation"
+	depends on KVM_BOOK3S_64 && !KVM_MPIC
+	---help---
+	  Include support for the XICS (eXternal Interrupt Controller
+	  Specification) interrupt controller architecture used on
+	  IBM POWER (pSeries) servers.

 source drivers/vhost/Kconfig
+11 -1
arch/powerpc/kvm/Makefile
···
	book3s_hv.o \
	book3s_hv_interrupts.o \
	book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
+	book3s_hv_rm_xics.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
	book3s_hv_rmhandlers.o \
	book3s_hv_rm_mmu.o \
	book3s_64_vio_hv.o \
	book3s_hv_ras.o \
-	book3s_hv_builtin.o
+	book3s_hv_builtin.o \
+	$(kvm-book3s_64-builtin-xics-objs-y)
+
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+	book3s_xics.o

 kvm-book3s_64-module-objs := \
	../../../virt/kvm/kvm_main.o \
···
	emulate.o \
	book3s.o \
	book3s_64_vio.o \
+	book3s_rtas.o \
	$(kvm-book3s_64-objs-y)

 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
···
	book3s_32_mmu_host.o \
	book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
+
+kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
+kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o)

 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
+33 -3
arch/powerpc/kvm/book3s.c
···
	return prio;
 }

-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
					  unsigned int vec)
 {
	unsigned long old_pending = vcpu->arch.pending_exceptions;
···
	kvmppc_book3s_queue_irqprio(vcpu, vec);
 }

-void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-				  struct kvm_interrupt *irq)
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
···
		val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
		break;
 #endif /* CONFIG_ALTIVEC */
+	case KVM_REG_PPC_DEBUG_INST: {
+		u32 opcode = INS_TW;
+		r = copy_to_user((u32 __user *)(long)reg->addr,
+				 &opcode, sizeof(u32));
+		break;
+	}
+#ifdef CONFIG_KVM_XICS
+	case KVM_REG_PPC_ICP_STATE:
+		if (!vcpu->arch.icp) {
+			r = -ENXIO;
+			break;
+		}
+		val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu));
+		break;
+#endif /* CONFIG_KVM_XICS */
	default:
		r = -EINVAL;
		break;
···
		vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
		break;
 #endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_KVM_XICS
+	case KVM_REG_PPC_ICP_STATE:
+		if (!vcpu->arch.icp) {
+			r = -ENXIO;
+			break;
+		}
+		r = kvmppc_xics_set_icp(vcpu,
+					set_reg_val(reg->id, val));
+		break;
+#endif /* CONFIG_KVM_XICS */
	default:
		r = -EINVAL;
		break;
···
			  struct kvm_translation *tr)
 {
	return 0;
 }
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
+{
+	return -EINVAL;
+}

 void kvmppc_decrementer_func(unsigned long data)
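`KVM_REG_PPC_DEBUG_INST` above hands userspace `INS_TW` (0x7fe00008), the instruction a debugger should patch in as a software breakpoint. That value is the `tw` trap with TO = 31, which traps unconditionally. A sketch of the encoding, building it from the Power ISA fields (primary opcode 31, extended opcode 4):

```c
#include <assert.h>
#include <stdint.h>

/* Build a PowerPC "tw TO,RA,RB" encoding: primary opcode 31, XO 4.
 * With TO = 31 (all five trap conditions set) and RA = RB = 0 this is
 * the unconditional trap used as the software breakpoint instruction. */
static uint32_t ppc_tw(unsigned to, unsigned ra, unsigned rb)
{
	return (31u << 26) | (to << 21) | (ra << 16) | (rb << 11) | (4u << 1);
}
```

Encoding the breakpoint this way (rather than hard-coding it in userspace) lets each platform report whichever instruction its trap handling expects; BookE, for instance, exposes `KVMPPC_INST_EHPRIV` instead.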
+102 -18
arch/powerpc/kvm/book3s_64_mmu_hv.c
···
		/* Harvest R and C */
		rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
		*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
-		rev[i].guest_rpte = ptel | rcbits;
+		if (rcbits & ~rev[i].guest_rpte) {
+			rev[i].guest_rpte = ptel | rcbits;
+			note_hpte_modification(kvm, &rev[i]);
+		}
	}
	unlock_rmap(rmapp);
	hptep[0] &= ~HPTE_V_HVLOCK;
···
	/* Now check and modify the HPTE */
	if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
		kvmppc_clear_ref_hpte(kvm, hptep, i);
-		rev[i].guest_rpte |= HPTE_R_R;
+		if (!(rev[i].guest_rpte & HPTE_R_R)) {
+			rev[i].guest_rpte |= HPTE_R_R;
+			note_hpte_modification(kvm, &rev[i]);
+		}
		ret = 1;
	}
	hptep[0] &= ~HPTE_V_HVLOCK;
···
		hptep[1] &= ~HPTE_R_C;
		eieio();
		hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
-		rev[i].guest_rpte |= HPTE_R_C;
+		if (!(rev[i].guest_rpte & HPTE_R_C)) {
+			rev[i].guest_rpte |= HPTE_R_C;
+			note_hpte_modification(kvm, &rev[i]);
+		}
		ret = 1;
	}
	hptep[0] &= ~HPTE_V_HVLOCK;
···
	return ret;
 }

+static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+			      struct kvm_memory_slot *memslot,
+			      unsigned long *map)
+{
+	unsigned long gfn;
+
+	if (!vpa->dirty || !vpa->pinned_addr)
+		return;
+	gfn = vpa->gpa >> PAGE_SHIFT;
+	if (gfn < memslot->base_gfn ||
+	    gfn >= memslot->base_gfn + memslot->npages)
+		return;
+
+	vpa->dirty = false;
+	if (map)
+		__set_bit_le(gfn - memslot->base_gfn, map);
+}
+
 long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
			     unsigned long *map)
 {
	unsigned long i;
	unsigned long *rmapp;
+	struct kvm_vcpu *vcpu;

	preempt_disable();
	rmapp = memslot->arch.rmap;
···
		if (kvm_test_clear_dirty(kvm, rmapp) && map)
			__set_bit_le(i, map);
		++rmapp;
	}
+
+	/* Harvest dirty bits from VPA and DTL updates */
+	/* Note: we never modify the SLB shadow buffer areas */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
+		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+	}
	preempt_enable();
	return 0;
···
	unsigned long gfn = gpa >> PAGE_SHIFT;
	struct page *page, *pages[1];
	int npages;
-	unsigned long hva, psize, offset;
+	unsigned long hva, offset;
	unsigned long pa;
	unsigned long *physp;
	int srcu_idx;
···
	}
	srcu_read_unlock(&kvm->srcu, srcu_idx);

-	psize = PAGE_SIZE;
-	if (PageHuge(page)) {
-		page = compound_head(page);
-		psize <<= compound_order(page);
-	}
-	offset = gpa & (psize - 1);
+	offset = gpa & (PAGE_SIZE - 1);
	if (nb_ret)
-		*nb_ret = psize - offset;
+		*nb_ret = PAGE_SIZE - offset;
	return page_address(page) + offset;

 err:
···
	return NULL;
 }

-void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
+			     bool dirty)
 {
	struct page *page = virt_to_page(va);
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+	unsigned long *rmap;
+	int srcu_idx;

	put_page(page);
+
+	if (!dirty || !kvm->arch.using_mmu_notifiers)
+		return;
+
+	/* We need to mark this page dirty in the rmap chain */
+	gfn = gpa >> PAGE_SHIFT;
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot) {
+		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+		lock_rmap(rmap);
+		*rmap |= KVMPPC_RMAP_CHANGED;
+		unlock_rmap(rmap);
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }

 /*
···

 #define HPTE_SIZE	(2 * sizeof(unsigned long))

+/*
+ * Returns 1 if this HPT entry has been modified or has pending
+ * R/C bit changes.
+ */
+static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp)
+{
+	unsigned long rcbits_unset;
+
+	if (revp->guest_rpte & HPTE_GR_MODIFIED)
+		return 1;
+
+	/* Also need to consider changes in reference and changed bits */
+	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+	if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
+		return 1;
+
+	return 0;
+}
+
 static long record_hpte(unsigned long flags, unsigned long *hptp,
			unsigned long *hpte, struct revmap_entry *revp,
			int want_valid, int first_pass)
 {
	unsigned long v, r;
+	unsigned long rcbits_unset;
	int ok = 1;
	int valid, dirty;

	/* Unmodified entries are uninteresting except on the first pass */
-	dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+	dirty = hpte_dirty(revp, hptp);
	if (!first_pass && !dirty)
		return 0;
···
	while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
		cpu_relax();
	v = hptp[0];
+
+	/* re-evaluate valid and dirty from synchronized HPTE value */
+	valid = !!(v & HPTE_V_VALID);
+	dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+	/* Harvest R and C into guest view if necessary */
+	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+	if (valid && (rcbits_unset & hptp[1])) {
+		revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) |
+			HPTE_GR_MODIFIED;
+		dirty = 1;
+	}
+
	if (v & HPTE_V_ABSENT) {
		v &= ~HPTE_V_ABSENT;
		v |= HPTE_V_VALID;
+		valid = 1;
	}
-	/* re-evaluate valid and dirty from synchronized HPTE value */
-	valid = !!(v & HPTE_V_VALID);
	if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
		valid = 0;
-	r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
-	dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+	r = revp->guest_rpte;
	/* only clear modified if this is the right sort of entry */
	if (valid == want_valid && dirty) {
		r &= ~HPTE_GR_MODIFIED;
···
	/* Skip uninteresting entries, i.e. clean on not-first pass */
	if (!first_pass) {
		while (i < kvm->arch.hpt_npte &&
-		       !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
+		       !hpte_dirty(revp, hptp)) {
			++i;
			hptp += 2;
			++revp;
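The new `hpte_dirty()` predicate treats an entry as dirty either when it was explicitly flagged (`HPTE_GR_MODIFIED`) or when hardware has set R/C bits in the live HPTE that the guest-view copy has not yet absorbed. A standalone sketch of that decision, with bit values as stand-ins (`HPTE_R_R`/`HPTE_R_C` match the architected word-1 bits; `HPTE_GR_MODIFIED` is a software bit whose exact position here is illustrative):

```c
#include <assert.h>

#define HPTE_V_VALID		(1UL << 0)	/* word 0: entry valid */
#define HPTE_R_C		(1UL << 7)	/* word 1: changed */
#define HPTE_R_R		(1UL << 8)	/* word 1: referenced */
#define HPTE_GR_MODIFIED	(1UL << 62)	/* software: needs resend */

/* Dirty if flagged modified, or if a valid HPTE carries R/C bits
 * not yet reflected in the guest-visible copy (guest_rpte). */
static int hpte_dirty(unsigned long guest_rpte, const unsigned long *hptp)
{
	unsigned long rcbits_unset;

	if (guest_rpte & HPTE_GR_MODIFIED)
		return 1;

	rcbits_unset = ~guest_rpte & (HPTE_R_R | HPTE_R_C);
	if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
		return 1;

	return 0;
}
```

Masking with `~guest_rpte` is what lets migration skip entries whose R/C state was already transmitted: only bits newly set by hardware make the entry interesting again.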
+3 -1
arch/powerpc/kvm/book3s_emulate.c
···
		run->papr_hcall.args[i] = gpr;
	}

-	emulated = EMULATE_DO_PAPR;
+	run->exit_reason = KVM_EXIT_PAPR_HCALL;
+	vcpu->arch.hcall_needed = 1;
+	emulated = EMULATE_EXIT_USER;
	break;
 }
 #endif
+76 -16
arch/powerpc/kvm/book3s_hv.c
···
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);

+void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int me;
+	int cpu = vcpu->cpu;
+	wait_queue_head_t *wqp;
+
+	wqp = kvm_arch_vcpu_wq(vcpu);
+	if (waitqueue_active(wqp)) {
+		wake_up_interruptible(wqp);
+		++vcpu->stat.halt_wakeup;
+	}
+
+	me = get_cpu();
+
+	/* CPU points to the first thread of the core */
+	if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
+		int real_cpu = cpu + vcpu->arch.ptid;
+		if (paca[real_cpu].kvm_hstate.xics_phys)
+			xics_wake_cpu(real_cpu);
+		else if (cpu_online(cpu))
+			smp_send_reschedule(cpu);
+	}
+	put_cpu();
+}
+
 /*
  * We use the vcpu_load/put functions to measure stolen time.
  * Stolen time is counted as time when either the vcpu is able to
···
		len = ((struct reg_vpa *)va)->length.hword;
	else
		len = ((struct reg_vpa *)va)->length.word;
-	kvmppc_unpin_guest_page(kvm, va);
+	kvmppc_unpin_guest_page(kvm, va, vpa, false);

	/* Check length */
	if (len > nb || len < sizeof(struct reg_vpa))
···
		va = NULL;
		nb = 0;
		if (gpa)
-			va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
+			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
		spin_lock(&vcpu->arch.vpa_update_lock);
		if (gpa == vpap->next_gpa)
			break;
		/* sigh... unpin that one and try again */
		if (va)
-			kvmppc_unpin_guest_page(kvm, va);
+			kvmppc_unpin_guest_page(kvm, va, gpa, false);
	}

	vpap->update_pending = 0;
···
		 * has changed the mappings underlying guest memory,
		 * so unregister the region.
		 */
-		kvmppc_unpin_guest_page(kvm, va);
+		kvmppc_unpin_guest_page(kvm, va, gpa, false);
		va = NULL;
	}
	if (vpap->pinned_addr)
-		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr);
+		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
+					vpap->dirty);
+	vpap->gpa = gpa;
	vpap->pinned_addr = va;
+	vpap->dirty = false;
	if (va)
		vpap->pinned_end = va + vpap->len;
 }
···
	/* order writing *dt vs. writing vpa->dtl_idx */
	smp_wmb();
	vpa->dtl_idx = ++vcpu->arch.dtl_index;
+	vcpu->arch.dtl.dirty = true;
 }

 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
···
	unsigned long req = kvmppc_get_gpr(vcpu, 3);
	unsigned long target, ret = H_SUCCESS;
	struct kvm_vcpu *tvcpu;
-	int idx;
+	int idx, rc;

	switch (req) {
	case H_ENTER:
···
					kvmppc_get_gpr(vcpu, 5),
					kvmppc_get_gpr(vcpu, 6));
		break;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			return RESUME_HOST;
+
+		rc = kvmppc_rtas_hcall(vcpu);
+
+		if (rc == -ENOENT)
+			return RESUME_HOST;
+		else if (rc == 0)
+			break;
+
+		/* Send the error out to userspace via KVM_RUN */
+		return rc;
+
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+		if (kvmppc_xics_enabled(vcpu)) {
+			ret = kvmppc_xics_hcall(vcpu, req);
+			break;
+		} /* fallthrough */
	default:
		return RESUME_HOST;
	}
···
	return ERR_PTR(err);
 }

+static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
+{
+	if (vpa->pinned_addr)
+		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
+					vpa->dirty);
+}
+
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
	spin_lock(&vcpu->arch.vpa_update_lock);
-	if (vcpu->arch.dtl.pinned_addr)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr);
-	if (vcpu->arch.slb_shadow.pinned_addr)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr);
-	if (vcpu->arch.vpa.pinned_addr)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
	spin_unlock(&vcpu->arch.vpa_update_lock);
	kvm_vcpu_uninit(vcpu);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
···
 }

 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-extern void xics_wake_cpu(int cpu);

 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
				   struct kvm_vcpu *vcpu)
···
			break;
		vc->runner = vcpu;
		n_ceded = 0;
-		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
			if (!v->arch.pending_exceptions)
				n_ceded += v->arch.ceded;
+			else
+				v->arch.ceded = 0;
+		}
		if (n_ceded == vc->n_runnable)
			kvmppc_vcore_blocked(vc);
		else
···

 void kvmppc_core_commit_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old)
+				const struct kvm_memory_slot *old)
 {
	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot;

-	if (npages && old.npages) {
+	if (npages && old->npages) {
		/*
		 * If modifying a memslot, reset all the rmap dirty bits.
1713 1656 * If this is a new memslot, we don't need to do anything ··· 1884 1827 cpumask_setall(&kvm->arch.need_tlb_flush); 1885 1828 1886 1829 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1830 + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); 1887 1831 1888 1832 kvm->arch.rma = NULL; 1889 1833 ··· 1929 1871 kvm_release_rma(kvm->arch.rma); 1930 1872 kvm->arch.rma = NULL; 1931 1873 } 1874 + 1875 + kvmppc_rtas_tokens_free(kvm); 1932 1876 1933 1877 kvmppc_free_hpt(kvm); 1934 1878 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
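The book3s_hv.c hunk above adds kvmppc_fast_vcpu_kick(): wake the vcpu through its wait queue if it is sleeping, otherwise IPI the hardware thread (first thread of the core plus the vcpu's ptid) it is loaded on. A rough, self-contained sketch of that decision, with invented types in place of the kernel's (and the wake/IPI outcomes collapsed into a single result for clarity):

```c
#include <assert.h>
#include <stdbool.h>

/* Illustrative stand-ins for the state kvmppc_fast_vcpu_kick() consults;
 * the field names are ours, not the kernel's. */
struct vcpu_model {
	int cpu;	/* first thread of the core the vcpu runs on, or -1 */
	int ptid;	/* thread offset within that core */
	bool waiting;	/* parked on its wait queue? */
};

enum kick_action { KICK_NONE, KICK_WAKE, KICK_IPI };

/* Wake a sleeping vcpu; IPI one that is loaded on another core; otherwise
 * there is nothing to do (it is either current or not loaded anywhere). */
static enum kick_action fast_kick(const struct vcpu_model *v,
				  int me, int nr_cpu_ids, int *target)
{
	if (v->waiting)
		return KICK_WAKE;	/* wake_up_interruptible() path */
	if (v->cpu != me && v->cpu >= 0 && v->cpu < nr_cpu_ids) {
		*target = v->cpu + v->ptid;	/* real hw thread to poke */
		return KICK_IPI;
	}
	return KICK_NONE;
}
```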
-11
arch/powerpc/kvm/book3s_hv_rm_mmu.c
···
97 97 }
98 98 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
99 99 
100 - /*
101 -  * Note modification of an HPTE; set the HPTE modified bit
102 -  * if anyone is interested.
103 -  */
104 - static inline void note_hpte_modification(struct kvm *kvm,
105 - 					  struct revmap_entry *rev)
106 - {
107 - 	if (atomic_read(&kvm->arch.hpte_mod_interest))
108 - 		rev->guest_rpte |= HPTE_GR_MODIFIED;
109 - }
110 - 
111 100 /* Remove this HPTE from the chain for a real page */
112 101 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
113 102 				struct revmap_entry *rev,
+406
arch/powerpc/kvm/book3s_hv_rm_xics.c
··· 1 + /* 2 + * Copyright 2012 Michael Ellerman, IBM Corporation. 3 + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License, version 2, as 7 + * published by the Free Software Foundation. 8 + */ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/kvm_host.h> 12 + #include <linux/err.h> 13 + 14 + #include <asm/kvm_book3s.h> 15 + #include <asm/kvm_ppc.h> 16 + #include <asm/hvcall.h> 17 + #include <asm/xics.h> 18 + #include <asm/debug.h> 19 + #include <asm/synch.h> 20 + #include <asm/ppc-opcode.h> 21 + 22 + #include "book3s_xics.h" 23 + 24 + #define DEBUG_PASSUP 25 + 26 + static inline void rm_writeb(unsigned long paddr, u8 val) 27 + { 28 + __asm__ __volatile__("sync; stbcix %0,0,%1" 29 + : : "r" (val), "r" (paddr) : "memory"); 30 + } 31 + 32 + static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, 33 + struct kvm_vcpu *this_vcpu) 34 + { 35 + struct kvmppc_icp *this_icp = this_vcpu->arch.icp; 36 + unsigned long xics_phys; 37 + int cpu; 38 + 39 + /* Mark the target VCPU as having an interrupt pending */ 40 + vcpu->stat.queue_intr++; 41 + set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); 42 + 43 + /* Kick self ? 
Just set MER and return */ 44 + if (vcpu == this_vcpu) { 45 + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER); 46 + return; 47 + } 48 + 49 + /* Check if the core is loaded, if not, too hard */ 50 + cpu = vcpu->cpu; 51 + if (cpu < 0 || cpu >= nr_cpu_ids) { 52 + this_icp->rm_action |= XICS_RM_KICK_VCPU; 53 + this_icp->rm_kick_target = vcpu; 54 + return; 55 + } 56 + /* In SMT cpu will always point to thread 0, we adjust it */ 57 + cpu += vcpu->arch.ptid; 58 + 59 + /* Not too hard, then poke the target */ 60 + xics_phys = paca[cpu].kvm_hstate.xics_phys; 61 + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); 62 + } 63 + 64 + static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) 65 + { 66 + /* Note: Only called on self ! */ 67 + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 68 + &vcpu->arch.pending_exceptions); 69 + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); 70 + } 71 + 72 + static inline bool icp_rm_try_update(struct kvmppc_icp *icp, 73 + union kvmppc_icp_state old, 74 + union kvmppc_icp_state new) 75 + { 76 + struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu; 77 + bool success; 78 + 79 + /* Calculate new output value */ 80 + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); 81 + 82 + /* Attempt atomic update */ 83 + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; 84 + if (!success) 85 + goto bail; 86 + 87 + /* 88 + * Check for output state update 89 + * 90 + * Note that this is racy since another processor could be updating 91 + * the state already. This is why we never clear the interrupt output 92 + * here, we only ever set it. The clear only happens prior to doing 93 + * an update and only by the processor itself. Currently we do it 94 + * in Accept (H_XIRR) and Up_Cppr (H_XPPR). 95 + * 96 + * We also do not try to figure out whether the EE state has changed, 97 + * we unconditionally set it if the new state calls for it. 
The reason 98 + * for that is that we opportunistically remove the pending interrupt 99 + * flag when raising CPPR, so we need to set it back here if an 100 + * interrupt is still pending. 101 + */ 102 + if (new.out_ee) 103 + icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu); 104 + 105 + /* Expose the state change for debug purposes */ 106 + this_vcpu->arch.icp->rm_dbgstate = new; 107 + this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu; 108 + 109 + bail: 110 + return success; 111 + } 112 + 113 + static inline int check_too_hard(struct kvmppc_xics *xics, 114 + struct kvmppc_icp *icp) 115 + { 116 + return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS; 117 + } 118 + 119 + static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 120 + u8 new_cppr) 121 + { 122 + union kvmppc_icp_state old_state, new_state; 123 + bool resend; 124 + 125 + /* 126 + * This handles several related states in one operation: 127 + * 128 + * ICP State: Down_CPPR 129 + * 130 + * Load CPPR with new value and if the XISR is 0 131 + * then check for resends: 132 + * 133 + * ICP State: Resend 134 + * 135 + * If MFRR is more favored than CPPR, check for IPIs 136 + * and notify ICS of a potential resend. This is done 137 + * asynchronously (when used in real mode, we will have 138 + * to exit here). 139 + * 140 + * We do not handle the complete Check_IPI as documented 141 + * here. In the PAPR, this state will be used for both 142 + * Set_MFRR and Down_CPPR. However, we know that we aren't 143 + * changing the MFRR state here so we don't need to handle 144 + * the case of an MFRR causing a reject of a pending irq, 145 + * this will have been handled when the MFRR was set in the 146 + * first place. 147 + * 148 + * Thus we don't have to handle rejects, only resends. 149 + * 150 + * When implementing real mode for HV KVM, resend will lead to 151 + * a H_TOO_HARD return and the whole transaction will be handled 152 + * in virtual mode. 
153 + */ 154 + do { 155 + old_state = new_state = ACCESS_ONCE(icp->state); 156 + 157 + /* Down_CPPR */ 158 + new_state.cppr = new_cppr; 159 + 160 + /* 161 + * Cut down Resend / Check_IPI / IPI 162 + * 163 + * The logic is that we cannot have a pending interrupt 164 + * trumped by an IPI at this point (see above), so we 165 + * know that either the pending interrupt is already an 166 + * IPI (in which case we don't care to override it) or 167 + * it's either more favored than us or non existent 168 + */ 169 + if (new_state.mfrr < new_cppr && 170 + new_state.mfrr <= new_state.pending_pri) { 171 + new_state.pending_pri = new_state.mfrr; 172 + new_state.xisr = XICS_IPI; 173 + } 174 + 175 + /* Latch/clear resend bit */ 176 + resend = new_state.need_resend; 177 + new_state.need_resend = 0; 178 + 179 + } while (!icp_rm_try_update(icp, old_state, new_state)); 180 + 181 + /* 182 + * Now handle resend checks. Those are asynchronous to the ICP 183 + * state update in HW (ie bus transactions) so we can handle them 184 + * separately here as well. 
185 + */ 186 + if (resend) 187 + icp->rm_action |= XICS_RM_CHECK_RESEND; 188 + } 189 + 190 + 191 + unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) 192 + { 193 + union kvmppc_icp_state old_state, new_state; 194 + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 195 + struct kvmppc_icp *icp = vcpu->arch.icp; 196 + u32 xirr; 197 + 198 + if (!xics || !xics->real_mode) 199 + return H_TOO_HARD; 200 + 201 + /* First clear the interrupt */ 202 + icp_rm_clr_vcpu_irq(icp->vcpu); 203 + 204 + /* 205 + * ICP State: Accept_Interrupt 206 + * 207 + * Return the pending interrupt (if any) along with the 208 + * current CPPR, then clear the XISR & set CPPR to the 209 + * pending priority 210 + */ 211 + do { 212 + old_state = new_state = ACCESS_ONCE(icp->state); 213 + 214 + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); 215 + if (!old_state.xisr) 216 + break; 217 + new_state.cppr = new_state.pending_pri; 218 + new_state.pending_pri = 0xff; 219 + new_state.xisr = 0; 220 + 221 + } while (!icp_rm_try_update(icp, old_state, new_state)); 222 + 223 + /* Return the result in GPR4 */ 224 + vcpu->arch.gpr[4] = xirr; 225 + 226 + return check_too_hard(xics, icp); 227 + } 228 + 229 + int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, 230 + unsigned long mfrr) 231 + { 232 + union kvmppc_icp_state old_state, new_state; 233 + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 234 + struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp; 235 + u32 reject; 236 + bool resend; 237 + bool local; 238 + 239 + if (!xics || !xics->real_mode) 240 + return H_TOO_HARD; 241 + 242 + local = this_icp->server_num == server; 243 + if (local) 244 + icp = this_icp; 245 + else 246 + icp = kvmppc_xics_find_server(vcpu->kvm, server); 247 + if (!icp) 248 + return H_PARAMETER; 249 + 250 + /* 251 + * ICP state: Set_MFRR 252 + * 253 + * If the CPPR is more favored than the new MFRR, then 254 + * nothing needs to be done as there can be no XISR to 255 + * reject. 
256 + * 257 + * If the CPPR is less favored, then we might be replacing 258 + * an interrupt, and thus need to possibly reject it as in 259 + * 260 + * ICP state: Check_IPI 261 + */ 262 + do { 263 + old_state = new_state = ACCESS_ONCE(icp->state); 264 + 265 + /* Set_MFRR */ 266 + new_state.mfrr = mfrr; 267 + 268 + /* Check_IPI */ 269 + reject = 0; 270 + resend = false; 271 + if (mfrr < new_state.cppr) { 272 + /* Reject a pending interrupt if not an IPI */ 273 + if (mfrr <= new_state.pending_pri) 274 + reject = new_state.xisr; 275 + new_state.pending_pri = mfrr; 276 + new_state.xisr = XICS_IPI; 277 + } 278 + 279 + if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { 280 + resend = new_state.need_resend; 281 + new_state.need_resend = 0; 282 + } 283 + } while (!icp_rm_try_update(icp, old_state, new_state)); 284 + 285 + /* Pass rejects to virtual mode */ 286 + if (reject && reject != XICS_IPI) { 287 + this_icp->rm_action |= XICS_RM_REJECT; 288 + this_icp->rm_reject = reject; 289 + } 290 + 291 + /* Pass resends to virtual mode */ 292 + if (resend) 293 + this_icp->rm_action |= XICS_RM_CHECK_RESEND; 294 + 295 + return check_too_hard(xics, this_icp); 296 + } 297 + 298 + int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) 299 + { 300 + union kvmppc_icp_state old_state, new_state; 301 + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 302 + struct kvmppc_icp *icp = vcpu->arch.icp; 303 + u32 reject; 304 + 305 + if (!xics || !xics->real_mode) 306 + return H_TOO_HARD; 307 + 308 + /* 309 + * ICP State: Set_CPPR 310 + * 311 + * We can safely compare the new value with the current 312 + * value outside of the transaction as the CPPR is only 313 + * ever changed by the processor on itself 314 + */ 315 + if (cppr > icp->state.cppr) { 316 + icp_rm_down_cppr(xics, icp, cppr); 317 + goto bail; 318 + } else if (cppr == icp->state.cppr) 319 + return H_SUCCESS; 320 + 321 + /* 322 + * ICP State: Up_CPPR 323 + * 324 + * The processor is raising its priority, this can result 325 
+ * in a rejection of a pending interrupt: 326 + * 327 + * ICP State: Reject_Current 328 + * 329 + * We can remove EE from the current processor, the update 330 + * transaction will set it again if needed 331 + */ 332 + icp_rm_clr_vcpu_irq(icp->vcpu); 333 + 334 + do { 335 + old_state = new_state = ACCESS_ONCE(icp->state); 336 + 337 + reject = 0; 338 + new_state.cppr = cppr; 339 + 340 + if (cppr <= new_state.pending_pri) { 341 + reject = new_state.xisr; 342 + new_state.xisr = 0; 343 + new_state.pending_pri = 0xff; 344 + } 345 + 346 + } while (!icp_rm_try_update(icp, old_state, new_state)); 347 + 348 + /* Pass rejects to virtual mode */ 349 + if (reject && reject != XICS_IPI) { 350 + icp->rm_action |= XICS_RM_REJECT; 351 + icp->rm_reject = reject; 352 + } 353 + bail: 354 + return check_too_hard(xics, icp); 355 + } 356 + 357 + int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) 358 + { 359 + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 360 + struct kvmppc_icp *icp = vcpu->arch.icp; 361 + struct kvmppc_ics *ics; 362 + struct ics_irq_state *state; 363 + u32 irq = xirr & 0x00ffffff; 364 + u16 src; 365 + 366 + if (!xics || !xics->real_mode) 367 + return H_TOO_HARD; 368 + 369 + /* 370 + * ICP State: EOI 371 + * 372 + * Note: If EOI is incorrectly used by SW to lower the CPPR 373 + * value (ie more favored), we do not check for rejection of 374 + * a pending interrupt, this is a SW error and PAPR sepcifies 375 + * that we don't have to deal with it. 376 + * 377 + * The sending of an EOI to the ICS is handled after the 378 + * CPPR update 379 + * 380 + * ICP State: Down_CPPR which we handle 381 + * in a separate function as it's shared with H_CPPR. 382 + */ 383 + icp_rm_down_cppr(xics, icp, xirr >> 24); 384 + 385 + /* IPIs have no EOI */ 386 + if (irq == XICS_IPI) 387 + goto bail; 388 + /* 389 + * EOI handling: If the interrupt is still asserted, we need to 390 + * resend it. We can take a lockless "peek" at the ICS state here. 
391 + * 392 + * "Message" interrupts will never have "asserted" set 393 + */ 394 + ics = kvmppc_xics_find_ics(xics, irq, &src); 395 + if (!ics) 396 + goto bail; 397 + state = &ics->irq_state[src]; 398 + 399 + /* Still asserted, resend it, we make it look like a reject */ 400 + if (state->asserted) { 401 + icp->rm_action |= XICS_RM_REJECT; 402 + icp->rm_reject = irq; 403 + } 404 + bail: 405 + return check_too_hard(xics, icp); 406 + }
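The core trick in the new book3s_hv_rm_xics.c file above is the lockless ICP update: all presentation-controller state lives in one 64-bit word, handlers build a new value and publish it with a single cmpxchg, looping on failure. A minimal sketch of that pattern using C11 atomics (field layout and widths are illustrative, not the kernel's union kvmppc_icp_state):

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Whole ICP state packed into one 64-bit word so it can be swapped
 * atomically; this layout is invented for the sketch. */
union icp_state {
	uint64_t raw;
	struct {
		uint32_t xisr;		/* pending interrupt source, 0 = none */
		uint8_t  pending_pri;	/* priority of that interrupt */
		uint8_t  cppr;		/* current processor priority */
		uint8_t  mfrr;		/* IPI priority register */
		uint8_t  out_ee;	/* derived: should EE be raised? */
	};
};

static bool icp_try_update(_Atomic uint64_t *state,
			   union icp_state old_state, union icp_state new_state)
{
	/* Recompute the derived output bit, then publish the whole word with
	 * one compare-and-swap; exactly one winner per race, as in
	 * icp_rm_try_update() above. */
	new_state.out_ee = (new_state.xisr != 0 &&
			    new_state.pending_pri < new_state.cppr);
	return atomic_compare_exchange_strong(state, &old_state.raw,
					      new_state.raw);
}

/* Down_CPPR, after icp_rm_down_cppr(): lower CPPR and fold in a pending
 * IPI if the MFRR is now more favoured (numerically lower) than both. */
static void icp_down_cppr(_Atomic uint64_t *state, uint8_t new_cppr)
{
	union icp_state old_state, new_state;

	do {
		old_state.raw = new_state.raw = atomic_load(state);
		new_state.cppr = new_cppr;
		if (new_state.mfrr < new_cppr &&
		    new_state.mfrr <= new_state.pending_pri) {
			new_state.pending_pri = new_state.mfrr;
			new_state.xisr = 2;	/* XICS_IPI */
		}
	} while (!icp_try_update(state, old_state, new_state));
}
```

The loop re-reads the state on every iteration, which is why the real code never clears the interrupt output inside the update: a concurrent winner may already have set it.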
+161 -67
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 79 79 * * 80 80 *****************************************************************************/ 81 81 82 - #define XICS_XIRR 4 83 - #define XICS_QIRR 0xc 84 - #define XICS_IPI 2 /* interrupt source # for IPIs */ 85 - 86 82 /* 87 83 * We come in here when wakened from nap mode on a secondary hw thread. 88 84 * Relocation is off and most register values are lost. ··· 97 101 li r0,1 98 102 stb r0,PACA_NAPSTATELOST(r13) 99 103 100 - /* get vcpu pointer, NULL if we have no vcpu to run */ 101 - ld r4,HSTATE_KVM_VCPU(r13) 102 - cmpdi cr1,r4,0 104 + /* were we napping due to cede? */ 105 + lbz r0,HSTATE_NAPPING(r13) 106 + cmpwi r0,0 107 + bne kvm_end_cede 108 + 109 + /* 110 + * We weren't napping due to cede, so this must be a secondary 111 + * thread being woken up to run a guest, or being woken up due 112 + * to a stray IPI. (Or due to some machine check or hypervisor 113 + * maintenance interrupt while the core is in KVM.) 114 + */ 103 115 104 116 /* Check the wake reason in SRR1 to see why we got here */ 105 117 mfspr r3,SPRN_SRR1 106 118 rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ 107 119 cmpwi r3,4 /* was it an external interrupt? */ 108 - bne 27f 109 - 110 - /* 111 - * External interrupt - for now assume it is an IPI, since we 112 - * should never get any other interrupts sent to offline threads. 113 - * Only do this for secondary threads. 114 - */ 115 - beq cr1,25f 116 - lwz r3,VCPU_PTID(r4) 117 - cmpwi r3,0 118 - beq 27f 119 - 25: ld r5,HSTATE_XICS_PHYS(r13) 120 - li r0,0xff 121 - li r6,XICS_QIRR 122 - li r7,XICS_XIRR 120 + bne 27f /* if not */ 121 + ld r5,HSTATE_XICS_PHYS(r13) 122 + li r7,XICS_XIRR /* if it was an external interrupt, */ 123 123 lwzcix r8,r5,r7 /* get and ack the interrupt */ 124 124 sync 125 125 clrldi. r9,r8,40 /* get interrupt source ID. */ 126 - beq 27f /* none there? */ 127 - cmpwi r9,XICS_IPI 128 - bne 26f 126 + beq 28f /* none there? */ 127 + cmpwi r9,XICS_IPI /* was it an IPI? 
*/ 128 + bne 29f 129 + li r0,0xff 130 + li r6,XICS_MFRR 129 131 stbcix r0,r5,r6 /* clear IPI */ 130 - 26: stwcix r8,r5,r7 /* EOI the interrupt */ 132 + stwcix r8,r5,r7 /* EOI the interrupt */ 133 + sync /* order loading of vcpu after that */ 131 134 132 - 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ 133 - 134 - /* reload vcpu pointer after clearing the IPI */ 135 + /* get vcpu pointer, NULL if we have no vcpu to run */ 135 136 ld r4,HSTATE_KVM_VCPU(r13) 136 137 cmpdi r4,0 137 138 /* if we have no vcpu to run, go back to sleep */ 138 139 beq kvm_no_guest 140 + b kvmppc_hv_entry 139 141 140 - /* were we napping due to cede? */ 141 - lbz r0,HSTATE_NAPPING(r13) 142 - cmpwi r0,0 143 - bne kvm_end_cede 142 + 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ 143 + b kvm_no_guest 144 + 28: /* SRR1 said external but ICP said nope?? */ 145 + b kvm_no_guest 146 + 29: /* External non-IPI interrupt to offline secondary thread? help?? */ 147 + stw r8,HSTATE_SAVED_XIRR(r13) 148 + b kvm_no_guest 144 149 145 150 .global kvmppc_hv_entry 146 151 kvmppc_hv_entry: ··· 257 260 lwz r5, LPPACA_YIELDCOUNT(r3) 258 261 addi r5, r5, 1 259 262 stw r5, LPPACA_YIELDCOUNT(r3) 263 + li r6, 1 264 + stb r6, VCPU_VPA_DIRTY(r4) 260 265 25: 261 266 /* Load up DAR and DSISR */ 262 267 ld r5, VCPU_DAR(r4) ··· 484 485 mtctr r6 485 486 mtxer r7 486 487 488 + ld r10, VCPU_PC(r4) 489 + ld r11, VCPU_MSR(r4) 487 490 kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ 488 491 ld r6, VCPU_SRR0(r4) 489 492 ld r7, VCPU_SRR1(r4) 490 - ld r10, VCPU_PC(r4) 491 - ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ 492 493 494 + /* r11 = vcpu->arch.msr & ~MSR_HV */ 493 495 rldicl r11, r11, 63 - MSR_HV_LG, 1 494 496 rotldi r11, r11, 1 + MSR_HV_LG 495 497 ori r11, r11, MSR_ME 496 498 497 499 /* Check if we can deliver an external or decrementer interrupt now */ 498 500 ld r0,VCPU_PENDING_EXC(r4) 499 - li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) 500 - oris r8,r8,(1 << 
BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h 501 + lis r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h 501 502 and r0,r0,r8 502 503 cmpdi cr1,r0,0 503 504 andi. r0,r11,MSR_EE ··· 525 526 /* Move SRR0 and SRR1 into the respective regs */ 526 527 5: mtspr SPRN_SRR0, r6 527 528 mtspr SPRN_SRR1, r7 528 - li r0,0 529 - stb r0,VCPU_CEDED(r4) /* cancel cede */ 530 529 531 530 fast_guest_return: 531 + li r0,0 532 + stb r0,VCPU_CEDED(r4) /* cancel cede */ 532 533 mtspr SPRN_HSRR0,r10 533 534 mtspr SPRN_HSRR1,r11 534 535 ··· 675 676 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL 676 677 beq hcall_try_real_mode 677 678 678 - /* Check for mediated interrupts (could be done earlier really ...) */ 679 + /* Only handle external interrupts here on arch 206 and later */ 679 680 BEGIN_FTR_SECTION 680 - cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL 681 - bne+ 1f 682 - andi. r0,r11,MSR_EE 683 - beq 1f 684 - mfspr r5,SPRN_LPCR 685 - andi. r0,r5,LPCR_MER 686 - bne bounce_ext_interrupt 687 - 1: 688 - END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 681 + b ext_interrupt_to_host 682 + END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) 683 + 684 + /* External interrupt ? */ 685 + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 686 + bne+ ext_interrupt_to_host 687 + 688 + /* External interrupt, first check for host_ipi. If this is 689 + * set, we know the host wants us out so let's do it now 690 + */ 691 + do_ext_interrupt: 692 + lbz r0, HSTATE_HOST_IPI(r13) 693 + cmpwi r0, 0 694 + bne ext_interrupt_to_host 695 + 696 + /* Now read the interrupt from the ICP */ 697 + ld r5, HSTATE_XICS_PHYS(r13) 698 + li r7, XICS_XIRR 699 + cmpdi r5, 0 700 + beq- ext_interrupt_to_host 701 + lwzcix r3, r5, r7 702 + rlwinm. r0, r3, 0, 0xffffff 703 + sync 704 + beq 3f /* if nothing pending in the ICP */ 705 + 706 + /* We found something in the ICP... 
707 + * 708 + * If it's not an IPI, stash it in the PACA and return to 709 + * the host, we don't (yet) handle directing real external 710 + * interrupts directly to the guest 711 + */ 712 + cmpwi r0, XICS_IPI 713 + bne ext_stash_for_host 714 + 715 + /* It's an IPI, clear the MFRR and EOI it */ 716 + li r0, 0xff 717 + li r6, XICS_MFRR 718 + stbcix r0, r5, r6 /* clear the IPI */ 719 + stwcix r3, r5, r7 /* EOI it */ 720 + sync 721 + 722 + /* We need to re-check host IPI now in case it got set in the 723 + * meantime. If it's clear, we bounce the interrupt to the 724 + * guest 725 + */ 726 + lbz r0, HSTATE_HOST_IPI(r13) 727 + cmpwi r0, 0 728 + bne- 1f 729 + 730 + /* Allright, looks like an IPI for the guest, we need to set MER */ 731 + 3: 732 + /* Check if any CPU is heading out to the host, if so head out too */ 733 + ld r5, HSTATE_KVM_VCORE(r13) 734 + lwz r0, VCORE_ENTRY_EXIT(r5) 735 + cmpwi r0, 0x100 736 + bge ext_interrupt_to_host 737 + 738 + /* See if there is a pending interrupt for the guest */ 739 + mfspr r8, SPRN_LPCR 740 + ld r0, VCPU_PENDING_EXC(r9) 741 + /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ 742 + rldicl. r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 743 + rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH 744 + beq 2f 745 + 746 + /* And if the guest EE is set, we can deliver immediately, else 747 + * we return to the guest with MER set 748 + */ 749 + andi. 
r0, r11, MSR_EE 750 + beq 2f 751 + mtspr SPRN_SRR0, r10 752 + mtspr SPRN_SRR1, r11 753 + li r10, BOOK3S_INTERRUPT_EXTERNAL 754 + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 755 + rotldi r11, r11, 63 756 + 2: mr r4, r9 757 + mtspr SPRN_LPCR, r8 758 + b fast_guest_return 759 + 760 + /* We raced with the host, we need to resend that IPI, bummer */ 761 + 1: li r0, IPI_PRIORITY 762 + stbcix r0, r5, r6 /* set the IPI */ 763 + sync 764 + b ext_interrupt_to_host 765 + 766 + ext_stash_for_host: 767 + /* It's not an IPI and it's for the host, stash it in the PACA 768 + * before exit, it will be picked up by the host ICP driver 769 + */ 770 + stw r3, HSTATE_SAVED_XIRR(r13) 771 + ext_interrupt_to_host: 689 772 690 773 guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 691 774 /* Save DEC */ ··· 910 829 beq 44f 911 830 ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ 912 831 li r0,IPI_PRIORITY 913 - li r7,XICS_QIRR 832 + li r7,XICS_MFRR 914 833 stbcix r0,r7,r8 /* trigger the IPI */ 915 834 44: srdi. 
r3,r3,1 916 835 addi r6,r6,PACA_SIZE ··· 1099 1018 lwz r3, LPPACA_YIELDCOUNT(r8) 1100 1019 addi r3, r3, 1 1101 1020 stw r3, LPPACA_YIELDCOUNT(r8) 1021 + li r3, 1 1022 + stb r3, VCPU_VPA_DIRTY(r9) 1102 1023 25: 1103 1024 /* Save PMU registers if requested */ 1104 1025 /* r8 and cr0.eq are live here */ ··· 1433 1350 .long 0 /* 0x58 */ 1434 1351 .long 0 /* 0x5c */ 1435 1352 .long 0 /* 0x60 */ 1436 - .long 0 /* 0x64 */ 1437 - .long 0 /* 0x68 */ 1438 - .long 0 /* 0x6c */ 1439 - .long 0 /* 0x70 */ 1440 - .long 0 /* 0x74 */ 1353 + #ifdef CONFIG_KVM_XICS 1354 + .long .kvmppc_rm_h_eoi - hcall_real_table 1355 + .long .kvmppc_rm_h_cppr - hcall_real_table 1356 + .long .kvmppc_rm_h_ipi - hcall_real_table 1357 + .long 0 /* 0x70 - H_IPOLL */ 1358 + .long .kvmppc_rm_h_xirr - hcall_real_table 1359 + #else 1360 + .long 0 /* 0x64 - H_EOI */ 1361 + .long 0 /* 0x68 - H_CPPR */ 1362 + .long 0 /* 0x6c - H_IPI */ 1363 + .long 0 /* 0x70 - H_IPOLL */ 1364 + .long 0 /* 0x74 - H_XIRR */ 1365 + #endif 1441 1366 .long 0 /* 0x78 */ 1442 1367 .long 0 /* 0x7c */ 1443 1368 .long 0 /* 0x80 */ ··· 1494 1403 1495 1404 ignore_hdec: 1496 1405 mr r4,r9 1497 - b fast_guest_return 1498 - 1499 - bounce_ext_interrupt: 1500 - mr r4,r9 1501 - mtspr SPRN_SRR0,r10 1502 - mtspr SPRN_SRR1,r11 1503 - li r10,BOOK3S_INTERRUPT_EXTERNAL 1504 - li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1505 - rotldi r11,r11,63 1506 1406 b fast_guest_return 1507 1407 1508 1408 _GLOBAL(kvmppc_h_set_dabr) ··· 1601 1519 b . 1602 1520 1603 1521 kvm_end_cede: 1522 + /* get vcpu pointer */ 1523 + ld r4, HSTATE_KVM_VCPU(r13) 1524 + 1604 1525 /* Woken by external or decrementer interrupt */ 1605 1526 ld r1, HSTATE_HOST_R1(r13) 1606 1527 ··· 1643 1558 li r0,0 1644 1559 stb r0,HSTATE_NAPPING(r13) 1645 1560 1561 + /* Check the wake reason in SRR1 to see why we got here */ 1562 + mfspr r3, SPRN_SRR1 1563 + rlwinm r3, r3, 44-31, 0x7 /* extract wake reason field */ 1564 + cmpwi r3, 4 /* was it an external interrupt? 
*/ 1565 + li r12, BOOK3S_INTERRUPT_EXTERNAL 1566 + mr r9, r4 1567 + ld r10, VCPU_PC(r9) 1568 + ld r11, VCPU_MSR(r9) 1569 + beq do_ext_interrupt /* if so */ 1570 + 1646 1571 /* see if any other thread is already exiting */ 1647 1572 lwz r0,VCORE_ENTRY_EXIT(r5) 1648 1573 cmpwi r0,0x100 ··· 1672 1577 1673 1578 /* we've ceded but we want to give control to the host */ 1674 1579 kvm_cede_exit: 1675 - li r3,H_TOO_HARD 1676 - blr 1580 + b hcall_real_fallback 1677 1581 1678 1582 /* Try to handle a machine check in real mode */ 1679 1583 machine_check_realmode: ··· 1720 1626 beq 37f 1721 1627 sync 1722 1628 li r0, 0xff 1723 - li r6, XICS_QIRR 1629 + li r6, XICS_MFRR 1724 1630 stbcix r0, r5, r6 /* clear the IPI */ 1725 1631 stwcix r3, r5, r7 /* EOI it */ 1726 1632 37: sync
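The do_ext_interrupt path added to book3s_hv_rmhandlers.S above reads XIRR from the real-mode ICP, consumes IPIs in place (clear MFRR, EOI, then deliver to the guest via MER), and stashes any other source in HSTATE_SAVED_XIRR for the host ICP driver. Restated as a C sketch against a hypothetical memory-mapped ICP (register effects are simplified; the struct is invented):

```c
#include <assert.h>
#include <stdint.h>

#define XICS_IPI	2
#define NO_IRQ		0

/* Toy model of the ICP registers the assembly pokes via lwzcix/stbcix. */
struct icp_regs {
	uint32_t xirr;	/* pending source in the low 24 bits, CPPR above */
	uint8_t  mfrr;
};

enum ext_result { EXT_NONE, EXT_IPI_FOR_GUEST, EXT_STASH_FOR_HOST };

static enum ext_result handle_ext_interrupt(struct icp_regs *icp,
					    uint32_t *saved_xirr)
{
	uint32_t xirr = icp->xirr;		/* read + ack, as lwzcix XIRR */
	uint32_t src = xirr & 0x00ffffff;

	if (src == NO_IRQ)
		return EXT_NONE;		/* nothing pending in the ICP */
	if (src != XICS_IPI) {
		*saved_xirr = xirr;		/* HSTATE_SAVED_XIRR for host */
		return EXT_STASH_FOR_HOST;
	}
	icp->mfrr = 0xff;			/* clear the IPI */
	icp->xirr = NO_IRQ;			/* EOI it (simplified) */
	return EXT_IPI_FOR_GUEST;		/* bounce to guest via MER */
}
```

The real code additionally re-checks HSTATE_HOST_IPI after the EOI and resends the IPI if the host claimed it in the meantime; that race handling is omitted here.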
+3 -4
arch/powerpc/kvm/book3s_pr.c
···
762 762 		run->exit_reason = KVM_EXIT_MMIO;
763 763 		r = RESUME_HOST_NV;
764 764 		break;
765 - 	case EMULATE_DO_PAPR:
766 - 		run->exit_reason = KVM_EXIT_PAPR_HCALL;
767 - 		vcpu->arch.hcall_needed = 1;
765 + 	case EMULATE_EXIT_USER:
768 766 		r = RESUME_HOST_NV;
769 767 		break;
770 768 	default:
···
1281 1283 
1282 1284 void kvmppc_core_commit_memory_region(struct kvm *kvm,
1283 1285 				struct kvm_userspace_memory_region *mem,
1284 - 				struct kvm_memory_slot old)
1286 + 				const struct kvm_memory_slot *old)
1285 1287 {
1286 1288 }
1287 1289 
···
1296 1298 {
1297 1299 #ifdef CONFIG_PPC64
1298 1300 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1301 + 	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
1299 1302 #endif
1300 1303 
1301 1304 	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+21
arch/powerpc/kvm/book3s_pr_papr.c
···
227 227 	return EMULATE_DONE;
228 228 }
229 229 
230 + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
231 + {
232 + 	long rc = kvmppc_xics_hcall(vcpu, cmd);
233 + 	kvmppc_set_gpr(vcpu, 3, rc);
234 + 	return EMULATE_DONE;
235 + }
236 + 
230 237 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
231 238 {
232 239 	switch (cmd) {
···
252 245 		kvm_vcpu_block(vcpu);
253 246 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
254 247 		vcpu->stat.halt_wakeup++;
248 + 		return EMULATE_DONE;
249 + 	case H_XIRR:
250 + 	case H_CPPR:
251 + 	case H_EOI:
252 + 	case H_IPI:
253 + 		if (kvmppc_xics_enabled(vcpu))
254 + 			return kvmppc_h_pr_xics_hcall(vcpu, cmd);
255 + 		break;
256 + 	case H_RTAS:
257 + 		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
258 + 			return RESUME_HOST;
259 + 		if (kvmppc_rtas_hcall(vcpu))
260 + 			break;
261 + 		kvmppc_set_gpr(vcpu, 3, 0);
255 262 		return EMULATE_DONE;
256 263 
257 264 
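The book3s_pr_papr.c hunk above routes the four XICS hcalls to the in-kernel emulation only when it is enabled, and bounces H_RTAS to userspace when no RTAS tokens have been registered. A toy model of that dispatch (constants and return codes are stand-ins, not the kernel's values):

```c
#include <assert.h>
#include <stdbool.h>

enum { EMULATE_DONE, RESUME_HOST, EMULATE_FAIL };
enum { H_CEDE, H_XIRR, H_CPPR, H_EOI, H_IPI, H_RTAS };

/* Simplified from kvmppc_h_pr(): the two booleans stand in for
 * kvmppc_xics_enabled() and a non-empty arch.rtas_tokens list. */
static int h_pr_dispatch(int cmd, bool xics_enabled, bool have_rtas_tokens)
{
	switch (cmd) {
	case H_XIRR:
	case H_CPPR:
	case H_EOI:
	case H_IPI:
		if (xics_enabled)
			return EMULATE_DONE;	/* kvmppc_xics_hcall() */
		break;
	case H_RTAS:
		if (!have_rtas_tokens)
			return RESUME_HOST;	/* let userspace emulate */
		return EMULATE_DONE;		/* kvmppc_rtas_hcall() */
	}
	return EMULATE_FAIL;			/* unhandled hcall */
}
```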
+274
arch/powerpc/kvm/book3s_rtas.c
/*
 * Copyright 2012 Michael Ellerman, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/err.h>

#include <asm/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/rtas.h>

#ifdef CONFIG_KVM_XICS
static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
{
	u32 irq, server, priority;
	int rc;

	if (args->nargs != 3 || args->nret != 1) {
		rc = -3;
		goto out;
	}

	irq = args->args[0];
	server = args->args[1];
	priority = args->args[2];

	rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
	if (rc)
		rc = -3;
out:
	args->rets[0] = rc;
}

static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
{
	u32 irq, server, priority;
	int rc;

	if (args->nargs != 1 || args->nret != 3) {
		rc = -3;
		goto out;
	}

	irq = args->args[0];

	server = priority = 0;
	rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
	if (rc) {
		rc = -3;
		goto out;
	}

	args->rets[1] = server;
	args->rets[2] = priority;
out:
	args->rets[0] = rc;
}

static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
{
	u32 irq;
	int rc;

	if (args->nargs != 1 || args->nret != 1) {
		rc = -3;
		goto out;
	}

	irq = args->args[0];

	rc = kvmppc_xics_int_off(vcpu->kvm, irq);
	if (rc)
		rc = -3;
out:
	args->rets[0] = rc;
}

static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
{
	u32 irq;
	int rc;

	if (args->nargs != 1 || args->nret != 1) {
		rc = -3;
		goto out;
	}

	irq = args->args[0];

	rc = kvmppc_xics_int_on(vcpu->kvm, irq);
	if (rc)
		rc = -3;
out:
	args->rets[0] = rc;
}
#endif /* CONFIG_KVM_XICS */

struct rtas_handler {
	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
	char *name;
};

static struct rtas_handler rtas_handlers[] = {
#ifdef CONFIG_KVM_XICS
	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
	{ .name = "ibm,int-off", .handler = kvm_rtas_int_off },
	{ .name = "ibm,int-on", .handler = kvm_rtas_int_on },
#endif
};

struct rtas_token_definition {
	struct list_head list;
	struct rtas_handler *handler;
	u64 token;
};

static int rtas_name_matches(char *s1, char *s2)
{
	struct kvm_rtas_token_args args;
	return !strncmp(s1, s2, sizeof(args.name));
}

static int rtas_token_undefine(struct kvm *kvm, char *name)
{
	struct rtas_token_definition *d, *tmp;

	lockdep_assert_held(&kvm->lock);

	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
		if (rtas_name_matches(d->handler->name, name)) {
			list_del(&d->list);
			kfree(d);
			return 0;
		}
	}

	/* It's not an error to undefine an undefined token */
	return 0;
}

static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
{
	struct rtas_token_definition *d;
	struct rtas_handler *h = NULL;
	bool found;
	int i;

	lockdep_assert_held(&kvm->lock);

	list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
		if (d->token == token)
			return -EEXIST;
	}

	found = false;
	for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
		h = &rtas_handlers[i];
		if (rtas_name_matches(h->name, name)) {
			found = true;
			break;
		}
	}

	if (!found)
		return -ENOENT;

	d = kzalloc(sizeof(*d), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->handler = h;
	d->token = token;

	list_add_tail(&d->list, &kvm->arch.rtas_tokens);

	return 0;
}

int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
{
	struct kvm_rtas_token_args args;
	int rc;

	if (copy_from_user(&args, argp, sizeof(args)))
		return -EFAULT;

	mutex_lock(&kvm->lock);

	if (args.token)
		rc = rtas_token_define(kvm, args.name, args.token);
	else
		rc = rtas_token_undefine(kvm, args.name);

	mutex_unlock(&kvm->lock);

	return rc;
}

int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
{
	struct rtas_token_definition *d;
	struct rtas_args args;
	rtas_arg_t *orig_rets;
	gpa_t args_phys;
	int rc;

	/* r4 contains the guest physical address of the RTAS args */
	args_phys = kvmppc_get_gpr(vcpu, 4);

	rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
	if (rc)
		goto fail;

	/*
	 * args->rets is a pointer into args->args. Now that we've
	 * copied args we need to fix it up to point into our copy,
	 * not the guest args. We also need to save the original
	 * value so we can restore it on the way out.
	 */
	orig_rets = args.rets;
	args.rets = &args.args[args.nargs];

	mutex_lock(&vcpu->kvm->lock);

	rc = -ENOENT;
	list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
		if (d->token == args.token) {
			d->handler->handler(vcpu, &args);
			rc = 0;
			break;
		}
	}

	mutex_unlock(&vcpu->kvm->lock);

	if (rc == 0) {
		args.rets = orig_rets;
		rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
		if (rc)
			goto fail;
	}

	return rc;

fail:
	/*
	 * We only get here if the guest has called RTAS with a bogus
	 * args pointer. That means we can't get to the args, and so we
	 * can't fail the RTAS call. So fail right out to userspace,
	 * which should kill the guest.
	 */
	return rc;
}

void kvmppc_rtas_tokens_free(struct kvm *kvm)
{
	struct rtas_token_definition *d, *tmp;

	lockdep_assert_held(&kvm->lock);

	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
		list_del(&d->list);
		kfree(d);
	}
}
arch/powerpc/kvm/book3s_xics.c  +1270
/*
 * Copyright 2012 Michael Ellerman, IBM Corporation.
 * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/anon_inodes.h>

#include <asm/uaccess.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xics.h>
#include <asm/debug.h>

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include "book3s_xics.h"

#if 1
#define XICS_DBG(fmt...) do { } while (0)
#else
#define XICS_DBG(fmt...) trace_printk(fmt)
#endif

#define ENABLE_REALMODE	true
#define DEBUG_REALMODE	false

/*
 * LOCKING
 * =======
 *
 * Each ICS has a mutex protecting the information about the IRQ
 * sources and avoiding simultaneous deliveries of the same interrupt.
 *
 * ICP operations are done via a single compare & swap transaction
 * (most ICP state fits in the union kvmppc_icp_state)
 */

/*
 * TODO
 * ====
 *
 * - To speed up resends, keep a bitmap of "resend" set bits in the
 *   ICS
 *
 * - Speed up server# -> ICP lookup (array ? hash table ?)
 *
 * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
 *   locks array to improve scalability
 */

/* -- ICS routines -- */

static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
			    u32 new_irq);

static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level,
			   bool report_status)
{
	struct ics_irq_state *state;
	struct kvmppc_ics *ics;
	u16 src;

	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);

	ics = kvmppc_xics_find_ics(xics, irq, &src);
	if (!ics) {
		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
		return -EINVAL;
	}
	state = &ics->irq_state[src];
	if (!state->exists)
		return -EINVAL;

	if (report_status)
		return state->asserted;

	/*
	 * We set state->asserted locklessly. This should be fine as
	 * we are the only setter, thus concurrent access is undefined
	 * to begin with.
	 */
	if (level == KVM_INTERRUPT_SET_LEVEL)
		state->asserted = 1;
	else if (level == KVM_INTERRUPT_UNSET) {
		state->asserted = 0;
		return 0;
	}

	/* Attempt delivery */
	icp_deliver_irq(xics, NULL, irq);

	return state->asserted;
}

static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
			     struct kvmppc_icp *icp)
{
	int i;

	mutex_lock(&ics->lock);

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		struct ics_irq_state *state = &ics->irq_state[i];

		if (!state->resend)
			continue;

		XICS_DBG("resend %#x prio %#x\n", state->number,
			 state->priority);

		mutex_unlock(&ics->lock);
		icp_deliver_irq(xics, icp, state->number);
		mutex_lock(&ics->lock);
	}

	mutex_unlock(&ics->lock);
}

static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
		       struct ics_irq_state *state,
		       u32 server, u32 priority, u32 saved_priority)
{
	bool deliver;

	mutex_lock(&ics->lock);

	state->server = server;
	state->priority = priority;
	state->saved_priority = saved_priority;
	deliver = false;
	if ((state->masked_pending || state->resend) && priority != MASKED) {
		state->masked_pending = 0;
		deliver = true;
	}

	mutex_unlock(&ics->lock);

	return deliver;
}

int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
{
	struct kvmppc_xics *xics = kvm->arch.xics;
	struct kvmppc_icp *icp;
	struct kvmppc_ics *ics;
	struct ics_irq_state *state;
	u16 src;

	if (!xics)
		return -ENODEV;

	ics = kvmppc_xics_find_ics(xics, irq, &src);
	if (!ics)
		return -EINVAL;
	state = &ics->irq_state[src];

	icp = kvmppc_xics_find_server(kvm, server);
	if (!icp)
		return -EINVAL;

	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
		 irq, server, priority,
		 state->masked_pending, state->resend);

	if (write_xive(xics, ics, state, server, priority, priority))
		icp_deliver_irq(xics, icp, irq);

	return 0;
}

int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
{
	struct kvmppc_xics *xics = kvm->arch.xics;
	struct kvmppc_ics *ics;
	struct ics_irq_state *state;
	u16 src;

	if (!xics)
		return -ENODEV;

	ics = kvmppc_xics_find_ics(xics, irq, &src);
	if (!ics)
		return -EINVAL;
	state = &ics->irq_state[src];

	mutex_lock(&ics->lock);
	*server = state->server;
	*priority = state->priority;
	mutex_unlock(&ics->lock);

	return 0;
}

int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
{
	struct kvmppc_xics *xics = kvm->arch.xics;
	struct kvmppc_icp *icp;
	struct kvmppc_ics *ics;
	struct ics_irq_state *state;
	u16 src;

	if (!xics)
		return -ENODEV;

	ics = kvmppc_xics_find_ics(xics, irq, &src);
	if (!ics)
		return -EINVAL;
	state = &ics->irq_state[src];

	icp = kvmppc_xics_find_server(kvm, state->server);
	if (!icp)
		return -EINVAL;

	if (write_xive(xics, ics, state, state->server, state->saved_priority,
		       state->saved_priority))
		icp_deliver_irq(xics, icp, irq);

	return 0;
}

int kvmppc_xics_int_off(struct kvm *kvm, u32 irq)
{
	struct kvmppc_xics *xics = kvm->arch.xics;
	struct kvmppc_ics *ics;
	struct ics_irq_state *state;
	u16 src;

	if (!xics)
		return -ENODEV;

	ics = kvmppc_xics_find_ics(xics, irq, &src);
	if (!ics)
		return -EINVAL;
	state = &ics->irq_state[src];

	write_xive(xics, ics, state, state->server, MASKED, state->priority);

	return 0;
}

/* -- ICP routines, including hcalls -- */

static inline bool icp_try_update(struct kvmppc_icp *icp,
				  union kvmppc_icp_state old,
				  union kvmppc_icp_state new,
				  bool change_self)
{
	bool success;

	/* Calculate new output value */
	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));

	/* Attempt atomic update */
	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
	if (!success)
		goto bail;

	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
		 icp->server_num,
		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
		 old.need_resend, old.out_ee);
	XICS_DBG("UPD - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
		 new.need_resend, new.out_ee);
	/*
	 * Check for output state update
	 *
	 * Note that this is racy since another processor could be updating
	 * the state already. This is why we never clear the interrupt output
	 * here, we only ever set it. The clear only happens prior to doing
	 * an update and only by the processor itself. Currently we do it
	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
	 *
	 * We also do not try to figure out whether the EE state has changed,
	 * we unconditionally set it if the new state calls for it. The reason
	 * for that is that we opportunistically remove the pending interrupt
	 * flag when raising CPPR, so we need to set it back here if an
	 * interrupt is still pending.
	 */
	if (new.out_ee) {
		kvmppc_book3s_queue_irqprio(icp->vcpu,
					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
		if (!change_self)
			kvmppc_fast_vcpu_kick(icp->vcpu);
	}
bail:
	return success;
}

static void icp_check_resend(struct kvmppc_xics *xics,
			     struct kvmppc_icp *icp)
{
	u32 icsid;

	/* Order this load with the test for need_resend in the caller */
	smp_rmb();
	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
		struct kvmppc_ics *ics = xics->ics[icsid];

		if (!test_and_clear_bit(icsid, icp->resend_map))
			continue;
		if (!ics)
			continue;
		ics_check_resend(xics, ics, icp);
	}
}

static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
			       u32 *reject)
{
	union kvmppc_icp_state old_state, new_state;
	bool success;

	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
		 icp->server_num);

	do {
		old_state = new_state = ACCESS_ONCE(icp->state);

		*reject = 0;

		/* See if we can deliver */
		success = new_state.cppr > priority &&
			new_state.mfrr > priority &&
			new_state.pending_pri > priority;

		/*
		 * If we can, check for a rejection and perform the
		 * delivery
		 */
		if (success) {
			*reject = new_state.xisr;
			new_state.xisr = irq;
			new_state.pending_pri = priority;
		} else {
			/*
			 * If we failed to deliver we set need_resend
			 * so a subsequent CPPR state change causes us
			 * to try a new delivery.
			 */
			new_state.need_resend = true;
		}

	} while (!icp_try_update(icp, old_state, new_state, false));

	return success;
}

static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
			    u32 new_irq)
{
	struct ics_irq_state *state;
	struct kvmppc_ics *ics;
	u32 reject;
	u16 src;

	/*
	 * This is used both for initial delivery of an interrupt and
	 * for subsequent rejection.
	 *
	 * Rejection can be racy vs. resends. We have evaluated the
	 * rejection in an atomic ICP transaction which is now complete,
	 * so potentially the ICP can already accept the interrupt again.
	 *
	 * So we need to retry the delivery. Essentially the reject path
	 * boils down to a failed delivery. Always.
	 *
	 * Now the interrupt could also have moved to a different target,
	 * thus we may need to re-do the ICP lookup as well
	 */

 again:
	/* Get the ICS state and lock it */
	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
	if (!ics) {
		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
		return;
	}
	state = &ics->irq_state[src];

	/* Get a lock on the ICS */
	mutex_lock(&ics->lock);

	/* Get our server */
	if (!icp || state->server != icp->server_num) {
		icp = kvmppc_xics_find_server(xics->kvm, state->server);
		if (!icp) {
			pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
				new_irq, state->server);
			goto out;
		}
	}

	/* Clear the resend bit of that interrupt */
	state->resend = 0;

	/*
	 * If masked, bail out
	 *
	 * Note: PAPR doesn't mention anything about masked pending
	 * when doing a resend, only when doing a delivery.
	 *
	 * However that would have the effect of losing a masked
	 * interrupt that was rejected and isn't consistent with
	 * the whole masked_pending business which is about not
	 * losing interrupts that occur while masked.
	 *
	 * I don't differentiate normal deliveries and resends, this
	 * implementation will differ from PAPR and not lose such
	 * interrupts.
	 */
	if (state->priority == MASKED) {
		XICS_DBG("irq %#x masked pending\n", new_irq);
		state->masked_pending = 1;
		goto out;
	}

	/*
	 * Try the delivery, this will set the need_resend flag
	 * in the ICP as part of the atomic transaction if the
	 * delivery is not possible.
	 *
	 * Note that if successful, the new delivery might have itself
	 * rejected an interrupt that was "delivered" before we took the
	 * icp mutex.
	 *
	 * In this case we do the whole sequence all over again for the
	 * new guy. We cannot assume that the rejected interrupt is less
	 * favored than the new one, and thus doesn't need to be delivered,
	 * because by the time we exit icp_try_to_deliver() the target
	 * processor may well have already consumed & completed it, and thus
	 * the rejected interrupt might actually be already acceptable.
	 */
	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
		/*
		 * Delivery was successful, did we reject somebody else ?
		 */
		if (reject && reject != XICS_IPI) {
			mutex_unlock(&ics->lock);
			new_irq = reject;
			goto again;
		}
	} else {
		/*
		 * We failed to deliver the interrupt we need to set the
		 * resend map bit and mark the ICS state as needing a resend
		 */
		set_bit(ics->icsid, icp->resend_map);
		state->resend = 1;

		/*
		 * If the need_resend flag got cleared in the ICP some time
		 * between icp_try_to_deliver() atomic update and now, then
		 * we know it might have missed the resend_map bit. So we
		 * retry
		 */
		smp_mb();
		if (!icp->state.need_resend) {
			mutex_unlock(&ics->lock);
			goto again;
		}
	}
 out:
	mutex_unlock(&ics->lock);
}

static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
			  u8 new_cppr)
{
	union kvmppc_icp_state old_state, new_state;
	bool resend;

	/*
	 * This handles several related states in one operation:
	 *
	 * ICP State: Down_CPPR
	 *
	 * Load CPPR with new value and if the XISR is 0
	 * then check for resends:
	 *
	 * ICP State: Resend
	 *
	 * If MFRR is more favored than CPPR, check for IPIs
	 * and notify ICS of a potential resend. This is done
	 * asynchronously (when used in real mode, we will have
	 * to exit here).
	 *
	 * We do not handle the complete Check_IPI as documented
	 * here. In the PAPR, this state will be used for both
	 * Set_MFRR and Down_CPPR. However, we know that we aren't
	 * changing the MFRR state here so we don't need to handle
	 * the case of an MFRR causing a reject of a pending irq,
	 * this will have been handled when the MFRR was set in the
	 * first place.
	 *
	 * Thus we don't have to handle rejects, only resends.
	 *
	 * When implementing real mode for HV KVM, resend will lead to
	 * a H_TOO_HARD return and the whole transaction will be handled
	 * in virtual mode.
	 */
	do {
		old_state = new_state = ACCESS_ONCE(icp->state);

		/* Down_CPPR */
		new_state.cppr = new_cppr;

		/*
		 * Cut down Resend / Check_IPI / IPI
		 *
		 * The logic is that we cannot have a pending interrupt
		 * trumped by an IPI at this point (see above), so we
		 * know that either the pending interrupt is already an
		 * IPI (in which case we don't care to override it) or
		 * it's either more favored than us or non-existent
		 */
		if (new_state.mfrr < new_cppr &&
		    new_state.mfrr <= new_state.pending_pri) {
			WARN_ON(new_state.xisr != XICS_IPI &&
				new_state.xisr != 0);
			new_state.pending_pri = new_state.mfrr;
			new_state.xisr = XICS_IPI;
		}

		/* Latch/clear resend bit */
		resend = new_state.need_resend;
		new_state.need_resend = 0;

	} while (!icp_try_update(icp, old_state, new_state, true));

	/*
	 * Now handle resend checks. Those are asynchronous to the ICP
	 * state update in HW (ie bus transactions) so we can handle them
	 * separately here too
	 */
	if (resend)
		icp_check_resend(xics, icp);
}

static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
{
	union kvmppc_icp_state old_state, new_state;
	struct kvmppc_icp *icp = vcpu->arch.icp;
	u32 xirr;

	/* First, remove EE from the processor */
	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);

	/*
	 * ICP State: Accept_Interrupt
	 *
	 * Return the pending interrupt (if any) along with the
	 * current CPPR, then clear the XISR & set CPPR to the
	 * pending priority
	 */
	do {
		old_state = new_state = ACCESS_ONCE(icp->state);

		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
		if (!old_state.xisr)
			break;
		new_state.cppr = new_state.pending_pri;
		new_state.pending_pri = 0xff;
		new_state.xisr = 0;

	} while (!icp_try_update(icp, old_state, new_state, true));

	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);

	return xirr;
}

static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
				 unsigned long mfrr)
{
	union kvmppc_icp_state old_state, new_state;
	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
	struct kvmppc_icp *icp;
	u32 reject;
	bool resend;
	bool local;

	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
		 vcpu->vcpu_id, server, mfrr);

	icp = vcpu->arch.icp;
	local = icp->server_num == server;
	if (!local) {
		icp = kvmppc_xics_find_server(vcpu->kvm, server);
		if (!icp)
			return H_PARAMETER;
	}

	/*
	 * ICP state: Set_MFRR
	 *
	 * If the CPPR is more favored than the new MFRR, then
	 * nothing needs to be rejected as there can be no XISR to
	 * reject. If the MFRR is being made less favored then
	 * there might be a previously-rejected interrupt needing
	 * to be resent.
	 *
	 * If the CPPR is less favored, then we might be replacing
	 * an interrupt, and thus need to possibly reject it as in
	 *
	 * ICP state: Check_IPI
	 */
	do {
		old_state = new_state = ACCESS_ONCE(icp->state);

		/* Set_MFRR */
		new_state.mfrr = mfrr;

		/* Check_IPI */
		reject = 0;
		resend = false;
		if (mfrr < new_state.cppr) {
			/* Reject a pending interrupt if not an IPI */
			if (mfrr <= new_state.pending_pri)
				reject = new_state.xisr;
			new_state.pending_pri = mfrr;
			new_state.xisr = XICS_IPI;
		}

		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
			resend = new_state.need_resend;
			new_state.need_resend = 0;
		}
	} while (!icp_try_update(icp, old_state, new_state, local));

	/* Handle reject */
	if (reject && reject != XICS_IPI)
		icp_deliver_irq(xics, icp, reject);

	/* Handle resend */
	if (resend)
		icp_check_resend(xics, icp);

	return H_SUCCESS;
}

static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
{
	union kvmppc_icp_state old_state, new_state;
	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
	struct kvmppc_icp *icp = vcpu->arch.icp;
	u32 reject;

	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);

	/*
	 * ICP State: Set_CPPR
	 *
	 * We can safely compare the new value with the current
	 * value outside of the transaction as the CPPR is only
	 * ever changed by the processor on itself
	 */
	if (cppr > icp->state.cppr)
		icp_down_cppr(xics, icp, cppr);
	else if (cppr == icp->state.cppr)
		return;

	/*
	 * ICP State: Up_CPPR
	 *
	 * The processor is raising its priority, this can result
	 * in a rejection of a pending interrupt:
	 *
	 * ICP State: Reject_Current
	 *
	 * We can remove EE from the current processor, the update
	 * transaction will set it again if needed
	 */
	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);

	do {
		old_state = new_state = ACCESS_ONCE(icp->state);

		reject = 0;
		new_state.cppr = cppr;

		if (cppr <= new_state.pending_pri) {
			reject = new_state.xisr;
			new_state.xisr = 0;
			new_state.pending_pri = 0xff;
		}

	} while (!icp_try_update(icp, old_state, new_state, true));

	/*
	 * Check for rejects. They are handled by doing a new delivery
	 * attempt (see comments in icp_deliver_irq).
	 */
	if (reject && reject != XICS_IPI)
		icp_deliver_irq(xics, icp, reject);
}

static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
{
	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
	struct kvmppc_icp *icp = vcpu->arch.icp;
	struct kvmppc_ics *ics;
	struct ics_irq_state *state;
	u32 irq = xirr & 0x00ffffff;
	u16 src;

	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);

	/*
	 * ICP State: EOI
	 *
	 * Note: If EOI is incorrectly used by SW to lower the CPPR
	 * value (ie more favored), we do not check for rejection of
	 * a pending interrupt, this is a SW error and PAPR specifies
	 * that we don't have to deal with it.
	 *
	 * The sending of an EOI to the ICS is handled after the
	 * CPPR update
	 *
	 * ICP State: Down_CPPR which we handle
	 * in a separate function as it's shared with H_CPPR.
	 */
	icp_down_cppr(xics, icp, xirr >> 24);

	/* IPIs have no EOI */
	if (irq == XICS_IPI)
		return H_SUCCESS;
	/*
	 * EOI handling: If the interrupt is still asserted, we need to
	 * resend it. We can take a lockless "peek" at the ICS state here.
	 *
	 * "Message" interrupts will never have "asserted" set
	 */
	ics = kvmppc_xics_find_ics(xics, irq, &src);
	if (!ics) {
		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
		return H_PARAMETER;
	}
	state = &ics->irq_state[src];

	/* Still asserted, resend it */
	if (state->asserted)
		icp_deliver_irq(xics, icp, irq);

	return H_SUCCESS;
}

static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
{
	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
	struct kvmppc_icp *icp = vcpu->arch.icp;

	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);

	if (icp->rm_action & XICS_RM_KICK_VCPU)
		kvmppc_fast_vcpu_kick(icp->rm_kick_target);
	if (icp->rm_action & XICS_RM_CHECK_RESEND)
		icp_check_resend(xics, icp);
	if (icp->rm_action & XICS_RM_REJECT)
		icp_deliver_irq(xics, icp, icp->rm_reject);

	icp->rm_action = 0;

	return H_SUCCESS;
}

int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
{
	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
	unsigned long res;
	int rc = H_SUCCESS;

	/* Check if we have an ICP */
	if (!xics || !vcpu->arch.icp)
		return H_HARDWARE;

	/* Check for real mode returning too hard */
	if (xics->real_mode)
		return kvmppc_xics_rm_complete(vcpu, req);

	switch (req) {
	case H_XIRR:
		res = kvmppc_h_xirr(vcpu);
		kvmppc_set_gpr(vcpu, 4, res);
		break;
	case H_CPPR:
		kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
		break;
	case H_EOI:
		rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
		break;
	case H_IPI:
		rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
				  kvmppc_get_gpr(vcpu, 5));
		break;
	}

	return rc;
}


/* -- Initialisation code etc. -- */

static int xics_debug_show(struct seq_file *m, void *private)
{
	struct kvmppc_xics *xics = m->private;
	struct kvm *kvm = xics->kvm;
	struct kvm_vcpu *vcpu;
	int icsid, i;

	if (!kvm)
		return 0;

	seq_printf(m, "=========\nICP state\n=========\n");

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvmppc_icp *icp = vcpu->arch.icp;
		union kvmppc_icp_state state;

		if (!icp)
			continue;

		state.raw = ACCESS_ONCE(icp->state.raw);
		seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
			   icp->server_num, state.xisr,
			   state.pending_pri, state.cppr, state.mfrr,
			   state.out_ee, state.need_resend);
	}

	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
		struct kvmppc_ics *ics = xics->ics[icsid];

		if (!ics)
			continue;

		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
			   icsid);

		mutex_lock(&ics->lock);

		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
			struct ics_irq_state *irq = &ics->irq_state[i];

			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
				   irq->number, irq->server, irq->priority,
				   irq->saved_priority, irq->asserted,
				   irq->resend, irq->masked_pending);

		}
		mutex_unlock(&ics->lock);
	}
	return 0;
}

static int xics_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, xics_debug_show, inode->i_private);
}

static const struct file_operations xics_debug_fops = {
	.open = xics_debug_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static void xics_debugfs_init(struct kvmppc_xics *xics)
{
	char *name;

	name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
	if (!name) {
		pr_err("%s: no memory for name\n", __func__);
		return;
	}

	xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
					   xics, &xics_debug_fops);

	pr_debug("%s: created %s\n", __func__, name);
	kfree(name);
}

static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
					struct kvmppc_xics *xics, int irq)
{
	struct kvmppc_ics *ics;
	int i, icsid;

	icsid = irq >> KVMPPC_XICS_ICS_SHIFT;

	mutex_lock(&kvm->lock);

	/* ICS already exists - somebody else got here first */
	if (xics->ics[icsid])
		goto out;

	/* Create the ICS */
	ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
	if (!ics)
		goto out;

	mutex_init(&ics->lock);
	ics->icsid = icsid;

	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
		ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
		ics->irq_state[i].priority = MASKED;
		ics->irq_state[i].saved_priority = MASKED;
	}
	smp_wmb();
	xics->ics[icsid] = ics;

	if (icsid > xics->max_icsid)
		xics->max_icsid = icsid;

 out:
	mutex_unlock(&kvm->lock);
	return xics->ics[icsid];
}

int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
{
	struct kvmppc_icp *icp;

	if (!vcpu->kvm->arch.xics)
		return -ENODEV;

	if (kvmppc_xics_find_server(vcpu->kvm, server_num))
		return -EEXIST;

	icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
	if (!icp)
		return -ENOMEM;
948 + 949 + icp->vcpu = vcpu; 950 + icp->server_num = server_num; 951 + icp->state.mfrr = MASKED; 952 + icp->state.pending_pri = MASKED; 953 + vcpu->arch.icp = icp; 954 + 955 + XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id); 956 + 957 + return 0; 958 + } 959 + 960 + u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu) 961 + { 962 + struct kvmppc_icp *icp = vcpu->arch.icp; 963 + union kvmppc_icp_state state; 964 + 965 + if (!icp) 966 + return 0; 967 + state = icp->state; 968 + return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) | 969 + ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) | 970 + ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) | 971 + ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT); 972 + } 973 + 974 + int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) 975 + { 976 + struct kvmppc_icp *icp = vcpu->arch.icp; 977 + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 978 + union kvmppc_icp_state old_state, new_state; 979 + struct kvmppc_ics *ics; 980 + u8 cppr, mfrr, pending_pri; 981 + u32 xisr; 982 + u16 src; 983 + bool resend; 984 + 985 + if (!icp || !xics) 986 + return -ENOENT; 987 + 988 + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; 989 + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & 990 + KVM_REG_PPC_ICP_XISR_MASK; 991 + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; 992 + pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT; 993 + 994 + /* Require the new state to be internally consistent */ 995 + if (xisr == 0) { 996 + if (pending_pri != 0xff) 997 + return -EINVAL; 998 + } else if (xisr == XICS_IPI) { 999 + if (pending_pri != mfrr || pending_pri >= cppr) 1000 + return -EINVAL; 1001 + } else { 1002 + if (pending_pri >= mfrr || pending_pri >= cppr) 1003 + return -EINVAL; 1004 + ics = kvmppc_xics_find_ics(xics, xisr, &src); 1005 + if (!ics) 1006 + return -EINVAL; 1007 + } 1008 + 1009 + new_state.raw = 0; 1010 + new_state.cppr = cppr; 1011 + new_state.xisr = xisr; 1012 + new_state.mfrr = mfrr; 1013 + new_state.pending_pri = pending_pri; 
1014 + 1015 + /* 1016 + * Deassert the CPU interrupt request. 1017 + * icp_try_update will reassert it if necessary. 1018 + */ 1019 + kvmppc_book3s_dequeue_irqprio(icp->vcpu, 1020 + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); 1021 + 1022 + /* 1023 + * Note that if we displace an interrupt from old_state.xisr, 1024 + * we don't mark it as rejected. We expect userspace to set 1025 + * the state of the interrupt sources to be consistent with 1026 + * the ICP states (either before or afterwards, which doesn't 1027 + * matter). We do handle resends due to CPPR becoming less 1028 + * favoured because that is necessary to end up with a 1029 + * consistent state in the situation where userspace restores 1030 + * the ICS states before the ICP states. 1031 + */ 1032 + do { 1033 + old_state = ACCESS_ONCE(icp->state); 1034 + 1035 + if (new_state.mfrr <= old_state.mfrr) { 1036 + resend = false; 1037 + new_state.need_resend = old_state.need_resend; 1038 + } else { 1039 + resend = old_state.need_resend; 1040 + new_state.need_resend = 0; 1041 + } 1042 + } while (!icp_try_update(icp, old_state, new_state, false)); 1043 + 1044 + if (resend) 1045 + icp_check_resend(xics, icp); 1046 + 1047 + return 0; 1048 + } 1049 + 1050 + static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) 1051 + { 1052 + int ret; 1053 + struct kvmppc_ics *ics; 1054 + struct ics_irq_state *irqp; 1055 + u64 __user *ubufp = (u64 __user *) addr; 1056 + u16 idx; 1057 + u64 val, prio; 1058 + 1059 + ics = kvmppc_xics_find_ics(xics, irq, &idx); 1060 + if (!ics) 1061 + return -ENOENT; 1062 + 1063 + irqp = &ics->irq_state[idx]; 1064 + mutex_lock(&ics->lock); 1065 + ret = -ENOENT; 1066 + if (irqp->exists) { 1067 + val = irqp->server; 1068 + prio = irqp->priority; 1069 + if (prio == MASKED) { 1070 + val |= KVM_XICS_MASKED; 1071 + prio = irqp->saved_priority; 1072 + } 1073 + val |= prio << KVM_XICS_PRIORITY_SHIFT; 1074 + if (irqp->asserted) 1075 + val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; 1076 + else if 
(irqp->masked_pending || irqp->resend) 1077 + val |= KVM_XICS_PENDING; 1078 + ret = 0; 1079 + } 1080 + mutex_unlock(&ics->lock); 1081 + 1082 + if (!ret && put_user(val, ubufp)) 1083 + ret = -EFAULT; 1084 + 1085 + return ret; 1086 + } 1087 + 1088 + static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) 1089 + { 1090 + struct kvmppc_ics *ics; 1091 + struct ics_irq_state *irqp; 1092 + u64 __user *ubufp = (u64 __user *) addr; 1093 + u16 idx; 1094 + u64 val; 1095 + u8 prio; 1096 + u32 server; 1097 + 1098 + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) 1099 + return -ENOENT; 1100 + 1101 + ics = kvmppc_xics_find_ics(xics, irq, &idx); 1102 + if (!ics) { 1103 + ics = kvmppc_xics_create_ics(xics->kvm, xics, irq); 1104 + if (!ics) 1105 + return -ENOMEM; 1106 + } 1107 + irqp = &ics->irq_state[idx]; 1108 + if (get_user(val, ubufp)) 1109 + return -EFAULT; 1110 + 1111 + server = val & KVM_XICS_DESTINATION_MASK; 1112 + prio = val >> KVM_XICS_PRIORITY_SHIFT; 1113 + if (prio != MASKED && 1114 + kvmppc_xics_find_server(xics->kvm, server) == NULL) 1115 + return -EINVAL; 1116 + 1117 + mutex_lock(&ics->lock); 1118 + irqp->server = server; 1119 + irqp->saved_priority = prio; 1120 + if (val & KVM_XICS_MASKED) 1121 + prio = MASKED; 1122 + irqp->priority = prio; 1123 + irqp->resend = 0; 1124 + irqp->masked_pending = 0; 1125 + irqp->asserted = 0; 1126 + if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) 1127 + irqp->asserted = 1; 1128 + irqp->exists = 1; 1129 + mutex_unlock(&ics->lock); 1130 + 1131 + if (val & KVM_XICS_PENDING) 1132 + icp_deliver_irq(xics, NULL, irqp->number); 1133 + 1134 + return 0; 1135 + } 1136 + 1137 + int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, 1138 + bool line_status) 1139 + { 1140 + struct kvmppc_xics *xics = kvm->arch.xics; 1141 + 1142 + return ics_deliver_irq(xics, irq, level, line_status); 1143 + } 1144 + 1145 + static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr 
*attr) 1146 + { 1147 + struct kvmppc_xics *xics = dev->private; 1148 + 1149 + switch (attr->group) { 1150 + case KVM_DEV_XICS_GRP_SOURCES: 1151 + return xics_set_source(xics, attr->attr, attr->addr); 1152 + } 1153 + return -ENXIO; 1154 + } 1155 + 1156 + static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1157 + { 1158 + struct kvmppc_xics *xics = dev->private; 1159 + 1160 + switch (attr->group) { 1161 + case KVM_DEV_XICS_GRP_SOURCES: 1162 + return xics_get_source(xics, attr->attr, attr->addr); 1163 + } 1164 + return -ENXIO; 1165 + } 1166 + 1167 + static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1168 + { 1169 + switch (attr->group) { 1170 + case KVM_DEV_XICS_GRP_SOURCES: 1171 + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && 1172 + attr->attr < KVMPPC_XICS_NR_IRQS) 1173 + return 0; 1174 + break; 1175 + } 1176 + return -ENXIO; 1177 + } 1178 + 1179 + static void kvmppc_xics_free(struct kvm_device *dev) 1180 + { 1181 + struct kvmppc_xics *xics = dev->private; 1182 + int i; 1183 + struct kvm *kvm = xics->kvm; 1184 + 1185 + debugfs_remove(xics->dentry); 1186 + 1187 + if (kvm) 1188 + kvm->arch.xics = NULL; 1189 + 1190 + for (i = 0; i <= xics->max_icsid; i++) 1191 + kfree(xics->ics[i]); 1192 + kfree(xics); 1193 + kfree(dev); 1194 + } 1195 + 1196 + static int kvmppc_xics_create(struct kvm_device *dev, u32 type) 1197 + { 1198 + struct kvmppc_xics *xics; 1199 + struct kvm *kvm = dev->kvm; 1200 + int ret = 0; 1201 + 1202 + xics = kzalloc(sizeof(*xics), GFP_KERNEL); 1203 + if (!xics) 1204 + return -ENOMEM; 1205 + 1206 + dev->private = xics; 1207 + xics->dev = dev; 1208 + xics->kvm = kvm; 1209 + 1210 + /* Already there ? 
*/ 1211 + mutex_lock(&kvm->lock); 1212 + if (kvm->arch.xics) 1213 + ret = -EEXIST; 1214 + else 1215 + kvm->arch.xics = xics; 1216 + mutex_unlock(&kvm->lock); 1217 + 1218 + if (ret) 1219 + return ret; 1220 + 1221 + xics_debugfs_init(xics); 1222 + 1223 + #ifdef CONFIG_KVM_BOOK3S_64_HV 1224 + if (cpu_has_feature(CPU_FTR_ARCH_206)) { 1225 + /* Enable real mode support */ 1226 + xics->real_mode = ENABLE_REALMODE; 1227 + xics->real_mode_dbg = DEBUG_REALMODE; 1228 + } 1229 + #endif /* CONFIG_KVM_BOOK3S_64_HV */ 1230 + 1231 + return 0; 1232 + } 1233 + 1234 + struct kvm_device_ops kvm_xics_ops = { 1235 + .name = "kvm-xics", 1236 + .create = kvmppc_xics_create, 1237 + .destroy = kvmppc_xics_free, 1238 + .set_attr = xics_set_attr, 1239 + .get_attr = xics_get_attr, 1240 + .has_attr = xics_has_attr, 1241 + }; 1242 + 1243 + int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, 1244 + u32 xcpu) 1245 + { 1246 + struct kvmppc_xics *xics = dev->private; 1247 + int r = -EBUSY; 1248 + 1249 + if (dev->ops != &kvm_xics_ops) 1250 + return -EPERM; 1251 + if (xics->kvm != vcpu->kvm) 1252 + return -EPERM; 1253 + if (vcpu->arch.irq_type) 1254 + return -EBUSY; 1255 + 1256 + r = kvmppc_xics_create_icp(vcpu, xcpu); 1257 + if (!r) 1258 + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; 1259 + 1260 + return r; 1261 + } 1262 + 1263 + void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) 1264 + { 1265 + if (!vcpu->arch.icp) 1266 + return; 1267 + kfree(vcpu->arch.icp); 1268 + vcpu->arch.icp = NULL; 1269 + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; 1270 + }
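The `kvmppc_xics_set_icp()` hunk above shows the core XICS trick: the whole ICP (CPPR, MFRR, XISR, pending priority, resend flag) packs into one machine word, so state transitions are a snapshot / recompute / compare-and-swap retry loop rather than a lock. A minimal user-space sketch of that pattern, using C11 atomics in place of the kernel's `ACCESS_ONCE`/`icp_try_update`, and reusing the MFRR `need_resend` rule from the code above (the struct layout here is illustrative, not the kernel's exact one):

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Packed ICP state: every field aliases one 64-bit word ("raw"),
 * so the whole state can be swapped atomically. */
union icp_state {
	uint64_t raw;
	struct {
		uint8_t  out_ee : 1;
		uint8_t  need_resend : 1;
		uint8_t  cppr;
		uint8_t  mfrr;
		uint8_t  pending_pri;
		uint32_t xisr;
	};
};

struct icp {
	_Atomic uint64_t state;
};

/* Retry loop mirroring do { ... } while (!icp_try_update(...)):
 * raising MFRR consumes a pending resend, lowering it keeps it. */
static bool icp_set_mfrr(struct icp *icp, uint8_t mfrr)
{
	union icp_state old_state, new_state;
	bool resend;

	do {
		old_state.raw = atomic_load(&icp->state);
		new_state = old_state;
		new_state.mfrr = mfrr;
		if (new_state.mfrr <= old_state.mfrr) {
			resend = false;
			new_state.need_resend = old_state.need_resend;
		} else {
			resend = old_state.need_resend;
			new_state.need_resend = 0;
		}
	} while (!atomic_compare_exchange_weak(&icp->state,
					       &old_state.raw,
					       new_state.raw));
	return resend;
}
```

If the word changed between the load and the swap, the CAS fails and the whole computation reruns against the fresh value, which is why the kernel can call this from real mode without taking locks.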
+130
arch/powerpc/kvm/book3s_xics.h
··· 1 + /* 2 + * Copyright 2012 Michael Ellerman, IBM Corporation. 3 + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License, version 2, as 7 + * published by the Free Software Foundation. 8 + */ 9 + 10 + #ifndef _KVM_PPC_BOOK3S_XICS_H 11 + #define _KVM_PPC_BOOK3S_XICS_H 12 + 13 + /* 14 + * We use a two-level tree to store interrupt source information. 15 + * There are up to 1024 ICS nodes, each of which can represent 16 + * 1024 sources. 17 + */ 18 + #define KVMPPC_XICS_MAX_ICS_ID 1023 19 + #define KVMPPC_XICS_ICS_SHIFT 10 20 + #define KVMPPC_XICS_IRQ_PER_ICS (1 << KVMPPC_XICS_ICS_SHIFT) 21 + #define KVMPPC_XICS_SRC_MASK (KVMPPC_XICS_IRQ_PER_ICS - 1) 22 + 23 + /* 24 + * Interrupt source numbers below this are reserved, for example 25 + * 0 is "no interrupt", and 2 is used for IPIs. 26 + */ 27 + #define KVMPPC_XICS_FIRST_IRQ 16 28 + #define KVMPPC_XICS_NR_IRQS ((KVMPPC_XICS_MAX_ICS_ID + 1) * \ 29 + KVMPPC_XICS_IRQ_PER_ICS) 30 + 31 + /* Priority value to use for disabling an interrupt */ 32 + #define MASKED 0xff 33 + 34 + /* State for one irq source */ 35 + struct ics_irq_state { 36 + u32 number; 37 + u32 server; 38 + u8 priority; 39 + u8 saved_priority; 40 + u8 resend; 41 + u8 masked_pending; 42 + u8 asserted; /* Only for LSI */ 43 + u8 exists; 44 + }; 45 + 46 + /* Atomic ICP state, updated with a single compare & swap */ 47 + union kvmppc_icp_state { 48 + unsigned long raw; 49 + struct { 50 + u8 out_ee:1; 51 + u8 need_resend:1; 52 + u8 cppr; 53 + u8 mfrr; 54 + u8 pending_pri; 55 + u32 xisr; 56 + }; 57 + }; 58 + 59 + /* One bit per ICS */ 60 + #define ICP_RESEND_MAP_SIZE (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1) 61 + 62 + struct kvmppc_icp { 63 + struct kvm_vcpu *vcpu; 64 + unsigned long server_num; 65 + union kvmppc_icp_state state; 66 + unsigned long resend_map[ICP_RESEND_MAP_SIZE]; 67 + 68 + /* Real mode might find 
something too hard, here's the action 69 + * it might request from virtual mode 70 + */ 71 + #define XICS_RM_KICK_VCPU 0x1 72 + #define XICS_RM_CHECK_RESEND 0x2 73 + #define XICS_RM_REJECT 0x4 74 + u32 rm_action; 75 + struct kvm_vcpu *rm_kick_target; 76 + u32 rm_reject; 77 + 78 + /* Debug stuff for real mode */ 79 + union kvmppc_icp_state rm_dbgstate; 80 + struct kvm_vcpu *rm_dbgtgt; 81 + }; 82 + 83 + struct kvmppc_ics { 84 + struct mutex lock; 85 + u16 icsid; 86 + struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; 87 + }; 88 + 89 + struct kvmppc_xics { 90 + struct kvm *kvm; 91 + struct kvm_device *dev; 92 + struct dentry *dentry; 93 + u32 max_icsid; 94 + bool real_mode; 95 + bool real_mode_dbg; 96 + struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; 97 + }; 98 + 99 + static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, 100 + u32 nr) 101 + { 102 + struct kvm_vcpu *vcpu = NULL; 103 + int i; 104 + 105 + kvm_for_each_vcpu(i, vcpu, kvm) { 106 + if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num) 107 + return vcpu->arch.icp; 108 + } 109 + return NULL; 110 + } 111 + 112 + static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, 113 + u32 irq, u16 *source) 114 + { 115 + u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT; 116 + u16 src = irq & KVMPPC_XICS_SRC_MASK; 117 + struct kvmppc_ics *ics; 118 + 119 + if (source) 120 + *source = src; 121 + if (icsid > KVMPPC_XICS_MAX_ICS_ID) 122 + return NULL; 123 + ics = xics->ics[icsid]; 124 + if (!ics) 125 + return NULL; 126 + return ics; 127 + } 128 + 129 + 130 + #endif /* _KVM_PPC_BOOK3S_XICS_H */
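The header above defines the two-level source tree that `kvmppc_xics_find_ics()` walks: a global interrupt number is just an ICS index in the upper bits and a source index in the low 10 bits. A self-contained sketch of that decomposition, using the same shift/mask constants:

```c
#include <assert.h>
#include <stdint.h>

/* Same split as KVMPPC_XICS_ICS_SHIFT / KVMPPC_XICS_SRC_MASK:
 * up to 1024 ICS nodes, each holding 1024 sources. */
#define XICS_ICS_SHIFT   10
#define XICS_IRQ_PER_ICS (1 << XICS_ICS_SHIFT)
#define XICS_SRC_MASK    (XICS_IRQ_PER_ICS - 1)

/* Which ICS node owns this global irq number. */
static inline uint32_t irq_to_icsid(uint32_t irq)
{
	return irq >> XICS_ICS_SHIFT;
}

/* Index of the irq within its ICS node's irq_state[] array. */
static inline uint16_t irq_to_src(uint32_t irq)
{
	return irq & XICS_SRC_MASK;
}
```

`ics_irq_state.number` in the header stores the recombined value, `(icsid << KVMPPC_XICS_ICS_SHIFT) | i`, so the two directions round-trip exactly.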
+111 -49
arch/powerpc/kvm/booke.c
··· 222 222 kvmppc_booke_queue_irqprio(vcpu, prio); 223 223 } 224 224 225 - void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 226 - struct kvm_interrupt *irq) 225 + void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) 227 226 { 228 227 clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); 229 228 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); ··· 346 347 keep_irq = true; 347 348 } 348 349 349 - if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled) 350 + if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags) 350 351 update_epr = true; 351 352 352 353 switch (priority) { ··· 427 428 set_guest_esr(vcpu, vcpu->arch.queued_esr); 428 429 if (update_dear == true) 429 430 set_guest_dear(vcpu, vcpu->arch.queued_dear); 430 - if (update_epr == true) 431 - kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); 431 + if (update_epr == true) { 432 + if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) 433 + kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); 434 + else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) { 435 + BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC); 436 + kvmppc_mpic_set_epr(vcpu); 437 + } 438 + } 432 439 433 440 new_msr &= msr_mask; 434 441 #if defined(CONFIG_64BIT) ··· 749 744 run->hw.hardware_exit_reason = ~0ULL << 32; 750 745 run->hw.hardware_exit_reason |= vcpu->arch.last_inst; 751 746 kvmppc_core_queue_program(vcpu, ESR_PIL); 747 + return RESUME_HOST; 748 + 749 + case EMULATE_EXIT_USER: 752 750 return RESUME_HOST; 753 751 754 752 default: ··· 1156 1148 return r; 1157 1149 } 1158 1150 1151 + static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) 1152 + { 1153 + u32 old_tsr = vcpu->arch.tsr; 1154 + 1155 + vcpu->arch.tsr = new_tsr; 1156 + 1157 + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) 1158 + arm_next_watchdog(vcpu); 1159 + 1160 + update_timer_ints(vcpu); 1161 + } 1162 + 1159 1163 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ 1160 1164 int 
kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 1161 1165 { ··· 1307 1287 kvmppc_emulate_dec(vcpu); 1308 1288 } 1309 1289 1310 - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { 1311 - u32 old_tsr = vcpu->arch.tsr; 1312 - 1313 - vcpu->arch.tsr = sregs->u.e.tsr; 1314 - 1315 - if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) 1316 - arm_next_watchdog(vcpu); 1317 - 1318 - update_timer_ints(vcpu); 1319 - } 1290 + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) 1291 + kvmppc_set_tsr(vcpu, sregs->u.e.tsr); 1320 1292 1321 1293 return 0; 1322 1294 } ··· 1421 1409 1422 1410 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1423 1411 { 1424 - int r = -EINVAL; 1412 + int r = 0; 1413 + union kvmppc_one_reg val; 1414 + int size; 1415 + long int i; 1416 + 1417 + size = one_reg_size(reg->id); 1418 + if (size > sizeof(val)) 1419 + return -EINVAL; 1425 1420 1426 1421 switch (reg->id) { 1427 1422 case KVM_REG_PPC_IAC1: 1428 1423 case KVM_REG_PPC_IAC2: 1429 1424 case KVM_REG_PPC_IAC3: 1430 - case KVM_REG_PPC_IAC4: { 1431 - int iac = reg->id - KVM_REG_PPC_IAC1; 1432 - r = copy_to_user((u64 __user *)(long)reg->addr, 1433 - &vcpu->arch.dbg_reg.iac[iac], sizeof(u64)); 1425 + case KVM_REG_PPC_IAC4: 1426 + i = reg->id - KVM_REG_PPC_IAC1; 1427 + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]); 1434 1428 break; 1435 - } 1436 1429 case KVM_REG_PPC_DAC1: 1437 - case KVM_REG_PPC_DAC2: { 1438 - int dac = reg->id - KVM_REG_PPC_DAC1; 1439 - r = copy_to_user((u64 __user *)(long)reg->addr, 1440 - &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); 1430 + case KVM_REG_PPC_DAC2: 1431 + i = reg->id - KVM_REG_PPC_DAC1; 1432 + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]); 1441 1433 break; 1442 - } 1443 1434 case KVM_REG_PPC_EPR: { 1444 1435 u32 epr = get_guest_epr(vcpu); 1445 - r = put_user(epr, (u32 __user *)(long)reg->addr); 1436 + val = get_reg_val(reg->id, epr); 1446 1437 break; 1447 1438 } 1448 1439 #if defined(CONFIG_64BIT) 1449 1440 case 
KVM_REG_PPC_EPCR: 1450 - r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); 1441 + val = get_reg_val(reg->id, vcpu->arch.epcr); 1451 1442 break; 1452 1443 #endif 1444 + case KVM_REG_PPC_TCR: 1445 + val = get_reg_val(reg->id, vcpu->arch.tcr); 1446 + break; 1447 + case KVM_REG_PPC_TSR: 1448 + val = get_reg_val(reg->id, vcpu->arch.tsr); 1449 + break; 1450 + case KVM_REG_PPC_DEBUG_INST: 1451 + val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV); 1452 + break; 1453 1453 default: 1454 + r = kvmppc_get_one_reg(vcpu, reg->id, &val); 1454 1455 break; 1455 1456 } 1457 + 1458 + if (r) 1459 + return r; 1460 + 1461 + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) 1462 + r = -EFAULT; 1463 + 1456 1464 return r; 1457 1465 } 1458 1466 1459 1467 int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1460 1468 { 1461 - int r = -EINVAL; 1469 + int r = 0; 1470 + union kvmppc_one_reg val; 1471 + int size; 1472 + long int i; 1473 + 1474 + size = one_reg_size(reg->id); 1475 + if (size > sizeof(val)) 1476 + return -EINVAL; 1477 + 1478 + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) 1479 + return -EFAULT; 1462 1480 1463 1481 switch (reg->id) { 1464 1482 case KVM_REG_PPC_IAC1: 1465 1483 case KVM_REG_PPC_IAC2: 1466 1484 case KVM_REG_PPC_IAC3: 1467 - case KVM_REG_PPC_IAC4: { 1468 - int iac = reg->id - KVM_REG_PPC_IAC1; 1469 - r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac], 1470 - (u64 __user *)(long)reg->addr, sizeof(u64)); 1485 + case KVM_REG_PPC_IAC4: 1486 + i = reg->id - KVM_REG_PPC_IAC1; 1487 + vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val); 1471 1488 break; 1472 - } 1473 1489 case KVM_REG_PPC_DAC1: 1474 - case KVM_REG_PPC_DAC2: { 1475 - int dac = reg->id - KVM_REG_PPC_DAC1; 1476 - r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac], 1477 - (u64 __user *)(long)reg->addr, sizeof(u64)); 1490 + case KVM_REG_PPC_DAC2: 1491 + i = reg->id - KVM_REG_PPC_DAC1; 1492 + vcpu->arch.dbg_reg.dac[i] = 
set_reg_val(reg->id, val); 1478 1493 break; 1479 - } 1480 1494 case KVM_REG_PPC_EPR: { 1481 - u32 new_epr; 1482 - r = get_user(new_epr, (u32 __user *)(long)reg->addr); 1483 - if (!r) 1484 - kvmppc_set_epr(vcpu, new_epr); 1495 + u32 new_epr = set_reg_val(reg->id, val); 1496 + kvmppc_set_epr(vcpu, new_epr); 1485 1497 break; 1486 1498 } 1487 1499 #if defined(CONFIG_64BIT) 1488 1500 case KVM_REG_PPC_EPCR: { 1489 - u32 new_epcr; 1490 - r = get_user(new_epcr, (u32 __user *)(long)reg->addr); 1491 - if (r == 0) 1492 - kvmppc_set_epcr(vcpu, new_epcr); 1501 + u32 new_epcr = set_reg_val(reg->id, val); 1502 + kvmppc_set_epcr(vcpu, new_epcr); 1493 1503 break; 1494 1504 } 1495 1505 #endif 1496 - default: 1506 + case KVM_REG_PPC_OR_TSR: { 1507 + u32 tsr_bits = set_reg_val(reg->id, val); 1508 + kvmppc_set_tsr_bits(vcpu, tsr_bits); 1497 1509 break; 1498 1510 } 1511 + case KVM_REG_PPC_CLEAR_TSR: { 1512 + u32 tsr_bits = set_reg_val(reg->id, val); 1513 + kvmppc_clr_tsr_bits(vcpu, tsr_bits); 1514 + break; 1515 + } 1516 + case KVM_REG_PPC_TSR: { 1517 + u32 tsr = set_reg_val(reg->id, val); 1518 + kvmppc_set_tsr(vcpu, tsr); 1519 + break; 1520 + } 1521 + case KVM_REG_PPC_TCR: { 1522 + u32 tcr = set_reg_val(reg->id, val); 1523 + kvmppc_set_tcr(vcpu, tcr); 1524 + break; 1525 + } 1526 + default: 1527 + r = kvmppc_set_one_reg(vcpu, reg->id, &val); 1528 + break; 1529 + } 1530 + 1499 1531 return r; 1532 + } 1533 + 1534 + int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 1535 + struct kvm_guest_debug *dbg) 1536 + { 1537 + return -EINVAL; 1500 1538 } 1501 1539 1502 1540 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) ··· 1593 1531 1594 1532 void kvmppc_core_commit_memory_region(struct kvm *kvm, 1595 1533 struct kvm_userspace_memory_region *mem, 1596 - struct kvm_memory_slot old) 1534 + const struct kvm_memory_slot *old) 1597 1535 { 1598 1536 } 1599 1537
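The reworked `kvm_vcpu_ioctl_get_one_reg`/`set_one_reg` paths above stop hard-coding `u32` vs `u64` user copies and instead derive the transfer size from the register ID via `one_reg_size()`. A sketch of that decoding, following the size-field encoding in the KVM uapi headers (bits 52-55 of the ID hold log2 of the byte size; the constant names here are local stand-ins for the `KVM_REG_SIZE_*` macros):

```c
#include <assert.h>
#include <stdint.h>

/* Size field of a ONE_REG id: 1 << field gives the byte count. */
#define REG_SIZE_SHIFT 52
#define REG_SIZE_MASK  0x00f0000000000000ULL
#define REG_SIZE_U32   0x0020000000000000ULL  /* log2(4) == 2 */
#define REG_SIZE_U64   0x0030000000000000ULL  /* log2(8) == 3 */

/* Analogue of one_reg_size(): how many bytes to copy to/from
 * userspace for this register id. */
static inline uint32_t one_reg_size(uint64_t id)
{
	return 1u << ((id & REG_SIZE_MASK) >> REG_SIZE_SHIFT);
}
```

This is why the new code can reject oversized registers up front (`size > sizeof(val)`) and then use a single `copy_to_user`/`copy_from_user` of `size` bytes for every case.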
+39 -3
arch/powerpc/kvm/booke_interrupts.S
··· 54 54 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ 55 55 (1<<BOOKE_INTERRUPT_ALIGNMENT)) 56 56 57 - .macro KVM_HANDLER ivor_nr scratch srr0 58 - _GLOBAL(kvmppc_handler_\ivor_nr) 57 + .macro __KVM_HANDLER ivor_nr scratch srr0 59 58 /* Get pointer to vcpu and record exit number. */ 60 59 mtspr \scratch , r4 61 60 mfspr r4, SPRN_SPRG_THREAD ··· 73 74 ori r6, r6, kvmppc_resume_host@l 74 75 mtctr r6 75 76 bctr 77 + .endm 78 + 79 + .macro KVM_HANDLER ivor_nr scratch srr0 80 + _GLOBAL(kvmppc_handler_\ivor_nr) 81 + __KVM_HANDLER \ivor_nr \scratch \srr0 82 + .endm 83 + 84 + .macro KVM_DBG_HANDLER ivor_nr scratch srr0 85 + _GLOBAL(kvmppc_handler_\ivor_nr) 86 + mtspr \scratch, r4 87 + mfspr r4, SPRN_SPRG_THREAD 88 + lwz r4, THREAD_KVM_VCPU(r4) 89 + stw r3, VCPU_CRIT_SAVE(r4) 90 + mfcr r3 91 + mfspr r4, SPRN_CSRR1 92 + andi. r4, r4, MSR_PR 93 + bne 1f 94 + /* debug interrupt happened in enter/exit path */ 95 + mfspr r4, SPRN_CSRR1 96 + rlwinm r4, r4, 0, ~MSR_DE 97 + mtspr SPRN_CSRR1, r4 98 + lis r4, 0xffff 99 + ori r4, r4, 0xffff 100 + mtspr SPRN_DBSR, r4 101 + mfspr r4, SPRN_SPRG_THREAD 102 + lwz r4, THREAD_KVM_VCPU(r4) 103 + mtcr r3 104 + lwz r3, VCPU_CRIT_SAVE(r4) 105 + mfspr r4, \scratch 106 + rfci 107 + 1: /* debug interrupt happened in guest */ 108 + mtcr r3 109 + mfspr r4, SPRN_SPRG_THREAD 110 + lwz r4, THREAD_KVM_VCPU(r4) 111 + lwz r3, VCPU_CRIT_SAVE(r4) 112 + mfspr r4, \scratch 113 + __KVM_HANDLER \ivor_nr \scratch \srr0 76 114 .endm 77 115 78 116 .macro KVM_HANDLER_ADDR ivor_nr ··· 136 100 KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 137 101 KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 138 102 KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 139 - KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 103 + KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 140 104 KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 141 105 KVM_HANDLER 
BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 142 106 KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+14
arch/powerpc/kvm/e500.c
··· 425 425 return kvmppc_set_sregs_ivor(vcpu, sregs); 426 426 } 427 427 428 + int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, 429 + union kvmppc_one_reg *val) 430 + { 431 + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); 432 + return r; 433 + } 434 + 435 + int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, 436 + union kvmppc_one_reg *val) 437 + { 438 + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); 439 + return r; 440 + } 441 + 428 442 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) 429 443 { 430 444 struct kvmppc_vcpu_e500 *vcpu_e500;
Note: in the e500.c hunk above, `kvmppc_set_one_reg()` calls `kvmppc_get_one_reg_e500_tlb()`; judging by the symmetric e500mc.c hunk below, the intended callee is `kvmppc_set_one_reg_e500_tlb()`.
+22
arch/powerpc/kvm/e500.h
··· 23 23 #include <asm/mmu-book3e.h> 24 24 #include <asm/tlb.h> 25 25 26 + enum vcpu_ftr { 27 + VCPU_FTR_MMU_V2 28 + }; 29 + 26 30 #define E500_PID_NUM 3 27 31 #define E500_TLB_NUM 2 28 32 ··· 135 131 void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 136 132 int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 137 133 134 + int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, 135 + union kvmppc_one_reg *val); 136 + int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, 137 + union kvmppc_one_reg *val); 138 138 139 139 #ifdef CONFIG_KVM_E500V2 140 140 unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, ··· 302 294 /* Force TS=1 for all guest mappings. */ 303 295 #define get_tlb_sts(gtlbe) (MAS1_TS) 304 296 #endif /* !BOOKE_HV */ 297 + 298 + static inline bool has_feature(const struct kvm_vcpu *vcpu, 299 + enum vcpu_ftr ftr) 300 + { 301 + bool has_ftr; 302 + switch (ftr) { 303 + case VCPU_FTR_MMU_V2: 304 + has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2); 305 + break; 306 + default: 307 + return false; 308 + } 309 + return has_ftr; 310 + } 305 311 306 312 #endif /* KVM_E500_H */
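The `has_feature()` helper added to e500.h above gates MMU-v2-only behaviour (TLBnPS, EPTCFG, e6500 support) on the MAVN field of the guest's MMUCFG. A runnable sketch of the same switch pattern; the mask and version value below are hypothetical stand-ins for the real `MMUCFG_MAVN`/`MMUCFG_MAVN_V2` definitions in mmu-book3e.h, chosen only so the example executes:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Stand-in field encodings -- check mmu-book3e.h for the real ones. */
#define MMUCFG_MAVN     0x3  /* MMU architecture version field */
#define MMUCFG_MAVN_V2  0x1  /* assumed encoding of version 2 */

enum vcpu_ftr { VCPU_FTR_MMU_V2 };

/* Feature test driven by the emulated MMUCFG value, mirroring the
 * switch in has_feature(): unknown features default to absent. */
static bool has_feature(uint32_t mmucfg, enum vcpu_ftr ftr)
{
	switch (ftr) {
	case VCPU_FTR_MMU_V2:
		return (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2;
	default:
		return false;
	}
}
```

Keying the check off the vcpu's own MMUCFG (rather than the host CPU type) means a v1 guest on v2 hardware still sees v1 behaviour.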
+19
arch/powerpc/kvm/e500_emulate.c
··· 284 284 case SPRN_TLB1CFG: 285 285 *spr_val = vcpu->arch.tlbcfg[1]; 286 286 break; 287 + case SPRN_TLB0PS: 288 + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) 289 + return EMULATE_FAIL; 290 + *spr_val = vcpu->arch.tlbps[0]; 291 + break; 292 + case SPRN_TLB1PS: 293 + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) 294 + return EMULATE_FAIL; 295 + *spr_val = vcpu->arch.tlbps[1]; 296 + break; 287 297 case SPRN_L1CSR0: 288 298 *spr_val = vcpu_e500->l1csr0; 289 299 break; ··· 316 306 317 307 case SPRN_MMUCFG: 318 308 *spr_val = vcpu->arch.mmucfg; 309 + break; 310 + case SPRN_EPTCFG: 311 + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) 312 + return EMULATE_FAIL; 313 + /* 314 + * Legacy Linux guests access EPTCFG register even if the E.PT 315 + * category is disabled in the VM. Give them a chance to live. 316 + */ 317 + *spr_val = vcpu->arch.eptcfg; 319 318 break; 320 319 321 320 /* extra exceptions */
+170 -22
arch/powerpc/kvm/e500_mmu.c
··· 596 596 return 0; 597 597 } 598 598 599 + int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, 600 + union kvmppc_one_reg *val) 601 + { 602 + int r = 0; 603 + long int i; 604 + 605 + switch (id) { 606 + case KVM_REG_PPC_MAS0: 607 + *val = get_reg_val(id, vcpu->arch.shared->mas0); 608 + break; 609 + case KVM_REG_PPC_MAS1: 610 + *val = get_reg_val(id, vcpu->arch.shared->mas1); 611 + break; 612 + case KVM_REG_PPC_MAS2: 613 + *val = get_reg_val(id, vcpu->arch.shared->mas2); 614 + break; 615 + case KVM_REG_PPC_MAS7_3: 616 + *val = get_reg_val(id, vcpu->arch.shared->mas7_3); 617 + break; 618 + case KVM_REG_PPC_MAS4: 619 + *val = get_reg_val(id, vcpu->arch.shared->mas4); 620 + break; 621 + case KVM_REG_PPC_MAS6: 622 + *val = get_reg_val(id, vcpu->arch.shared->mas6); 623 + break; 624 + case KVM_REG_PPC_MMUCFG: 625 + *val = get_reg_val(id, vcpu->arch.mmucfg); 626 + break; 627 + case KVM_REG_PPC_EPTCFG: 628 + *val = get_reg_val(id, vcpu->arch.eptcfg); 629 + break; 630 + case KVM_REG_PPC_TLB0CFG: 631 + case KVM_REG_PPC_TLB1CFG: 632 + case KVM_REG_PPC_TLB2CFG: 633 + case KVM_REG_PPC_TLB3CFG: 634 + i = id - KVM_REG_PPC_TLB0CFG; 635 + *val = get_reg_val(id, vcpu->arch.tlbcfg[i]); 636 + break; 637 + case KVM_REG_PPC_TLB0PS: 638 + case KVM_REG_PPC_TLB1PS: 639 + case KVM_REG_PPC_TLB2PS: 640 + case KVM_REG_PPC_TLB3PS: 641 + i = id - KVM_REG_PPC_TLB0PS; 642 + *val = get_reg_val(id, vcpu->arch.tlbps[i]); 643 + break; 644 + default: 645 + r = -EINVAL; 646 + break; 647 + } 648 + 649 + return r; 650 + } 651 + 652 + int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, 653 + union kvmppc_one_reg *val) 654 + { 655 + int r = 0; 656 + long int i; 657 + 658 + switch (id) { 659 + case KVM_REG_PPC_MAS0: 660 + vcpu->arch.shared->mas0 = set_reg_val(id, *val); 661 + break; 662 + case KVM_REG_PPC_MAS1: 663 + vcpu->arch.shared->mas1 = set_reg_val(id, *val); 664 + break; 665 + case KVM_REG_PPC_MAS2: 666 + vcpu->arch.shared->mas2 = set_reg_val(id, *val); 667 + break; 668 + case 
KVM_REG_PPC_MAS7_3: 669 + vcpu->arch.shared->mas7_3 = set_reg_val(id, *val); 670 + break; 671 + case KVM_REG_PPC_MAS4: 672 + vcpu->arch.shared->mas4 = set_reg_val(id, *val); 673 + break; 674 + case KVM_REG_PPC_MAS6: 675 + vcpu->arch.shared->mas6 = set_reg_val(id, *val); 676 + break; 677 + /* Only allow MMU registers to be set to the config supported by KVM */ 678 + case KVM_REG_PPC_MMUCFG: { 679 + u32 reg = set_reg_val(id, *val); 680 + if (reg != vcpu->arch.mmucfg) 681 + r = -EINVAL; 682 + break; 683 + } 684 + case KVM_REG_PPC_EPTCFG: { 685 + u32 reg = set_reg_val(id, *val); 686 + if (reg != vcpu->arch.eptcfg) 687 + r = -EINVAL; 688 + break; 689 + } 690 + case KVM_REG_PPC_TLB0CFG: 691 + case KVM_REG_PPC_TLB1CFG: 692 + case KVM_REG_PPC_TLB2CFG: 693 + case KVM_REG_PPC_TLB3CFG: { 694 + /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */ 695 + u32 reg = set_reg_val(id, *val); 696 + i = id - KVM_REG_PPC_TLB0CFG; 697 + if (reg != vcpu->arch.tlbcfg[i]) 698 + r = -EINVAL; 699 + break; 700 + } 701 + case KVM_REG_PPC_TLB0PS: 702 + case KVM_REG_PPC_TLB1PS: 703 + case KVM_REG_PPC_TLB2PS: 704 + case KVM_REG_PPC_TLB3PS: { 705 + u32 reg = set_reg_val(id, *val); 706 + i = id - KVM_REG_PPC_TLB0PS; 707 + if (reg != vcpu->arch.tlbps[i]) 708 + r = -EINVAL; 709 + break; 710 + } 711 + default: 712 + r = -EINVAL; 713 + break; 714 + } 715 + 716 + return r; 717 + } 718 + 719 + static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu, 720 + struct kvm_book3e_206_tlb_params *params) 721 + { 722 + vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 723 + if (params->tlb_sizes[0] <= 2048) 724 + vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0]; 725 + vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; 726 + 727 + vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 728 + vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1]; 729 + vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; 730 + return 0; 731 + } 732 + 599 733 int 
kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, 600 734 struct kvm_config_tlb *cfg) 601 735 { ··· 826 692 vcpu_e500->gtlb_offset[0] = 0; 827 693 vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; 828 694 829 - vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; 830 - 831 - vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 832 - if (params.tlb_sizes[0] <= 2048) 833 - vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0]; 834 - vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; 835 - 836 - vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 837 - vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1]; 838 - vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; 695 + /* Update vcpu's MMU geometry based on SW_TLB input */ 696 + vcpu_mmu_geometry_update(vcpu, &params); 839 697 840 698 vcpu_e500->shared_tlb_pages = pages; 841 699 vcpu_e500->num_shared_tlb_pages = num_pages; ··· 860 734 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 861 735 kvmppc_recalc_tlb1map_range(vcpu_e500); 862 736 kvmppc_core_flush_tlb(vcpu); 737 + return 0; 738 + } 739 + 740 + /* Vcpu's MMU default configuration */ 741 + static int vcpu_mmu_init(struct kvm_vcpu *vcpu, 742 + struct kvmppc_e500_tlb_params *params) 743 + { 744 + /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/ 745 + vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; 746 + 747 + /* Initialize TLBnCFG fields with host values and SW_TLB geometry*/ 748 + vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & 749 + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 750 + vcpu->arch.tlbcfg[0] |= params[0].entries; 751 + vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT; 752 + 753 + vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & 754 + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 755 + vcpu->arch.tlbcfg[1] |= params[1].entries; 756 + vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT; 757 + 758 + if (has_feature(vcpu, VCPU_FTR_MMU_V2)) { 759 + vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); 
760 + vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS); 761 + 762 + vcpu->arch.mmucfg &= ~MMUCFG_LRAT; 763 + 764 + /* Guest mmu emulation currently doesn't handle E.PT */ 765 + vcpu->arch.eptcfg = 0; 766 + vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT; 767 + vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND; 768 + } 769 + 863 770 return 0; 864 771 } 865 772 ··· 940 781 if (!vcpu_e500->g2h_tlb1_map) 941 782 goto err; 942 783 943 - /* Init TLB configuration register */ 944 - vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & 945 - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 946 - vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries; 947 - vcpu->arch.tlbcfg[0] |= 948 - vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT; 949 - 950 - vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & 951 - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 952 - vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries; 953 - vcpu->arch.tlbcfg[1] |= 954 - vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT; 784 + vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); 955 785 956 786 kvmppc_recalc_tlb1map_range(vcpu_e500); 957 787 return 0;
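`vcpu_mmu_geometry_update()` in the hunk above rewrites only the N_ENTRY and ASSOC fields of TLBnCFG while keeping the rest of the host-derived value intact. A sketch of that read-modify-write, with field positions taken from the Book3E TLBnCFG layout (N_ENTRY in the low 12 bits, ASSOC in the top byte; verify against mmu-book3e.h before relying on them):

```c
#include <assert.h>
#include <stdint.h>

/* TLBnCFG geometry fields (assumed Book3E layout). */
#define TLBnCFG_N_ENTRY      0x00000fffu
#define TLBnCFG_ASSOC        0xff000000u
#define TLBnCFG_ASSOC_SHIFT  24

/* Replace only the geometry (entry count and associativity) in a
 * TLBnCFG value, preserving every other host-provided field. */
static uint32_t set_tlb_geometry(uint32_t tlbcfg,
				 uint32_t entries, uint32_t ways)
{
	tlbcfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
	tlbcfg |= entries & TLBnCFG_N_ENTRY;
	tlbcfg |= (ways << TLBnCFG_ASSOC_SHIFT) & TLBnCFG_ASSOC;
	return tlbcfg;
}
```

Splitting this out of `kvm_vcpu_ioctl_config_tlb()` is what lets the new `vcpu_mmu_init()` and the SW_TLB path share one geometry-update routine.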
+16
arch/powerpc/kvm/e500mc.c
··· 177 177 r = 0; 178 178 else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) 179 179 r = 0; 180 + else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0) 181 + r = 0; 180 182 else 181 183 r = -ENOTSUPP; 182 184 ··· 260 258 } 261 259 262 260 return kvmppc_set_sregs_ivor(vcpu, sregs); 261 + } 262 + 263 + int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, 264 + union kvmppc_one_reg *val) 265 + { 266 + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); 267 + return r; 268 + } 269 + 270 + int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, 271 + union kvmppc_one_reg *val) 272 + { 273 + int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); 274 + return r; 263 275 } 264 276 265 277 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
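The new kvmppc_get_one_reg()/kvmppc_set_one_reg() entry points are thin forwarders into the e500 TLB backend, which claims the register ids it owns and returns a negative errno for the rest. A stand-alone sketch of that dispatch shape — the register id, its contents, and the function names are made up for illustration, not real ONE_REG ids:

```c
#include <assert.h>
#include <errno.h>
#include <stdint.h>

/* Hypothetical backend owning exactly one register id. */
#define REG_ID_TLB_GEOMETRY 0x1000u	/* illustrative id only */

static int tlb_get_one_reg(uint64_t id, uint64_t *val)
{
	if (id == REG_ID_TLB_GEOMETRY) {
		*val = 42;	/* placeholder register contents */
		return 0;
	}
	return -EINVAL;		/* id not handled by this backend */
}

/* Forwarding wrapper, shaped like kvmppc_get_one_reg() in the hunk:
 * the core entry point simply delegates and propagates the result. */
static int get_one_reg(uint64_t id, uint64_t *val)
{
	return tlb_get_one_reg(id, val);
}
```

Keeping the wrapper trivial lets additional backends be chained in later without changing callers.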
+2
arch/powerpc/kvm/emulate.c
··· 38 38 39 39 #define OP_31_XOP_TRAP 4 40 40 #define OP_31_XOP_LWZX 23 41 + #define OP_31_XOP_DCBST 54 41 42 #define OP_31_XOP_TRAP_64 68 42 43 #define OP_31_XOP_DCBF 86 43 44 #define OP_31_XOP_LBZX 87 ··· 371 370 emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); 372 371 break; 373 372 373 + case OP_31_XOP_DCBST: 374 374 case OP_31_XOP_DCBF: 375 375 case OP_31_XOP_DCBI: 376 376 /* Do nothing. The guest is performing dcbi because
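The new OP_31_XOP_DCBST case keys off the extended opcode of a 31-form PowerPC instruction: the primary opcode sits in the top 6 bits of the instruction word and the extended opcode in bits 1..10. A small sketch of that decode (the same field extraction the kernel's get_op()/get_xop() helpers perform):

```c
#include <assert.h>
#include <stdint.h>

/* Primary opcode: top 6 bits of the 32-bit instruction word. */
static uint32_t get_op(uint32_t inst)
{
	return inst >> 26;
}

/* Extended opcode for 31-form instructions: bits 1..10. */
static uint32_t get_xop(uint32_t inst)
{
	return (inst >> 1) & 0x3ff;
}
```

For example, `dcbst 0,0` encodes as primary opcode 31 with extended opcode 54, and `dcbf 0,0` as 31 with 86 — which is how the emulator routes both to the shared "do nothing" cache-op case.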
+20
arch/powerpc/kvm/irq.h
··· 1 + #ifndef __IRQ_H 2 + #define __IRQ_H 3 + 4 + #include <linux/kvm_host.h> 5 + 6 + static inline int irqchip_in_kernel(struct kvm *kvm) 7 + { 8 + int ret = 0; 9 + 10 + #ifdef CONFIG_KVM_MPIC 11 + ret = ret || (kvm->arch.mpic != NULL); 12 + #endif 13 + #ifdef CONFIG_KVM_XICS 14 + ret = ret || (kvm->arch.xics != NULL); 15 + #endif 16 + smp_rmb(); 17 + return ret; 18 + } 19 + 20 + #endif
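The irqchip_in_kernel() helper above follows a simple pattern: each optional in-kernel irqchip contributes one NULL check (guarded by its config option), and the result is true if any backend has been instantiated. A single-threaded user-space sketch of the same shape — the struct and field names are stand-ins for `kvm->arch`, and the kernel's `smp_rmb()` (which orders the pointer check against reads of the chip's state) is omitted here:

```c
#include <assert.h>
#include <stddef.h>

/* Stand-in for kvm->arch with its optional irqchip pointers. */
struct fake_kvm_arch {
	void *mpic;
	void *xics;
};

/* True if any in-kernel irqchip backend is instantiated. */
static int fake_irqchip_in_kernel(const struct fake_kvm_arch *arch)
{
	int ret = 0;

	ret = ret || (arch->mpic != NULL);
	ret = ret || (arch->xics != NULL);
	return ret;
}
```

The `ret || ...` chaining keeps the function correct as more `#ifdef`-guarded backends are appended.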
+1853
arch/powerpc/kvm/mpic.c
··· 1 + /* 2 + * OpenPIC emulation 3 + * 4 + * Copyright (c) 2004 Jocelyn Mayer 5 + * 2011 Alexander Graf 6 + * 7 + * Permission is hereby granted, free of charge, to any person obtaining a copy 8 + * of this software and associated documentation files (the "Software"), to deal 9 + * in the Software without restriction, including without limitation the rights 10 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 + * copies of the Software, and to permit persons to whom the Software is 12 + * furnished to do so, subject to the following conditions: 13 + * 14 + * The above copyright notice and this permission notice shall be included in 15 + * all copies or substantial portions of the Software. 16 + * 17 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 + * THE SOFTWARE. 
24 + */ 25 + 26 + #include <linux/slab.h> 27 + #include <linux/mutex.h> 28 + #include <linux/kvm_host.h> 29 + #include <linux/errno.h> 30 + #include <linux/fs.h> 31 + #include <linux/anon_inodes.h> 32 + #include <asm/uaccess.h> 33 + #include <asm/mpic.h> 34 + #include <asm/kvm_para.h> 35 + #include <asm/kvm_host.h> 36 + #include <asm/kvm_ppc.h> 37 + #include "iodev.h" 38 + 39 + #define MAX_CPU 32 40 + #define MAX_SRC 256 41 + #define MAX_TMR 4 42 + #define MAX_IPI 4 43 + #define MAX_MSI 8 44 + #define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR) 45 + #define VID 0x03 /* MPIC version ID */ 46 + 47 + /* OpenPIC capability flags */ 48 + #define OPENPIC_FLAG_IDR_CRIT (1 << 0) 49 + #define OPENPIC_FLAG_ILR (2 << 0) 50 + 51 + /* OpenPIC address map */ 52 + #define OPENPIC_REG_SIZE 0x40000 53 + #define OPENPIC_GLB_REG_START 0x0 54 + #define OPENPIC_GLB_REG_SIZE 0x10F0 55 + #define OPENPIC_TMR_REG_START 0x10F0 56 + #define OPENPIC_TMR_REG_SIZE 0x220 57 + #define OPENPIC_MSI_REG_START 0x1600 58 + #define OPENPIC_MSI_REG_SIZE 0x200 59 + #define OPENPIC_SUMMARY_REG_START 0x3800 60 + #define OPENPIC_SUMMARY_REG_SIZE 0x800 61 + #define OPENPIC_SRC_REG_START 0x10000 62 + #define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20) 63 + #define OPENPIC_CPU_REG_START 0x20000 64 + #define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000)) 65 + 66 + struct fsl_mpic_info { 67 + int max_ext; 68 + }; 69 + 70 + static struct fsl_mpic_info fsl_mpic_20 = { 71 + .max_ext = 12, 72 + }; 73 + 74 + static struct fsl_mpic_info fsl_mpic_42 = { 75 + .max_ext = 12, 76 + }; 77 + 78 + #define FRR_NIRQ_SHIFT 16 79 + #define FRR_NCPU_SHIFT 8 80 + #define FRR_VID_SHIFT 0 81 + 82 + #define VID_REVISION_1_2 2 83 + #define VID_REVISION_1_3 3 84 + 85 + #define VIR_GENERIC 0x00000000 /* Generic Vendor ID */ 86 + 87 + #define GCR_RESET 0x80000000 88 + #define GCR_MODE_PASS 0x00000000 89 + #define GCR_MODE_MIXED 0x20000000 90 + #define GCR_MODE_PROXY 0x60000000 91 + 92 + #define TBCR_CI 0x80000000 /* count inhibit */ 93 + 
#define TCCR_TOG 0x80000000 /* toggles when decrement to zero */ 94 + 95 + #define IDR_EP_SHIFT 31 96 + #define IDR_EP_MASK (1 << IDR_EP_SHIFT) 97 + #define IDR_CI0_SHIFT 30 98 + #define IDR_CI1_SHIFT 29 99 + #define IDR_P1_SHIFT 1 100 + #define IDR_P0_SHIFT 0 101 + 102 + #define ILR_INTTGT_MASK 0x000000ff 103 + #define ILR_INTTGT_INT 0x00 104 + #define ILR_INTTGT_CINT 0x01 /* critical */ 105 + #define ILR_INTTGT_MCP 0x02 /* machine check */ 106 + #define NUM_OUTPUTS 3 107 + 108 + #define MSIIR_OFFSET 0x140 109 + #define MSIIR_SRS_SHIFT 29 110 + #define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT) 111 + #define MSIIR_IBS_SHIFT 24 112 + #define MSIIR_IBS_MASK (0x1f << MSIIR_IBS_SHIFT) 113 + 114 + static int get_current_cpu(void) 115 + { 116 + #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) 117 + struct kvm_vcpu *vcpu = current->thread.kvm_vcpu; 118 + return vcpu ? vcpu->arch.irq_cpu_id : -1; 119 + #else 120 + /* XXX */ 121 + return -1; 122 + #endif 123 + } 124 + 125 + static int openpic_cpu_write_internal(void *opaque, gpa_t addr, 126 + u32 val, int idx); 127 + static int openpic_cpu_read_internal(void *opaque, gpa_t addr, 128 + u32 *ptr, int idx); 129 + 130 + enum irq_type { 131 + IRQ_TYPE_NORMAL = 0, 132 + IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */ 133 + IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */ 134 + }; 135 + 136 + struct irq_queue { 137 + /* Round up to the nearest 64 IRQs so that the queue length 138 + * won't change when moving between 32 and 64 bit hosts. 139 + */ 140 + unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)]; 141 + int next; 142 + int priority; 143 + }; 144 + 145 + struct irq_source { 146 + uint32_t ivpr; /* IRQ vector/priority register */ 147 + uint32_t idr; /* IRQ destination register */ 148 + uint32_t destmask; /* bitmap of CPU destinations */ 149 + int last_cpu; 150 + int output; /* IRQ level, e.g. 
ILR_INTTGT_INT */ 151 + int pending; /* TRUE if IRQ is pending */ 152 + enum irq_type type; 153 + bool level:1; /* level-triggered */ 154 + bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */ 155 + }; 156 + 157 + #define IVPR_MASK_SHIFT 31 158 + #define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT) 159 + #define IVPR_ACTIVITY_SHIFT 30 160 + #define IVPR_ACTIVITY_MASK (1 << IVPR_ACTIVITY_SHIFT) 161 + #define IVPR_MODE_SHIFT 29 162 + #define IVPR_MODE_MASK (1 << IVPR_MODE_SHIFT) 163 + #define IVPR_POLARITY_SHIFT 23 164 + #define IVPR_POLARITY_MASK (1 << IVPR_POLARITY_SHIFT) 165 + #define IVPR_SENSE_SHIFT 22 166 + #define IVPR_SENSE_MASK (1 << IVPR_SENSE_SHIFT) 167 + 168 + #define IVPR_PRIORITY_MASK (0xF << 16) 169 + #define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16)) 170 + #define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask) 171 + 172 + /* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */ 173 + #define IDR_EP 0x80000000 /* external pin */ 174 + #define IDR_CI 0x40000000 /* critical interrupt */ 175 + 176 + struct irq_dest { 177 + struct kvm_vcpu *vcpu; 178 + 179 + int32_t ctpr; /* CPU current task priority */ 180 + struct irq_queue raised; 181 + struct irq_queue servicing; 182 + 183 + /* Count of IRQ sources asserting on non-INT outputs */ 184 + uint32_t outputs_active[NUM_OUTPUTS]; 185 + }; 186 + 187 + #define MAX_MMIO_REGIONS 10 188 + 189 + struct openpic { 190 + struct kvm *kvm; 191 + struct kvm_device *dev; 192 + struct kvm_io_device mmio; 193 + const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS]; 194 + int num_mmio_regions; 195 + 196 + gpa_t reg_base; 197 + spinlock_t lock; 198 + 199 + /* Behavior control */ 200 + struct fsl_mpic_info *fsl; 201 + uint32_t model; 202 + uint32_t flags; 203 + uint32_t nb_irqs; 204 + uint32_t vid; 205 + uint32_t vir; /* Vendor identification register */ 206 + uint32_t vector_mask; 207 + uint32_t tfrr_reset; 208 + uint32_t ivpr_reset; 209 + uint32_t idr_reset; 210 + uint32_t brr1; 
211 + uint32_t mpic_mode_mask; 212 + 213 + /* Global registers */ 214 + uint32_t frr; /* Feature reporting register */ 215 + uint32_t gcr; /* Global configuration register */ 216 + uint32_t pir; /* Processor initialization register */ 217 + uint32_t spve; /* Spurious vector register */ 218 + uint32_t tfrr; /* Timer frequency reporting register */ 219 + /* Source registers */ 220 + struct irq_source src[MAX_IRQ]; 221 + /* Local registers per output pin */ 222 + struct irq_dest dst[MAX_CPU]; 223 + uint32_t nb_cpus; 224 + /* Timer registers */ 225 + struct { 226 + uint32_t tccr; /* Global timer current count register */ 227 + uint32_t tbcr; /* Global timer base count register */ 228 + } timers[MAX_TMR]; 229 + /* Shared MSI registers */ 230 + struct { 231 + uint32_t msir; /* Shared Message Signaled Interrupt Register */ 232 + } msi[MAX_MSI]; 233 + uint32_t max_irq; 234 + uint32_t irq_ipi0; 235 + uint32_t irq_tim0; 236 + uint32_t irq_msi; 237 + }; 238 + 239 + 240 + static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst, 241 + int output) 242 + { 243 + struct kvm_interrupt irq = { 244 + .irq = KVM_INTERRUPT_SET_LEVEL, 245 + }; 246 + 247 + if (!dst->vcpu) { 248 + pr_debug("%s: destination cpu %d does not exist\n", 249 + __func__, (int)(dst - &opp->dst[0])); 250 + return; 251 + } 252 + 253 + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, 254 + output); 255 + 256 + if (output != ILR_INTTGT_INT) /* TODO */ 257 + return; 258 + 259 + kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq); 260 + } 261 + 262 + static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst, 263 + int output) 264 + { 265 + if (!dst->vcpu) { 266 + pr_debug("%s: destination cpu %d does not exist\n", 267 + __func__, (int)(dst - &opp->dst[0])); 268 + return; 269 + } 270 + 271 + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, 272 + output); 273 + 274 + if (output != ILR_INTTGT_INT) /* TODO */ 275 + return; 276 + 277 + 
kvmppc_core_dequeue_external(dst->vcpu); 278 + } 279 + 280 + static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ) 281 + { 282 + set_bit(n_IRQ, q->queue); 283 + } 284 + 285 + static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ) 286 + { 287 + clear_bit(n_IRQ, q->queue); 288 + } 289 + 290 + static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ) 291 + { 292 + return test_bit(n_IRQ, q->queue); 293 + } 294 + 295 + static void IRQ_check(struct openpic *opp, struct irq_queue *q) 296 + { 297 + int irq = -1; 298 + int next = -1; 299 + int priority = -1; 300 + 301 + for (;;) { 302 + irq = find_next_bit(q->queue, opp->max_irq, irq + 1); 303 + if (irq == opp->max_irq) 304 + break; 305 + 306 + pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n", 307 + irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority); 308 + 309 + if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) { 310 + next = irq; 311 + priority = IVPR_PRIORITY(opp->src[irq].ivpr); 312 + } 313 + } 314 + 315 + q->next = next; 316 + q->priority = priority; 317 + } 318 + 319 + static int IRQ_get_next(struct openpic *opp, struct irq_queue *q) 320 + { 321 + /* XXX: optimize */ 322 + IRQ_check(opp, q); 323 + 324 + return q->next; 325 + } 326 + 327 + static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, 328 + bool active, bool was_active) 329 + { 330 + struct irq_dest *dst; 331 + struct irq_source *src; 332 + int priority; 333 + 334 + dst = &opp->dst[n_CPU]; 335 + src = &opp->src[n_IRQ]; 336 + 337 + pr_debug("%s: IRQ %d active %d was %d\n", 338 + __func__, n_IRQ, active, was_active); 339 + 340 + if (src->output != ILR_INTTGT_INT) { 341 + pr_debug("%s: output %d irq %d active %d was %d count %d\n", 342 + __func__, src->output, n_IRQ, active, was_active, 343 + dst->outputs_active[src->output]); 344 + 345 + /* On Freescale MPIC, critical interrupts ignore priority, 346 + * IACK, EOI, etc. Before MPIC v4.1 they also ignore 347 + * masking. 
348 + */ 349 + if (active) { 350 + if (!was_active && 351 + dst->outputs_active[src->output]++ == 0) { 352 + pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n", 353 + __func__, src->output, n_CPU, n_IRQ); 354 + mpic_irq_raise(opp, dst, src->output); 355 + } 356 + } else { 357 + if (was_active && 358 + --dst->outputs_active[src->output] == 0) { 359 + pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n", 360 + __func__, src->output, n_CPU, n_IRQ); 361 + mpic_irq_lower(opp, dst, src->output); 362 + } 363 + } 364 + 365 + return; 366 + } 367 + 368 + priority = IVPR_PRIORITY(src->ivpr); 369 + 370 + /* Even if the interrupt doesn't have enough priority, 371 + * it is still raised, in case ctpr is lowered later. 372 + */ 373 + if (active) 374 + IRQ_setbit(&dst->raised, n_IRQ); 375 + else 376 + IRQ_resetbit(&dst->raised, n_IRQ); 377 + 378 + IRQ_check(opp, &dst->raised); 379 + 380 + if (active && priority <= dst->ctpr) { 381 + pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n", 382 + __func__, n_IRQ, priority, dst->ctpr, n_CPU); 383 + active = 0; 384 + } 385 + 386 + if (active) { 387 + if (IRQ_get_next(opp, &dst->servicing) >= 0 && 388 + priority <= dst->servicing.priority) { 389 + pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n", 390 + __func__, n_IRQ, dst->servicing.next, n_CPU); 391 + } else { 392 + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", 393 + __func__, n_CPU, n_IRQ, dst->raised.next); 394 + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); 395 + } 396 + } else { 397 + IRQ_get_next(opp, &dst->servicing); 398 + if (dst->raised.priority > dst->ctpr && 399 + dst->raised.priority > dst->servicing.priority) { 400 + pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n", 401 + __func__, n_IRQ, dst->raised.next, 402 + dst->raised.priority, dst->ctpr, 403 + dst->servicing.priority, n_CPU); 404 + /* IRQ line stays asserted */ 405 + } else { 406 + pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", 407 + 
__func__, n_IRQ, dst->ctpr, 408 + dst->servicing.priority, n_CPU); 409 + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); 410 + } 411 + } 412 + } 413 + 414 + /* update pic state because registers for n_IRQ have changed value */ 415 + static void openpic_update_irq(struct openpic *opp, int n_IRQ) 416 + { 417 + struct irq_source *src; 418 + bool active, was_active; 419 + int i; 420 + 421 + src = &opp->src[n_IRQ]; 422 + active = src->pending; 423 + 424 + if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) { 425 + /* Interrupt source is disabled */ 426 + pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ); 427 + active = false; 428 + } 429 + 430 + was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK); 431 + 432 + /* 433 + * We don't have a similar check for already-active because 434 + * ctpr may have changed and we need to withdraw the interrupt. 435 + */ 436 + if (!active && !was_active) { 437 + pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ); 438 + return; 439 + } 440 + 441 + if (active) 442 + src->ivpr |= IVPR_ACTIVITY_MASK; 443 + else 444 + src->ivpr &= ~IVPR_ACTIVITY_MASK; 445 + 446 + if (src->destmask == 0) { 447 + /* No target */ 448 + pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ); 449 + return; 450 + } 451 + 452 + if (src->destmask == (1 << src->last_cpu)) { 453 + /* Only one CPU is allowed to receive this IRQ */ 454 + IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active); 455 + } else if (!(src->ivpr & IVPR_MODE_MASK)) { 456 + /* Directed delivery mode */ 457 + for (i = 0; i < opp->nb_cpus; i++) { 458 + if (src->destmask & (1 << i)) { 459 + IRQ_local_pipe(opp, i, n_IRQ, active, 460 + was_active); 461 + } 462 + } 463 + } else { 464 + /* Distributed delivery mode */ 465 + for (i = src->last_cpu + 1; i != src->last_cpu; i++) { 466 + if (i == opp->nb_cpus) 467 + i = 0; 468 + 469 + if (src->destmask & (1 << i)) { 470 + IRQ_local_pipe(opp, i, n_IRQ, active, 471 + was_active); 472 + src->last_cpu = i; 473 + break; 474 + } 475 + } 476 + } 477 + } 
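The distributed-delivery branch of openpic_update_irq() above rotates through the CPUs in destmask, starting just after last_cpu and wrapping at nb_cpus. That selection can be lifted into a small pure function — a sketch only, assuming nb_cpus >= 2 (matching the multi-target case this branch handles); the name next_dest_cpu is not from the patch:

```c
#include <assert.h>
#include <stdint.h>

/* Round-robin selection as in the distributed-delivery loop: scan
 * forward from last_cpu + 1, wrapping at nb_cpus, and return the first
 * CPU whose bit is set in destmask. Falls back to last_cpu if it is
 * the only target, or -1 if destmask selects nobody.
 * Assumes nb_cpus >= 2 so the scan always terminates. */
static int next_dest_cpu(uint32_t destmask, int nb_cpus, int last_cpu)
{
	int i;

	for (i = last_cpu + 1; i != last_cpu; i++) {
		if (i == nb_cpus)
			i = 0;
		if (destmask & (1u << i))
			return i;
	}
	return (destmask & (1u << last_cpu)) ? last_cpu : -1;
}
```

In the kernel the chosen CPU is also written back to src->last_cpu, so successive deliveries of the same interrupt spread across the eligible CPUs.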
478 + 479 + static void openpic_set_irq(void *opaque, int n_IRQ, int level) 480 + { 481 + struct openpic *opp = opaque; 482 + struct irq_source *src; 483 + 484 + if (n_IRQ >= MAX_IRQ) { 485 + WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ); 486 + return; 487 + } 488 + 489 + src = &opp->src[n_IRQ]; 490 + pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n", 491 + n_IRQ, level, src->ivpr); 492 + if (src->level) { 493 + /* level-sensitive irq */ 494 + src->pending = level; 495 + openpic_update_irq(opp, n_IRQ); 496 + } else { 497 + /* edge-sensitive irq */ 498 + if (level) { 499 + src->pending = 1; 500 + openpic_update_irq(opp, n_IRQ); 501 + } 502 + 503 + if (src->output != ILR_INTTGT_INT) { 504 + /* Edge-triggered interrupts shouldn't be used 505 + * with non-INT delivery, but just in case, 506 + * try to make it do something sane rather than 507 + * cause an interrupt storm. This is close to 508 + * what you'd probably see happen in real hardware. 509 + */ 510 + src->pending = 0; 511 + openpic_update_irq(opp, n_IRQ); 512 + } 513 + } 514 + } 515 + 516 + static void openpic_reset(struct openpic *opp) 517 + { 518 + int i; 519 + 520 + opp->gcr = GCR_RESET; 521 + /* Initialise controller registers */ 522 + opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) | 523 + (opp->vid << FRR_VID_SHIFT); 524 + 525 + opp->pir = 0; 526 + opp->spve = -1 & opp->vector_mask; 527 + opp->tfrr = opp->tfrr_reset; 528 + /* Initialise IRQ sources */ 529 + for (i = 0; i < opp->max_irq; i++) { 530 + opp->src[i].ivpr = opp->ivpr_reset; 531 + opp->src[i].idr = opp->idr_reset; 532 + 533 + switch (opp->src[i].type) { 534 + case IRQ_TYPE_NORMAL: 535 + opp->src[i].level = 536 + !!(opp->ivpr_reset & IVPR_SENSE_MASK); 537 + break; 538 + 539 + case IRQ_TYPE_FSLINT: 540 + opp->src[i].ivpr |= IVPR_POLARITY_MASK; 541 + break; 542 + 543 + case IRQ_TYPE_FSLSPECIAL: 544 + break; 545 + } 546 + } 547 + /* Initialise IRQ destinations */ 548 + for (i = 0; i < MAX_CPU; i++) { 549 + opp->dst[i].ctpr = 15; 550 
+ memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue)); 551 + opp->dst[i].raised.next = -1; 552 + memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue)); 553 + opp->dst[i].servicing.next = -1; 554 + } 555 + /* Initialise timers */ 556 + for (i = 0; i < MAX_TMR; i++) { 557 + opp->timers[i].tccr = 0; 558 + opp->timers[i].tbcr = TBCR_CI; 559 + } 560 + /* Go out of RESET state */ 561 + opp->gcr = 0; 562 + } 563 + 564 + static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ) 565 + { 566 + return opp->src[n_IRQ].idr; 567 + } 568 + 569 + static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ) 570 + { 571 + if (opp->flags & OPENPIC_FLAG_ILR) 572 + return opp->src[n_IRQ].output; 573 + 574 + return 0xffffffff; 575 + } 576 + 577 + static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ) 578 + { 579 + return opp->src[n_IRQ].ivpr; 580 + } 581 + 582 + static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, 583 + uint32_t val) 584 + { 585 + struct irq_source *src = &opp->src[n_IRQ]; 586 + uint32_t normal_mask = (1UL << opp->nb_cpus) - 1; 587 + uint32_t crit_mask = 0; 588 + uint32_t mask = normal_mask; 589 + int crit_shift = IDR_EP_SHIFT - opp->nb_cpus; 590 + int i; 591 + 592 + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { 593 + crit_mask = mask << crit_shift; 594 + mask |= crit_mask | IDR_EP; 595 + } 596 + 597 + src->idr = val & mask; 598 + pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); 599 + 600 + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { 601 + if (src->idr & crit_mask) { 602 + if (src->idr & normal_mask) { 603 + pr_debug("%s: IRQ configured for multiple output types, using critical\n", 604 + __func__); 605 + } 606 + 607 + src->output = ILR_INTTGT_CINT; 608 + src->nomask = true; 609 + src->destmask = 0; 610 + 611 + for (i = 0; i < opp->nb_cpus; i++) { 612 + int n_ci = IDR_CI0_SHIFT - i; 613 + 614 + if (src->idr & (1UL << n_ci)) 615 + src->destmask |= 1UL << i; 616 + } 617 + } else { 618 + src->output = 
ILR_INTTGT_INT; 619 + src->nomask = false; 620 + src->destmask = src->idr & normal_mask; 621 + } 622 + } else { 623 + src->destmask = src->idr; 624 + } 625 + } 626 + 627 + static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ, 628 + uint32_t val) 629 + { 630 + if (opp->flags & OPENPIC_FLAG_ILR) { 631 + struct irq_source *src = &opp->src[n_IRQ]; 632 + 633 + src->output = val & ILR_INTTGT_MASK; 634 + pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, 635 + src->output); 636 + 637 + /* TODO: on MPIC v4.0 only, set nomask for non-INT */ 638 + } 639 + } 640 + 641 + static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ, 642 + uint32_t val) 643 + { 644 + uint32_t mask; 645 + 646 + /* NOTE when implementing newer FSL MPIC models: starting with v4.0, 647 + * the polarity bit is read-only on internal interrupts. 648 + */ 649 + mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | 650 + IVPR_POLARITY_MASK | opp->vector_mask; 651 + 652 + /* ACTIVITY bit is read-only */ 653 + opp->src[n_IRQ].ivpr = 654 + (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); 655 + 656 + /* For FSL internal interrupts, The sense bit is reserved and zero, 657 + * and the interrupt is always level-triggered. Timers and IPIs 658 + * have no sense or polarity bits, and are edge-triggered. 
659 + */ 660 + switch (opp->src[n_IRQ].type) { 661 + case IRQ_TYPE_NORMAL: 662 + opp->src[n_IRQ].level = 663 + !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK); 664 + break; 665 + 666 + case IRQ_TYPE_FSLINT: 667 + opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK; 668 + break; 669 + 670 + case IRQ_TYPE_FSLSPECIAL: 671 + opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK); 672 + break; 673 + } 674 + 675 + openpic_update_irq(opp, n_IRQ); 676 + pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val, 677 + opp->src[n_IRQ].ivpr); 678 + } 679 + 680 + static void openpic_gcr_write(struct openpic *opp, uint64_t val) 681 + { 682 + if (val & GCR_RESET) { 683 + openpic_reset(opp); 684 + return; 685 + } 686 + 687 + opp->gcr &= ~opp->mpic_mode_mask; 688 + opp->gcr |= val & opp->mpic_mode_mask; 689 + } 690 + 691 + static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val) 692 + { 693 + struct openpic *opp = opaque; 694 + int err = 0; 695 + 696 + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); 697 + if (addr & 0xF) 698 + return 0; 699 + 700 + switch (addr) { 701 + case 0x00: /* Block Revision Register1 (BRR1) is Readonly */ 702 + break; 703 + case 0x40: 704 + case 0x50: 705 + case 0x60: 706 + case 0x70: 707 + case 0x80: 708 + case 0x90: 709 + case 0xA0: 710 + case 0xB0: 711 + err = openpic_cpu_write_internal(opp, addr, val, 712 + get_current_cpu()); 713 + break; 714 + case 0x1000: /* FRR */ 715 + break; 716 + case 0x1020: /* GCR */ 717 + openpic_gcr_write(opp, val); 718 + break; 719 + case 0x1080: /* VIR */ 720 + break; 721 + case 0x1090: /* PIR */ 722 + /* 723 + * This register is used to reset a CPU core -- 724 + * let userspace handle it. 
725 + */ 726 + err = -ENXIO; 727 + break; 728 + case 0x10A0: /* IPI_IVPR */ 729 + case 0x10B0: 730 + case 0x10C0: 731 + case 0x10D0: { 732 + int idx; 733 + idx = (addr - 0x10A0) >> 4; 734 + write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val); 735 + break; 736 + } 737 + case 0x10E0: /* SPVE */ 738 + opp->spve = val & opp->vector_mask; 739 + break; 740 + default: 741 + break; 742 + } 743 + 744 + return err; 745 + } 746 + 747 + static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr) 748 + { 749 + struct openpic *opp = opaque; 750 + u32 retval; 751 + int err = 0; 752 + 753 + pr_debug("%s: addr %#llx\n", __func__, addr); 754 + retval = 0xFFFFFFFF; 755 + if (addr & 0xF) 756 + goto out; 757 + 758 + switch (addr) { 759 + case 0x1000: /* FRR */ 760 + retval = opp->frr; 761 + retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT; 762 + break; 763 + case 0x1020: /* GCR */ 764 + retval = opp->gcr; 765 + break; 766 + case 0x1080: /* VIR */ 767 + retval = opp->vir; 768 + break; 769 + case 0x1090: /* PIR */ 770 + retval = 0x00000000; 771 + break; 772 + case 0x00: /* Block Revision Register1 (BRR1) */ 773 + retval = opp->brr1; 774 + break; 775 + case 0x40: 776 + case 0x50: 777 + case 0x60: 778 + case 0x70: 779 + case 0x80: 780 + case 0x90: 781 + case 0xA0: 782 + case 0xB0: 783 + err = openpic_cpu_read_internal(opp, addr, 784 + &retval, get_current_cpu()); 785 + break; 786 + case 0x10A0: /* IPI_IVPR */ 787 + case 0x10B0: 788 + case 0x10C0: 789 + case 0x10D0: 790 + { 791 + int idx; 792 + idx = (addr - 0x10A0) >> 4; 793 + retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx); 794 + } 795 + break; 796 + case 0x10E0: /* SPVE */ 797 + retval = opp->spve; 798 + break; 799 + default: 800 + break; 801 + } 802 + 803 + out: 804 + pr_debug("%s: => 0x%08x\n", __func__, retval); 805 + *ptr = retval; 806 + return err; 807 + } 808 + 809 + static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val) 810 + { 811 + struct openpic *opp = opaque; 812 + int idx; 813 + 814 + addr += 0x10f0; 815 + 816 + 
pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); 817 + if (addr & 0xF) 818 + return 0; 819 + 820 + if (addr == 0x10f0) { 821 + /* TFRR */ 822 + opp->tfrr = val; 823 + return 0; 824 + } 825 + 826 + idx = (addr >> 6) & 0x3; 827 + addr = addr & 0x30; 828 + 829 + switch (addr & 0x30) { 830 + case 0x00: /* TCCR */ 831 + break; 832 + case 0x10: /* TBCR */ 833 + if ((opp->timers[idx].tccr & TCCR_TOG) != 0 && 834 + (val & TBCR_CI) == 0 && 835 + (opp->timers[idx].tbcr & TBCR_CI) != 0) 836 + opp->timers[idx].tccr &= ~TCCR_TOG; 837 + 838 + opp->timers[idx].tbcr = val; 839 + break; 840 + case 0x20: /* TVPR */ 841 + write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val); 842 + break; 843 + case 0x30: /* TDR */ 844 + write_IRQreg_idr(opp, opp->irq_tim0 + idx, val); 845 + break; 846 + } 847 + 848 + return 0; 849 + } 850 + 851 + static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr) 852 + { 853 + struct openpic *opp = opaque; 854 + uint32_t retval = -1; 855 + int idx; 856 + 857 + pr_debug("%s: addr %#llx\n", __func__, addr); 858 + if (addr & 0xF) 859 + goto out; 860 + 861 + idx = (addr >> 6) & 0x3; 862 + if (addr == 0x0) { 863 + /* TFRR */ 864 + retval = opp->tfrr; 865 + goto out; 866 + } 867 + 868 + switch (addr & 0x30) { 869 + case 0x00: /* TCCR */ 870 + retval = opp->timers[idx].tccr; 871 + break; 872 + case 0x10: /* TBCR */ 873 + retval = opp->timers[idx].tbcr; 874 + break; 875 + case 0x20: /* TIPV */ 876 + retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx); 877 + break; 878 + case 0x30: /* TIDE (TIDR) */ 879 + retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx); 880 + break; 881 + } 882 + 883 + out: 884 + pr_debug("%s: => 0x%08x\n", __func__, retval); 885 + *ptr = retval; 886 + return 0; 887 + } 888 + 889 + static int openpic_src_write(void *opaque, gpa_t addr, u32 val) 890 + { 891 + struct openpic *opp = opaque; 892 + int idx; 893 + 894 + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); 895 + 896 + addr = addr & 0xffff; 897 + idx = addr >> 5; 898 + 899 
+ switch (addr & 0x1f) { 900 + case 0x00: 901 + write_IRQreg_ivpr(opp, idx, val); 902 + break; 903 + case 0x10: 904 + write_IRQreg_idr(opp, idx, val); 905 + break; 906 + case 0x18: 907 + write_IRQreg_ilr(opp, idx, val); 908 + break; 909 + } 910 + 911 + return 0; 912 + } 913 + 914 + static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr) 915 + { 916 + struct openpic *opp = opaque; 917 + uint32_t retval; 918 + int idx; 919 + 920 + pr_debug("%s: addr %#llx\n", __func__, addr); 921 + retval = 0xFFFFFFFF; 922 + 923 + addr = addr & 0xffff; 924 + idx = addr >> 5; 925 + 926 + switch (addr & 0x1f) { 927 + case 0x00: 928 + retval = read_IRQreg_ivpr(opp, idx); 929 + break; 930 + case 0x10: 931 + retval = read_IRQreg_idr(opp, idx); 932 + break; 933 + case 0x18: 934 + retval = read_IRQreg_ilr(opp, idx); 935 + break; 936 + } 937 + 938 + pr_debug("%s: => 0x%08x\n", __func__, retval); 939 + *ptr = retval; 940 + return 0; 941 + } 942 + 943 + static int openpic_msi_write(void *opaque, gpa_t addr, u32 val) 944 + { 945 + struct openpic *opp = opaque; 946 + int idx = opp->irq_msi; 947 + int srs, ibs; 948 + 949 + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); 950 + if (addr & 0xF) 951 + return 0; 952 + 953 + switch (addr) { 954 + case MSIIR_OFFSET: 955 + srs = val >> MSIIR_SRS_SHIFT; 956 + idx += srs; 957 + ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT; 958 + opp->msi[srs].msir |= 1 << ibs; 959 + openpic_set_irq(opp, idx, 1); 960 + break; 961 + default: 962 + /* most registers are read-only, thus ignored */ 963 + break; 964 + } 965 + 966 + return 0; 967 + } 968 + 969 + static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr) 970 + { 971 + struct openpic *opp = opaque; 972 + uint32_t r = 0; 973 + int i, srs; 974 + 975 + pr_debug("%s: addr %#llx\n", __func__, addr); 976 + if (addr & 0xF) 977 + return -ENXIO; 978 + 979 + srs = addr >> 4; 980 + 981 + switch (addr) { 982 + case 0x00: 983 + case 0x10: 984 + case 0x20: 985 + case 0x30: 986 + case 0x40: 987 + case 
0x50: 988 + case 0x60: 989 + case 0x70: /* MSIRs */ 990 + r = opp->msi[srs].msir; 991 + /* Clear on read */ 992 + opp->msi[srs].msir = 0; 993 + openpic_set_irq(opp, opp->irq_msi + srs, 0); 994 + break; 995 + case 0x120: /* MSISR */ 996 + for (i = 0; i < MAX_MSI; i++) 997 + r |= (opp->msi[i].msir ? 1 : 0) << i; 998 + break; 999 + } 1000 + 1001 + pr_debug("%s: => 0x%08x\n", __func__, r); 1002 + *ptr = r; 1003 + return 0; 1004 + } 1005 + 1006 + static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr) 1007 + { 1008 + uint32_t r = 0; 1009 + 1010 + pr_debug("%s: addr %#llx\n", __func__, addr); 1011 + 1012 + /* TODO: EISR/EIMR */ 1013 + 1014 + *ptr = r; 1015 + return 0; 1016 + } 1017 + 1018 + static int openpic_summary_write(void *opaque, gpa_t addr, u32 val) 1019 + { 1020 + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); 1021 + 1022 + /* TODO: EISR/EIMR */ 1023 + return 0; 1024 + } 1025 + 1026 + static int openpic_cpu_write_internal(void *opaque, gpa_t addr, 1027 + u32 val, int idx) 1028 + { 1029 + struct openpic *opp = opaque; 1030 + struct irq_source *src; 1031 + struct irq_dest *dst; 1032 + int s_IRQ, n_IRQ; 1033 + 1034 + pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx, 1035 + addr, val); 1036 + 1037 + if (idx < 0) 1038 + return 0; 1039 + 1040 + if (addr & 0xF) 1041 + return 0; 1042 + 1043 + dst = &opp->dst[idx]; 1044 + addr &= 0xFF0; 1045 + switch (addr) { 1046 + case 0x40: /* IPIDR */ 1047 + case 0x50: 1048 + case 0x60: 1049 + case 0x70: 1050 + idx = (addr - 0x40) >> 4; 1051 + /* we use IDE as mask which CPUs to deliver the IPI to still. 
*/ 1052 + opp->src[opp->irq_ipi0 + idx].destmask |= val; 1053 + openpic_set_irq(opp, opp->irq_ipi0 + idx, 1); 1054 + openpic_set_irq(opp, opp->irq_ipi0 + idx, 0); 1055 + break; 1056 + case 0x80: /* CTPR */ 1057 + dst->ctpr = val & 0x0000000F; 1058 + 1059 + pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n", 1060 + __func__, idx, dst->ctpr, dst->raised.priority, 1061 + dst->servicing.priority); 1062 + 1063 + if (dst->raised.priority <= dst->ctpr) { 1064 + pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", 1065 + __func__, idx); 1066 + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); 1067 + } else if (dst->raised.priority > dst->servicing.priority) { 1068 + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n", 1069 + __func__, idx, dst->raised.next); 1070 + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); 1071 + } 1072 + 1073 + break; 1074 + case 0x90: /* WHOAMI */ 1075 + /* Read-only register */ 1076 + break; 1077 + case 0xA0: /* IACK */ 1078 + /* Read-only register */ 1079 + break; 1080 + case 0xB0: { /* EOI */ 1081 + int notify_eoi; 1082 + 1083 + pr_debug("EOI\n"); 1084 + s_IRQ = IRQ_get_next(opp, &dst->servicing); 1085 + 1086 + if (s_IRQ < 0) { 1087 + pr_debug("%s: EOI with no interrupt in service\n", 1088 + __func__); 1089 + break; 1090 + } 1091 + 1092 + IRQ_resetbit(&dst->servicing, s_IRQ); 1093 + /* Notify listeners that the IRQ is over */ 1094 + notify_eoi = s_IRQ; 1095 + /* Set up next servicing IRQ */ 1096 + s_IRQ = IRQ_get_next(opp, &dst->servicing); 1097 + /* Check queued interrupts. 
*/ 1098 + n_IRQ = IRQ_get_next(opp, &dst->raised); 1099 + src = &opp->src[n_IRQ]; 1100 + if (n_IRQ != -1 && 1101 + (s_IRQ == -1 || 1102 + IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { 1103 + pr_debug("Raise OpenPIC INT output cpu %d irq %d\n", 1104 + idx, n_IRQ); 1105 + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); 1106 + } 1107 + 1108 + spin_unlock(&opp->lock); 1109 + kvm_notify_acked_irq(opp->kvm, 0, notify_eoi); 1110 + spin_lock(&opp->lock); 1111 + 1112 + break; 1113 + } 1114 + default: 1115 + break; 1116 + } 1117 + 1118 + return 0; 1119 + } 1120 + 1121 + static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val) 1122 + { 1123 + struct openpic *opp = opaque; 1124 + 1125 + return openpic_cpu_write_internal(opp, addr, val, 1126 + (addr & 0x1f000) >> 12); 1127 + } 1128 + 1129 + static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, 1130 + int cpu) 1131 + { 1132 + struct irq_source *src; 1133 + int retval, irq; 1134 + 1135 + pr_debug("Lower OpenPIC INT output\n"); 1136 + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); 1137 + 1138 + irq = IRQ_get_next(opp, &dst->raised); 1139 + pr_debug("IACK: irq=%d\n", irq); 1140 + 1141 + if (irq == -1) 1142 + /* No more interrupt pending */ 1143 + return opp->spve; 1144 + 1145 + src = &opp->src[irq]; 1146 + if (!(src->ivpr & IVPR_ACTIVITY_MASK) || 1147 + !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) { 1148 + pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n", 1149 + __func__, irq, dst->ctpr, src->ivpr); 1150 + openpic_update_irq(opp, irq); 1151 + retval = opp->spve; 1152 + } else { 1153 + /* IRQ enter servicing state */ 1154 + IRQ_setbit(&dst->servicing, irq); 1155 + retval = IVPR_VECTOR(opp, src->ivpr); 1156 + } 1157 + 1158 + if (!src->level) { 1159 + /* edge-sensitive IRQ */ 1160 + src->ivpr &= ~IVPR_ACTIVITY_MASK; 1161 + src->pending = 0; 1162 + IRQ_resetbit(&dst->raised, irq); 1163 + } 1164 + 1165 + if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) { 1166 + src->destmask &= ~(1 << cpu); 1167 + 
if (src->destmask && !src->level) { 1168 + /* trigger on CPUs that didn't know about it yet */ 1169 + openpic_set_irq(opp, irq, 1); 1170 + openpic_set_irq(opp, irq, 0); 1171 + /* if all CPUs knew about it, set active bit again */ 1172 + src->ivpr |= IVPR_ACTIVITY_MASK; 1173 + } 1174 + } 1175 + 1176 + return retval; 1177 + } 1178 + 1179 + void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu) 1180 + { 1181 + struct openpic *opp = vcpu->arch.mpic; 1182 + int cpu = vcpu->arch.irq_cpu_id; 1183 + unsigned long flags; 1184 + 1185 + spin_lock_irqsave(&opp->lock, flags); 1186 + 1187 + if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY) 1188 + kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu)); 1189 + 1190 + spin_unlock_irqrestore(&opp->lock, flags); 1191 + } 1192 + 1193 + static int openpic_cpu_read_internal(void *opaque, gpa_t addr, 1194 + u32 *ptr, int idx) 1195 + { 1196 + struct openpic *opp = opaque; 1197 + struct irq_dest *dst; 1198 + uint32_t retval; 1199 + 1200 + pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr); 1201 + retval = 0xFFFFFFFF; 1202 + 1203 + if (idx < 0) 1204 + goto out; 1205 + 1206 + if (addr & 0xF) 1207 + goto out; 1208 + 1209 + dst = &opp->dst[idx]; 1210 + addr &= 0xFF0; 1211 + switch (addr) { 1212 + case 0x80: /* CTPR */ 1213 + retval = dst->ctpr; 1214 + break; 1215 + case 0x90: /* WHOAMI */ 1216 + retval = idx; 1217 + break; 1218 + case 0xA0: /* IACK */ 1219 + retval = openpic_iack(opp, dst, idx); 1220 + break; 1221 + case 0xB0: /* EOI */ 1222 + retval = 0; 1223 + break; 1224 + default: 1225 + break; 1226 + } 1227 + pr_debug("%s: => 0x%08x\n", __func__, retval); 1228 + 1229 + out: 1230 + *ptr = retval; 1231 + return 0; 1232 + } 1233 + 1234 + static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr) 1235 + { 1236 + struct openpic *opp = opaque; 1237 + 1238 + return openpic_cpu_read_internal(opp, addr, ptr, 1239 + (addr & 0x1f000) >> 12); 1240 + } 1241 + 1242 + struct mem_reg { 1243 + int (*read)(void *opaque, gpa_t addr, u32 
*ptr); 1244 + int (*write)(void *opaque, gpa_t addr, u32 val); 1245 + gpa_t start_addr; 1246 + int size; 1247 + }; 1248 + 1249 + static const struct mem_reg openpic_gbl_mmio = { 1250 + .write = openpic_gbl_write, 1251 + .read = openpic_gbl_read, 1252 + .start_addr = OPENPIC_GLB_REG_START, 1253 + .size = OPENPIC_GLB_REG_SIZE, 1254 + }; 1255 + 1256 + static const struct mem_reg openpic_tmr_mmio = { 1257 + .write = openpic_tmr_write, 1258 + .read = openpic_tmr_read, 1259 + .start_addr = OPENPIC_TMR_REG_START, 1260 + .size = OPENPIC_TMR_REG_SIZE, 1261 + }; 1262 + 1263 + static const struct mem_reg openpic_cpu_mmio = { 1264 + .write = openpic_cpu_write, 1265 + .read = openpic_cpu_read, 1266 + .start_addr = OPENPIC_CPU_REG_START, 1267 + .size = OPENPIC_CPU_REG_SIZE, 1268 + }; 1269 + 1270 + static const struct mem_reg openpic_src_mmio = { 1271 + .write = openpic_src_write, 1272 + .read = openpic_src_read, 1273 + .start_addr = OPENPIC_SRC_REG_START, 1274 + .size = OPENPIC_SRC_REG_SIZE, 1275 + }; 1276 + 1277 + static const struct mem_reg openpic_msi_mmio = { 1278 + .read = openpic_msi_read, 1279 + .write = openpic_msi_write, 1280 + .start_addr = OPENPIC_MSI_REG_START, 1281 + .size = OPENPIC_MSI_REG_SIZE, 1282 + }; 1283 + 1284 + static const struct mem_reg openpic_summary_mmio = { 1285 + .read = openpic_summary_read, 1286 + .write = openpic_summary_write, 1287 + .start_addr = OPENPIC_SUMMARY_REG_START, 1288 + .size = OPENPIC_SUMMARY_REG_SIZE, 1289 + }; 1290 + 1291 + static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr) 1292 + { 1293 + if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) { 1294 + WARN(1, "kvm mpic: too many mmio regions\n"); 1295 + return; 1296 + } 1297 + 1298 + opp->mmio_regions[opp->num_mmio_regions++] = mr; 1299 + } 1300 + 1301 + static void fsl_common_init(struct openpic *opp) 1302 + { 1303 + int i; 1304 + int virq = MAX_SRC; 1305 + 1306 + add_mmio_region(opp, &openpic_msi_mmio); 1307 + add_mmio_region(opp, &openpic_summary_mmio); 1308 + 
1309 + opp->vid = VID_REVISION_1_2; 1310 + opp->vir = VIR_GENERIC; 1311 + opp->vector_mask = 0xFFFF; 1312 + opp->tfrr_reset = 0; 1313 + opp->ivpr_reset = IVPR_MASK_MASK; 1314 + opp->idr_reset = 1 << 0; 1315 + opp->max_irq = MAX_IRQ; 1316 + 1317 + opp->irq_ipi0 = virq; 1318 + virq += MAX_IPI; 1319 + opp->irq_tim0 = virq; 1320 + virq += MAX_TMR; 1321 + 1322 + BUG_ON(virq > MAX_IRQ); 1323 + 1324 + opp->irq_msi = 224; 1325 + 1326 + for (i = 0; i < opp->fsl->max_ext; i++) 1327 + opp->src[i].level = false; 1328 + 1329 + /* Internal interrupts, including message and MSI */ 1330 + for (i = 16; i < MAX_SRC; i++) { 1331 + opp->src[i].type = IRQ_TYPE_FSLINT; 1332 + opp->src[i].level = true; 1333 + } 1334 + 1335 + /* timers and IPIs */ 1336 + for (i = MAX_SRC; i < virq; i++) { 1337 + opp->src[i].type = IRQ_TYPE_FSLSPECIAL; 1338 + opp->src[i].level = false; 1339 + } 1340 + } 1341 + 1342 + static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr) 1343 + { 1344 + int i; 1345 + 1346 + for (i = 0; i < opp->num_mmio_regions; i++) { 1347 + const struct mem_reg *mr = opp->mmio_regions[i]; 1348 + 1349 + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) 1350 + continue; 1351 + 1352 + return mr->read(opp, addr - mr->start_addr, ptr); 1353 + } 1354 + 1355 + return -ENXIO; 1356 + } 1357 + 1358 + static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val) 1359 + { 1360 + int i; 1361 + 1362 + for (i = 0; i < opp->num_mmio_regions; i++) { 1363 + const struct mem_reg *mr = opp->mmio_regions[i]; 1364 + 1365 + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) 1366 + continue; 1367 + 1368 + return mr->write(opp, addr - mr->start_addr, val); 1369 + } 1370 + 1371 + return -ENXIO; 1372 + } 1373 + 1374 + static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr, 1375 + int len, void *ptr) 1376 + { 1377 + struct openpic *opp = container_of(this, struct openpic, mmio); 1378 + int ret; 1379 + union { 1380 + u32 val; 1381 + u8 
bytes[4]; 1382 + } u; 1383 + 1384 + if (addr & (len - 1)) { 1385 + pr_debug("%s: bad alignment %llx/%d\n", 1386 + __func__, addr, len); 1387 + return -EINVAL; 1388 + } 1389 + 1390 + spin_lock_irq(&opp->lock); 1391 + ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val); 1392 + spin_unlock_irq(&opp->lock); 1393 + 1394 + /* 1395 + * Technically only 32-bit accesses are allowed, but be nice to 1396 + * people dumping registers a byte at a time -- it works in real 1397 + * hardware (reads only, not writes). 1398 + */ 1399 + if (len == 4) { 1400 + *(u32 *)ptr = u.val; 1401 + pr_debug("%s: addr %llx ret %d len 4 val %x\n", 1402 + __func__, addr, ret, u.val); 1403 + } else if (len == 1) { 1404 + *(u8 *)ptr = u.bytes[addr & 3]; 1405 + pr_debug("%s: addr %llx ret %d len 1 val %x\n", 1406 + __func__, addr, ret, u.bytes[addr & 3]); 1407 + } else { 1408 + pr_debug("%s: bad length %d\n", __func__, len); 1409 + return -EINVAL; 1410 + } 1411 + 1412 + return ret; 1413 + } 1414 + 1415 + static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr, 1416 + int len, const void *ptr) 1417 + { 1418 + struct openpic *opp = container_of(this, struct openpic, mmio); 1419 + int ret; 1420 + 1421 + if (len != 4) { 1422 + pr_debug("%s: bad length %d\n", __func__, len); 1423 + return -EOPNOTSUPP; 1424 + } 1425 + if (addr & 3) { 1426 + pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len); 1427 + return -EOPNOTSUPP; 1428 + } 1429 + 1430 + spin_lock_irq(&opp->lock); 1431 + ret = kvm_mpic_write_internal(opp, addr - opp->reg_base, 1432 + *(const u32 *)ptr); 1433 + spin_unlock_irq(&opp->lock); 1434 + 1435 + pr_debug("%s: addr %llx ret %d val %x\n", 1436 + __func__, addr, ret, *(const u32 *)ptr); 1437 + 1438 + return ret; 1439 + } 1440 + 1441 + static const struct kvm_io_device_ops mpic_mmio_ops = { 1442 + .read = kvm_mpic_read, 1443 + .write = kvm_mpic_write, 1444 + }; 1445 + 1446 + static void map_mmio(struct openpic *opp) 1447 + { 1448 + kvm_iodevice_init(&opp->mmio, 
&mpic_mmio_ops); 1449 + 1450 + kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS, 1451 + opp->reg_base, OPENPIC_REG_SIZE, 1452 + &opp->mmio); 1453 + } 1454 + 1455 + static void unmap_mmio(struct openpic *opp) 1456 + { 1457 + kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); 1458 + } 1459 + 1460 + static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) 1461 + { 1462 + u64 base; 1463 + 1464 + if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64))) 1465 + return -EFAULT; 1466 + 1467 + if (base & 0x3ffff) { 1468 + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n", 1469 + __func__, base); 1470 + return -EINVAL; 1471 + } 1472 + 1473 + if (base == opp->reg_base) 1474 + return 0; 1475 + 1476 + mutex_lock(&opp->kvm->slots_lock); 1477 + 1478 + unmap_mmio(opp); 1479 + opp->reg_base = base; 1480 + 1481 + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n", 1482 + __func__, base); 1483 + 1484 + if (base == 0) 1485 + goto out; 1486 + 1487 + map_mmio(opp); 1488 + 1489 + out: 1490 + mutex_unlock(&opp->kvm->slots_lock); 1491 + return 0; 1492 + } 1493 + 1494 + #define ATTR_SET 0 1495 + #define ATTR_GET 1 1496 + 1497 + static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type) 1498 + { 1499 + int ret; 1500 + 1501 + if (addr & 3) 1502 + return -ENXIO; 1503 + 1504 + spin_lock_irq(&opp->lock); 1505 + 1506 + if (type == ATTR_SET) 1507 + ret = kvm_mpic_write_internal(opp, addr, *val); 1508 + else 1509 + ret = kvm_mpic_read_internal(opp, addr, val); 1510 + 1511 + spin_unlock_irq(&opp->lock); 1512 + 1513 + pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val); 1514 + 1515 + return ret; 1516 + } 1517 + 1518 + static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1519 + { 1520 + struct openpic *opp = dev->private; 1521 + u32 attr32; 1522 + 1523 + switch (attr->group) { 1524 + case KVM_DEV_MPIC_GRP_MISC: 1525 + switch (attr->attr) { 1526 + case KVM_DEV_MPIC_BASE_ADDR: 
1527 + return set_base_addr(opp, attr); 1528 + } 1529 + 1530 + break; 1531 + 1532 + case KVM_DEV_MPIC_GRP_REGISTER: 1533 + if (get_user(attr32, (u32 __user *)(long)attr->addr)) 1534 + return -EFAULT; 1535 + 1536 + return access_reg(opp, attr->attr, &attr32, ATTR_SET); 1537 + 1538 + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: 1539 + if (attr->attr > MAX_SRC) 1540 + return -EINVAL; 1541 + 1542 + if (get_user(attr32, (u32 __user *)(long)attr->addr)) 1543 + return -EFAULT; 1544 + 1545 + if (attr32 != 0 && attr32 != 1) 1546 + return -EINVAL; 1547 + 1548 + spin_lock_irq(&opp->lock); 1549 + openpic_set_irq(opp, attr->attr, attr32); 1550 + spin_unlock_irq(&opp->lock); 1551 + return 0; 1552 + } 1553 + 1554 + return -ENXIO; 1555 + } 1556 + 1557 + static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1558 + { 1559 + struct openpic *opp = dev->private; 1560 + u64 attr64; 1561 + u32 attr32; 1562 + int ret; 1563 + 1564 + switch (attr->group) { 1565 + case KVM_DEV_MPIC_GRP_MISC: 1566 + switch (attr->attr) { 1567 + case KVM_DEV_MPIC_BASE_ADDR: 1568 + mutex_lock(&opp->kvm->slots_lock); 1569 + attr64 = opp->reg_base; 1570 + mutex_unlock(&opp->kvm->slots_lock); 1571 + 1572 + if (copy_to_user((u64 __user *)(long)attr->addr, 1573 + &attr64, sizeof(u64))) 1574 + return -EFAULT; 1575 + 1576 + return 0; 1577 + } 1578 + 1579 + break; 1580 + 1581 + case KVM_DEV_MPIC_GRP_REGISTER: 1582 + ret = access_reg(opp, attr->attr, &attr32, ATTR_GET); 1583 + if (ret) 1584 + return ret; 1585 + 1586 + if (put_user(attr32, (u32 __user *)(long)attr->addr)) 1587 + return -EFAULT; 1588 + 1589 + return 0; 1590 + 1591 + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: 1592 + if (attr->attr > MAX_SRC) 1593 + return -EINVAL; 1594 + 1595 + spin_lock_irq(&opp->lock); 1596 + attr32 = opp->src[attr->attr].pending; 1597 + spin_unlock_irq(&opp->lock); 1598 + 1599 + if (put_user(attr32, (u32 __user *)(long)attr->addr)) 1600 + return -EFAULT; 1601 + 1602 + return 0; 1603 + } 1604 + 1605 + return -ENXIO; 1606 + } 1607 + 
1608 + static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1609 + { 1610 + switch (attr->group) { 1611 + case KVM_DEV_MPIC_GRP_MISC: 1612 + switch (attr->attr) { 1613 + case KVM_DEV_MPIC_BASE_ADDR: 1614 + return 0; 1615 + } 1616 + 1617 + break; 1618 + 1619 + case KVM_DEV_MPIC_GRP_REGISTER: 1620 + return 0; 1621 + 1622 + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: 1623 + if (attr->attr > MAX_SRC) 1624 + break; 1625 + 1626 + return 0; 1627 + } 1628 + 1629 + return -ENXIO; 1630 + } 1631 + 1632 + static void mpic_destroy(struct kvm_device *dev) 1633 + { 1634 + struct openpic *opp = dev->private; 1635 + 1636 + dev->kvm->arch.mpic = NULL; 1637 + kfree(opp); 1638 + } 1639 + 1640 + static int mpic_set_default_irq_routing(struct openpic *opp) 1641 + { 1642 + struct kvm_irq_routing_entry *routing; 1643 + 1644 + /* Create a nop default map, so that dereferencing it still works */ 1645 + routing = kzalloc((sizeof(*routing)), GFP_KERNEL); 1646 + if (!routing) 1647 + return -ENOMEM; 1648 + 1649 + kvm_set_irq_routing(opp->kvm, routing, 0, 0); 1650 + 1651 + kfree(routing); 1652 + return 0; 1653 + } 1654 + 1655 + static int mpic_create(struct kvm_device *dev, u32 type) 1656 + { 1657 + struct openpic *opp; 1658 + int ret; 1659 + 1660 + /* We only support one MPIC at a time for now */ 1661 + if (dev->kvm->arch.mpic) 1662 + return -EINVAL; 1663 + 1664 + opp = kzalloc(sizeof(struct openpic), GFP_KERNEL); 1665 + if (!opp) 1666 + return -ENOMEM; 1667 + 1668 + dev->private = opp; 1669 + opp->kvm = dev->kvm; 1670 + opp->dev = dev; 1671 + opp->model = type; 1672 + spin_lock_init(&opp->lock); 1673 + 1674 + add_mmio_region(opp, &openpic_gbl_mmio); 1675 + add_mmio_region(opp, &openpic_tmr_mmio); 1676 + add_mmio_region(opp, &openpic_src_mmio); 1677 + add_mmio_region(opp, &openpic_cpu_mmio); 1678 + 1679 + switch (opp->model) { 1680 + case KVM_DEV_TYPE_FSL_MPIC_20: 1681 + opp->fsl = &fsl_mpic_20; 1682 + opp->brr1 = 0x00400200; 1683 + opp->flags |= OPENPIC_FLAG_IDR_CRIT; 1684 + 
opp->nb_irqs = 80; 1685 + opp->mpic_mode_mask = GCR_MODE_MIXED; 1686 + 1687 + fsl_common_init(opp); 1688 + 1689 + break; 1690 + 1691 + case KVM_DEV_TYPE_FSL_MPIC_42: 1692 + opp->fsl = &fsl_mpic_42; 1693 + opp->brr1 = 0x00400402; 1694 + opp->flags |= OPENPIC_FLAG_ILR; 1695 + opp->nb_irqs = 196; 1696 + opp->mpic_mode_mask = GCR_MODE_PROXY; 1697 + 1698 + fsl_common_init(opp); 1699 + 1700 + break; 1701 + 1702 + default: 1703 + ret = -ENODEV; 1704 + goto err; 1705 + } 1706 + 1707 + ret = mpic_set_default_irq_routing(opp); 1708 + if (ret) 1709 + goto err; 1710 + 1711 + openpic_reset(opp); 1712 + 1713 + smp_wmb(); 1714 + dev->kvm->arch.mpic = opp; 1715 + 1716 + return 0; 1717 + 1718 + err: 1719 + kfree(opp); 1720 + return ret; 1721 + } 1722 + 1723 + struct kvm_device_ops kvm_mpic_ops = { 1724 + .name = "kvm-mpic", 1725 + .create = mpic_create, 1726 + .destroy = mpic_destroy, 1727 + .set_attr = mpic_set_attr, 1728 + .get_attr = mpic_get_attr, 1729 + .has_attr = mpic_has_attr, 1730 + }; 1731 + 1732 + int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, 1733 + u32 cpu) 1734 + { 1735 + struct openpic *opp = dev->private; 1736 + int ret = 0; 1737 + 1738 + if (dev->ops != &kvm_mpic_ops) 1739 + return -EPERM; 1740 + if (opp->kvm != vcpu->kvm) 1741 + return -EPERM; 1742 + if (cpu < 0 || cpu >= MAX_CPU) 1743 + return -EPERM; 1744 + 1745 + spin_lock_irq(&opp->lock); 1746 + 1747 + if (opp->dst[cpu].vcpu) { 1748 + ret = -EEXIST; 1749 + goto out; 1750 + } 1751 + if (vcpu->arch.irq_type) { 1752 + ret = -EBUSY; 1753 + goto out; 1754 + } 1755 + 1756 + opp->dst[cpu].vcpu = vcpu; 1757 + opp->nb_cpus = max(opp->nb_cpus, cpu + 1); 1758 + 1759 + vcpu->arch.mpic = opp; 1760 + vcpu->arch.irq_cpu_id = cpu; 1761 + vcpu->arch.irq_type = KVMPPC_IRQ_MPIC; 1762 + 1763 + /* This might need to be changed if GCR gets extended */ 1764 + if (opp->mpic_mode_mask == GCR_MODE_PROXY) 1765 + vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL; 1766 + 1767 + out: 1768 + spin_unlock_irq(&opp->lock); 
1769 + return ret; 1770 + } 1771 + 1772 + /* 1773 + * This should only happen immediately before the mpic is destroyed, 1774 + * so we shouldn't need to worry about anything still trying to 1775 + * access the vcpu pointer. 1776 + */ 1777 + void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) 1778 + { 1779 + BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu); 1780 + 1781 + opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; 1782 + } 1783 + 1784 + /* 1785 + * Return value: 1786 + * < 0 Interrupt was ignored (masked or not delivered for other reasons) 1787 + * = 0 Interrupt was coalesced (previous irq is still pending) 1788 + * > 0 Number of CPUs interrupt was delivered to 1789 + */ 1790 + static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e, 1791 + struct kvm *kvm, int irq_source_id, int level, 1792 + bool line_status) 1793 + { 1794 + u32 irq = e->irqchip.pin; 1795 + struct openpic *opp = kvm->arch.mpic; 1796 + unsigned long flags; 1797 + 1798 + spin_lock_irqsave(&opp->lock, flags); 1799 + openpic_set_irq(opp, irq, level); 1800 + spin_unlock_irqrestore(&opp->lock, flags); 1801 + 1802 + /* All code paths we care about don't check for the return value */ 1803 + return 0; 1804 + } 1805 + 1806 + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 1807 + struct kvm *kvm, int irq_source_id, int level, bool line_status) 1808 + { 1809 + struct openpic *opp = kvm->arch.mpic; 1810 + unsigned long flags; 1811 + 1812 + spin_lock_irqsave(&opp->lock, flags); 1813 + 1814 + /* 1815 + * XXX We ignore the target address for now, as we only support 1816 + * a single MSI bank. 
1817 + */ 1818 + openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data); 1819 + spin_unlock_irqrestore(&opp->lock, flags); 1820 + 1821 + /* All code paths we care about don't check for the return value */ 1822 + return 0; 1823 + } 1824 + 1825 + int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, 1826 + struct kvm_kernel_irq_routing_entry *e, 1827 + const struct kvm_irq_routing_entry *ue) 1828 + { 1829 + int r = -EINVAL; 1830 + 1831 + switch (ue->type) { 1832 + case KVM_IRQ_ROUTING_IRQCHIP: 1833 + e->set = mpic_set_irq; 1834 + e->irqchip.irqchip = ue->u.irqchip.irqchip; 1835 + e->irqchip.pin = ue->u.irqchip.pin; 1836 + if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) 1837 + goto out; 1838 + rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; 1839 + break; 1840 + case KVM_IRQ_ROUTING_MSI: 1841 + e->set = kvm_set_msi; 1842 + e->msi.address_lo = ue->u.msi.address_lo; 1843 + e->msi.address_hi = ue->u.msi.address_hi; 1844 + e->msi.data = ue->u.msi.data; 1845 + break; 1846 + default: 1847 + goto out; 1848 + } 1849 + 1850 + r = 0; 1851 + out: 1852 + return r; 1853 + }
+108 -25
arch/powerpc/kvm/powerpc.c
··· 25 25 #include <linux/hrtimer.h> 26 26 #include <linux/fs.h> 27 27 #include <linux/slab.h> 28 + #include <linux/file.h> 28 29 #include <asm/cputable.h> 29 30 #include <asm/uaccess.h> 30 31 #include <asm/kvm_ppc.h> ··· 33 32 #include <asm/cputhreads.h> 34 33 #include <asm/irqflags.h> 35 34 #include "timing.h" 35 + #include "irq.h" 36 36 #include "../mm/mmu_decl.h" 37 37 38 38 #define CREATE_TRACE_POINTS ··· 319 317 case KVM_CAP_ENABLE_CAP: 320 318 case KVM_CAP_ONE_REG: 321 319 case KVM_CAP_IOEVENTFD: 320 + case KVM_CAP_DEVICE_CTRL: 322 321 r = 1; 323 322 break; 324 323 #ifndef CONFIG_KVM_BOOK3S_64_HV ··· 328 325 case KVM_CAP_PPC_GET_PVINFO: 329 326 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) 330 327 case KVM_CAP_SW_TLB: 328 + #endif 329 + #ifdef CONFIG_KVM_MPIC 330 + case KVM_CAP_IRQ_MPIC: 331 331 #endif 332 332 r = 1; 333 333 break; ··· 341 335 #ifdef CONFIG_PPC_BOOK3S_64 342 336 case KVM_CAP_SPAPR_TCE: 343 337 case KVM_CAP_PPC_ALLOC_HTAB: 338 + case KVM_CAP_PPC_RTAS: 339 + #ifdef CONFIG_KVM_XICS 340 + case KVM_CAP_IRQ_XICS: 341 + #endif 344 342 r = 1; 345 343 break; 346 344 #endif /* CONFIG_PPC_BOOK3S_64 */ ··· 421 411 } 422 412 423 413 int kvm_arch_prepare_memory_region(struct kvm *kvm, 424 - struct kvm_memory_slot *memslot, 425 - struct kvm_memory_slot old, 426 - struct kvm_userspace_memory_region *mem, 427 - bool user_alloc) 414 + struct kvm_memory_slot *memslot, 415 + struct kvm_userspace_memory_region *mem, 416 + enum kvm_mr_change change) 428 417 { 429 418 return kvmppc_core_prepare_memory_region(kvm, memslot, mem); 430 419 } 431 420 432 421 void kvm_arch_commit_memory_region(struct kvm *kvm, 433 - struct kvm_userspace_memory_region *mem, 434 - struct kvm_memory_slot old, 435 - bool user_alloc) 422 + struct kvm_userspace_memory_region *mem, 423 + const struct kvm_memory_slot *old, 424 + enum kvm_mr_change change) 436 425 { 437 426 kvmppc_core_commit_memory_region(kvm, mem, old); 438 427 } ··· 469 460 tasklet_kill(&vcpu->arch.tasklet); 
470 461 471 462 kvmppc_remove_vcpu_debugfs(vcpu); 463 + 464 + switch (vcpu->arch.irq_type) { 465 + case KVMPPC_IRQ_MPIC: 466 + kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); 467 + break; 468 + case KVMPPC_IRQ_XICS: 469 + kvmppc_xics_free_icp(vcpu); 470 + break; 471 + } 472 + 472 473 kvmppc_core_vcpu_free(vcpu); 473 474 } 474 475 ··· 549 530 #ifdef CONFIG_BOOKE 550 531 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); 551 532 #endif 552 - } 553 - 554 - int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 555 - struct kvm_guest_debug *dbg) 556 - { 557 - return -EINVAL; 558 533 } 559 534 560 535 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, ··· 625 612 int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, 626 613 unsigned int rt, unsigned int bytes, int is_bigendian) 627 614 { 615 + int idx, ret; 616 + 628 617 if (bytes > sizeof(run->mmio.data)) { 629 618 printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, 630 619 run->mmio.len); ··· 642 627 vcpu->mmio_is_write = 0; 643 628 vcpu->arch.mmio_sign_extend = 0; 644 629 645 - if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, 646 - bytes, &run->mmio.data)) { 630 + idx = srcu_read_lock(&vcpu->kvm->srcu); 631 + 632 + ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, 633 + bytes, &run->mmio.data); 634 + 635 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 636 + 637 + if (!ret) { 647 638 kvmppc_complete_mmio_load(vcpu, run); 648 639 vcpu->mmio_needed = 0; 649 640 return EMULATE_DONE; ··· 674 653 u64 val, unsigned int bytes, int is_bigendian) 675 654 { 676 655 void *data = run->mmio.data; 656 + int idx, ret; 677 657 678 658 if (bytes > sizeof(run->mmio.data)) { 679 659 printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, ··· 704 682 } 705 683 } 706 684 707 - if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, 708 - bytes, &run->mmio.data)) { 709 - kvmppc_complete_mmio_load(vcpu, run); 685 + idx = srcu_read_lock(&vcpu->kvm->srcu); 686 + 687 + ret = 
kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, 688 + bytes, &run->mmio.data); 689 + 690 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 691 + 692 + if (!ret) { 710 693 vcpu->mmio_needed = 0; 711 694 return EMULATE_DONE; 712 695 } ··· 767 740 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) 768 741 { 769 742 if (irq->irq == KVM_INTERRUPT_UNSET) { 770 - kvmppc_core_dequeue_external(vcpu, irq); 743 + kvmppc_core_dequeue_external(vcpu); 771 744 return 0; 772 745 } 773 746 ··· 797 770 break; 798 771 case KVM_CAP_PPC_EPR: 799 772 r = 0; 800 - vcpu->arch.epr_enabled = cap->args[0]; 773 + if (cap->args[0]) 774 + vcpu->arch.epr_flags |= KVMPPC_EPR_USER; 775 + else 776 + vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER; 801 777 break; 802 778 #ifdef CONFIG_BOOKE 803 779 case KVM_CAP_PPC_BOOKE_WATCHDOG: ··· 821 791 break; 822 792 } 823 793 #endif 794 + #ifdef CONFIG_KVM_MPIC 795 + case KVM_CAP_IRQ_MPIC: { 796 + struct file *filp; 797 + struct kvm_device *dev; 798 + 799 + r = -EBADF; 800 + filp = fget(cap->args[0]); 801 + if (!filp) 802 + break; 803 + 804 + r = -EPERM; 805 + dev = kvm_device_from_filp(filp); 806 + if (dev) 807 + r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]); 808 + 809 + fput(filp); 810 + break; 811 + } 812 + #endif 813 + #ifdef CONFIG_KVM_XICS 814 + case KVM_CAP_IRQ_XICS: { 815 + struct file *filp; 816 + struct kvm_device *dev; 817 + 818 + r = -EBADF; 819 + filp = fget(cap->args[0]); 820 + if (!filp) 821 + break; 822 + 823 + r = -EPERM; 824 + dev = kvm_device_from_filp(filp); 825 + if (dev) 826 + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); 827 + 828 + fput(filp); 829 + break; 830 + } 831 + #endif /* CONFIG_KVM_XICS */ 824 832 default: 825 833 r = -EINVAL; 826 834 break; ··· 981 913 return 0; 982 914 } 983 915 916 + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, 917 + bool line_status) 918 + { 919 + if (!irqchip_in_kernel(kvm)) 920 + return -ENXIO; 921 + 922 + irq_event->status = 
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 923 + irq_event->irq, irq_event->level, 924 + line_status); 925 + return 0; 926 + } 927 + 984 928 long kvm_arch_vm_ioctl(struct file *filp, 985 929 unsigned int ioctl, unsigned long arg) 986 930 { 931 + struct kvm *kvm __maybe_unused = filp->private_data; 987 932 void __user *argp = (void __user *)arg; 988 933 long r; 989 934 ··· 1015 934 #ifdef CONFIG_PPC_BOOK3S_64 1016 935 case KVM_CREATE_SPAPR_TCE: { 1017 936 struct kvm_create_spapr_tce create_tce; 1018 - struct kvm *kvm = filp->private_data; 1019 937 1020 938 r = -EFAULT; 1021 939 if (copy_from_user(&create_tce, argp, sizeof(create_tce))) ··· 1026 946 1027 947 #ifdef CONFIG_KVM_BOOK3S_64_HV 1028 948 case KVM_ALLOCATE_RMA: { 1029 - struct kvm *kvm = filp->private_data; 1030 949 struct kvm_allocate_rma rma; 950 + struct kvm *kvm = filp->private_data; 1031 951 1032 952 r = kvm_vm_ioctl_allocate_rma(kvm, &rma); 1033 953 if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) ··· 1036 956 } 1037 957 1038 958 case KVM_PPC_ALLOCATE_HTAB: { 1039 - struct kvm *kvm = filp->private_data; 1040 959 u32 htab_order; 1041 960 1042 961 r = -EFAULT; ··· 1052 973 } 1053 974 1054 975 case KVM_PPC_GET_HTAB_FD: { 1055 - struct kvm *kvm = filp->private_data; 1056 976 struct kvm_get_htab_fd ghf; 1057 977 1058 978 r = -EFAULT; ··· 1064 986 1065 987 #ifdef CONFIG_PPC_BOOK3S_64 1066 988 case KVM_PPC_GET_SMMU_INFO: { 1067 - struct kvm *kvm = filp->private_data; 1068 989 struct kvm_ppc_smmu_info info; 1069 990 1070 991 memset(&info, 0, sizeof(info)); 1071 992 r = kvm_vm_ioctl_get_smmu_info(kvm, &info); 1072 993 if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) 1073 994 r = -EFAULT; 995 + break; 996 + } 997 + case KVM_PPC_RTAS_DEFINE_TOKEN: { 998 + struct kvm *kvm = filp->private_data; 999 + 1000 + r = kvm_vm_ioctl_rtas_define_token(kvm, argp); 1074 1001 break; 1075 1002 } 1076 1003 #endif /* CONFIG_PPC_BOOK3S_64 */
+8
arch/powerpc/sysdev/xics/icp-native.c
··· 51 51 static inline unsigned int icp_native_get_xirr(void) 52 52 { 53 53 int cpu = smp_processor_id(); 54 + unsigned int xirr; 55 + 56 + /* Return an interrupt latched by KVM, if any */ 57 + xirr = kvmppc_get_xics_latch(); 58 + if (xirr) 59 + return xirr; 54 60 55 61 return in_be32(&icp_native_regs[cpu]->xirr.word); 56 62 } ··· 144 138 145 139 static void icp_native_cause_ipi(int cpu, unsigned long data) 146 140 { 141 + kvmppc_set_host_ipi(cpu, 1); 147 142 icp_native_set_qirr(cpu, IPI_PRIORITY); 148 143 } ··· 158 151 { 159 152 int cpu = smp_processor_id(); 160 153 154 + kvmppc_set_host_ipi(cpu, 0); 161 155 icp_native_set_qirr(cpu, 0xff); 162 156 163 157 return smp_ipi_demux();
+1
arch/s390/include/uapi/asm/Kbuild
··· 44 44 header-y += types.h 45 45 header-y += ucontext.h 46 46 header-y += unistd.h 47 + header-y += virtio-ccw.h 47 48 header-y += vtoc.h 48 49 header-y += zcrypt.h
+21
arch/s390/include/uapi/asm/virtio-ccw.h
··· 1 + /* 2 + * Definitions for virtio-ccw devices. 3 + * 4 + * Copyright IBM Corp. 2013 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License (version 2 only) 8 + * as published by the Free Software Foundation. 9 + * 10 + * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> 11 + */ 12 + #ifndef __KVM_VIRTIO_CCW_H 13 + #define __KVM_VIRTIO_CCW_H 14 + 15 + /* Alignment of vring buffers. */ 16 + #define KVM_VIRTIO_CCW_RING_ALIGN 4096 17 + 18 + /* Subcode for diagnose 500 (virtio hypercall). */ 19 + #define KVM_S390_VIRTIO_CCW_NOTIFY 3 20 + 21 + #endif
+1
arch/s390/kvm/Kconfig
··· 22 22 select PREEMPT_NOTIFIERS 23 23 select ANON_INODES 24 24 select HAVE_KVM_CPU_RELAX_INTERCEPT 25 + select HAVE_KVM_EVENTFD 25 26 ---help--- 26 27 Support hosting paravirtualized guest machines using the SIE 27 28 virtualization capability on the mainframe. This should work
+1 -1
arch/s390/kvm/Makefile
··· 6 6 # it under the terms of the GNU General Public License (version 2 only) 7 7 # as published by the Free Software Foundation. 8 8 9 - common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o) 9 + common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o) 10 10 11 11 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm 12 12
+26
arch/s390/kvm/diag.c
··· 13 13 14 14 #include <linux/kvm.h> 15 15 #include <linux/kvm_host.h> 16 + #include <asm/virtio-ccw.h> 16 17 #include "kvm-s390.h" 17 18 #include "trace.h" 18 19 #include "trace-s390.h" ··· 105 104 return -EREMOTE; 106 105 } 107 106 107 + static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) 108 + { 109 + int ret, idx; 110 + 111 + /* No virtio-ccw notification? Get out quickly. */ 112 + if (!vcpu->kvm->arch.css_support || 113 + (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) 114 + return -EOPNOTSUPP; 115 + 116 + idx = srcu_read_lock(&vcpu->kvm->srcu); 117 + /* 118 + * The layout is as follows: 119 + * - gpr 2 contains the subchannel id (passed as addr) 120 + * - gpr 3 contains the virtqueue index (passed as datamatch) 121 + */ 122 + ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS, 123 + vcpu->run->s.regs.gprs[2], 124 + 8, &vcpu->run->s.regs.gprs[3]); 125 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 126 + /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */ 127 + return ret < 0 ? ret : 0; 128 + } 129 + 108 130 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) 109 131 { 110 132 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; ··· 142 118 return __diag_time_slice_end_directed(vcpu); 143 119 case 0x308: 144 120 return __diag_ipl_functions(vcpu); 121 + case 0x500: 122 + return __diag_virtio_hypercall(vcpu); 145 123 default: 146 124 return -EOPNOTSUPP; 147 125 }
+71 -354
arch/s390/kvm/gaccess.h
··· 18 18 #include <asm/uaccess.h> 19 19 #include "kvm-s390.h" 20 20 21 - static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, 22 - unsigned long guestaddr) 21 + static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu, 22 + void __user *gptr, 23 + int prefixing) 23 24 { 24 25 unsigned long prefix = vcpu->arch.sie_block->prefix; 26 + unsigned long gaddr = (unsigned long) gptr; 27 + unsigned long uaddr; 25 28 26 - if (guestaddr < 2 * PAGE_SIZE) 27 - guestaddr += prefix; 28 - else if ((guestaddr >= prefix) && (guestaddr < prefix + 2 * PAGE_SIZE)) 29 - guestaddr -= prefix; 30 - 31 - return (void __user *) gmap_fault(guestaddr, vcpu->arch.gmap); 29 + if (prefixing) { 30 + if (gaddr < 2 * PAGE_SIZE) 31 + gaddr += prefix; 32 + else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE)) 33 + gaddr -= prefix; 34 + } 35 + uaddr = gmap_fault(gaddr, vcpu->arch.gmap); 36 + if (IS_ERR_VALUE(uaddr)) 37 + uaddr = -EFAULT; 38 + return (void __user *)uaddr; 32 39 } 33 40 34 - static inline int get_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr, 35 - u64 *result) 41 + #define get_guest(vcpu, x, gptr) \ 42 + ({ \ 43 + __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ 44 + int __mask = sizeof(__typeof__(*(gptr))) - 1; \ 45 + int __ret = PTR_RET((void __force *)__uptr); \ 46 + \ 47 + if (!__ret) { \ 48 + BUG_ON((unsigned long)__uptr & __mask); \ 49 + __ret = get_user(x, __uptr); \ 50 + } \ 51 + __ret; \ 52 + }) 53 + 54 + #define put_guest(vcpu, x, gptr) \ 55 + ({ \ 56 + __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ 57 + int __mask = sizeof(__typeof__(*(gptr))) - 1; \ 58 + int __ret = PTR_RET((void __force *)__uptr); \ 59 + \ 60 + if (!__ret) { \ 61 + BUG_ON((unsigned long)__uptr & __mask); \ 62 + __ret = put_user(x, __uptr); \ 63 + } \ 64 + __ret; \ 65 + }) 66 + 67 + static inline int __copy_guest(struct kvm_vcpu *vcpu, unsigned long to, 68 + unsigned long from, unsigned long len, 69 + int to_guest, int prefixing) 36 70 { 37 - 
void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 71 + unsigned long _len, rc; 72 + void __user *uptr; 38 73 39 - BUG_ON(guestaddr & 7); 40 - 41 - if (IS_ERR((void __force *) uptr)) 42 - return PTR_ERR((void __force *) uptr); 43 - 44 - return get_user(*result, (unsigned long __user *) uptr); 45 - } 46 - 47 - static inline int get_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr, 48 - u32 *result) 49 - { 50 - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 51 - 52 - BUG_ON(guestaddr & 3); 53 - 54 - if (IS_ERR((void __force *) uptr)) 55 - return PTR_ERR((void __force *) uptr); 56 - 57 - return get_user(*result, (u32 __user *) uptr); 58 - } 59 - 60 - static inline int get_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr, 61 - u16 *result) 62 - { 63 - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 64 - 65 - BUG_ON(guestaddr & 1); 66 - 67 - if (IS_ERR(uptr)) 68 - return PTR_ERR(uptr); 69 - 70 - return get_user(*result, (u16 __user *) uptr); 71 - } 72 - 73 - static inline int get_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr, 74 - u8 *result) 75 - { 76 - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 77 - 78 - if (IS_ERR((void __force *) uptr)) 79 - return PTR_ERR((void __force *) uptr); 80 - 81 - return get_user(*result, (u8 __user *) uptr); 82 - } 83 - 84 - static inline int put_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr, 85 - u64 value) 86 - { 87 - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 88 - 89 - BUG_ON(guestaddr & 7); 90 - 91 - if (IS_ERR((void __force *) uptr)) 92 - return PTR_ERR((void __force *) uptr); 93 - 94 - return put_user(value, (u64 __user *) uptr); 95 - } 96 - 97 - static inline int put_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr, 98 - u32 value) 99 - { 100 - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 101 - 102 - BUG_ON(guestaddr & 3); 103 - 104 - if (IS_ERR((void __force *) uptr)) 105 - return PTR_ERR((void __force *) uptr); 106 - 
107 - return put_user(value, (u32 __user *) uptr); 108 - } 109 - 110 - static inline int put_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr, 111 - u16 value) 112 - { 113 - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 114 - 115 - BUG_ON(guestaddr & 1); 116 - 117 - if (IS_ERR((void __force *) uptr)) 118 - return PTR_ERR((void __force *) uptr); 119 - 120 - return put_user(value, (u16 __user *) uptr); 121 - } 122 - 123 - static inline int put_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr, 124 - u8 value) 125 - { 126 - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); 127 - 128 - if (IS_ERR((void __force *) uptr)) 129 - return PTR_ERR((void __force *) uptr); 130 - 131 - return put_user(value, (u8 __user *) uptr); 132 - } 133 - 134 - 135 - static inline int __copy_to_guest_slow(struct kvm_vcpu *vcpu, 136 - unsigned long guestdest, 137 - void *from, unsigned long n) 138 - { 139 - int rc; 140 - unsigned long i; 141 - u8 *data = from; 142 - 143 - for (i = 0; i < n; i++) { 144 - rc = put_guest_u8(vcpu, guestdest++, *(data++)); 145 - if (rc < 0) 146 - return rc; 74 + while (len) { 75 + uptr = to_guest ? 
(void __user *)to : (void __user *)from; 76 + uptr = __gptr_to_uptr(vcpu, uptr, prefixing); 77 + if (IS_ERR((void __force *)uptr)) 78 + return -EFAULT; 79 + _len = PAGE_SIZE - ((unsigned long)uptr & (PAGE_SIZE - 1)); 80 + _len = min(_len, len); 81 + if (to_guest) 82 + rc = copy_to_user((void __user *) uptr, (void *)from, _len); 83 + else 84 + rc = copy_from_user((void *)to, (void __user *)uptr, _len); 85 + if (rc) 86 + return -EFAULT; 87 + len -= _len; 88 + from += _len; 89 + to += _len; 147 90 } 148 91 return 0; 149 92 } 150 93 151 - static inline int __copy_to_guest_fast(struct kvm_vcpu *vcpu, 152 - unsigned long guestdest, 153 - void *from, unsigned long n) 154 - { 155 - int r; 156 - void __user *uptr; 157 - unsigned long size; 94 + #define copy_to_guest(vcpu, to, from, size) \ 95 + __copy_guest(vcpu, to, (unsigned long)from, size, 1, 1) 96 + #define copy_from_guest(vcpu, to, from, size) \ 97 + __copy_guest(vcpu, (unsigned long)to, from, size, 0, 1) 98 + #define copy_to_guest_absolute(vcpu, to, from, size) \ 99 + __copy_guest(vcpu, to, (unsigned long)from, size, 1, 0) 100 + #define copy_from_guest_absolute(vcpu, to, from, size) \ 101 + __copy_guest(vcpu, (unsigned long)to, from, size, 0, 0) 158 102 159 - if (guestdest + n < guestdest) 160 - return -EFAULT; 161 - 162 - /* simple case: all within one segment table entry? 
*/ 163 - if ((guestdest & PMD_MASK) == ((guestdest+n) & PMD_MASK)) { 164 - uptr = (void __user *) gmap_fault(guestdest, vcpu->arch.gmap); 165 - 166 - if (IS_ERR((void __force *) uptr)) 167 - return PTR_ERR((void __force *) uptr); 168 - 169 - r = copy_to_user(uptr, from, n); 170 - 171 - if (r) 172 - r = -EFAULT; 173 - 174 - goto out; 175 - } 176 - 177 - /* copy first segment */ 178 - uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap); 179 - 180 - if (IS_ERR((void __force *) uptr)) 181 - return PTR_ERR((void __force *) uptr); 182 - 183 - size = PMD_SIZE - (guestdest & ~PMD_MASK); 184 - 185 - r = copy_to_user(uptr, from, size); 186 - 187 - if (r) { 188 - r = -EFAULT; 189 - goto out; 190 - } 191 - from += size; 192 - n -= size; 193 - guestdest += size; 194 - 195 - /* copy full segments */ 196 - while (n >= PMD_SIZE) { 197 - uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap); 198 - 199 - if (IS_ERR((void __force *) uptr)) 200 - return PTR_ERR((void __force *) uptr); 201 - 202 - r = copy_to_user(uptr, from, PMD_SIZE); 203 - 204 - if (r) { 205 - r = -EFAULT; 206 - goto out; 207 - } 208 - from += PMD_SIZE; 209 - n -= PMD_SIZE; 210 - guestdest += PMD_SIZE; 211 - } 212 - 213 - /* copy the tail segment */ 214 - if (n) { 215 - uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap); 216 - 217 - if (IS_ERR((void __force *) uptr)) 218 - return PTR_ERR((void __force *) uptr); 219 - 220 - r = copy_to_user(uptr, from, n); 221 - 222 - if (r) 223 - r = -EFAULT; 224 - } 225 - out: 226 - return r; 227 - } 228 - 229 - static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu, 230 - unsigned long guestdest, 231 - void *from, unsigned long n) 232 - { 233 - return __copy_to_guest_fast(vcpu, guestdest, from, n); 234 - } 235 - 236 - static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest, 237 - void *from, unsigned long n) 238 - { 239 - unsigned long prefix = vcpu->arch.sie_block->prefix; 240 - 241 - if ((guestdest < 2 * PAGE_SIZE) && 
(guestdest + n > 2 * PAGE_SIZE)) 242 - goto slowpath; 243 - 244 - if ((guestdest < prefix) && (guestdest + n > prefix)) 245 - goto slowpath; 246 - 247 - if ((guestdest < prefix + 2 * PAGE_SIZE) 248 - && (guestdest + n > prefix + 2 * PAGE_SIZE)) 249 - goto slowpath; 250 - 251 - if (guestdest < 2 * PAGE_SIZE) 252 - guestdest += prefix; 253 - else if ((guestdest >= prefix) && (guestdest < prefix + 2 * PAGE_SIZE)) 254 - guestdest -= prefix; 255 - 256 - return __copy_to_guest_fast(vcpu, guestdest, from, n); 257 - slowpath: 258 - return __copy_to_guest_slow(vcpu, guestdest, from, n); 259 - } 260 - 261 - static inline int __copy_from_guest_slow(struct kvm_vcpu *vcpu, void *to, 262 - unsigned long guestsrc, 263 - unsigned long n) 264 - { 265 - int rc; 266 - unsigned long i; 267 - u8 *data = to; 268 - 269 - for (i = 0; i < n; i++) { 270 - rc = get_guest_u8(vcpu, guestsrc++, data++); 271 - if (rc < 0) 272 - return rc; 273 - } 274 - return 0; 275 - } 276 - 277 - static inline int __copy_from_guest_fast(struct kvm_vcpu *vcpu, void *to, 278 - unsigned long guestsrc, 279 - unsigned long n) 280 - { 281 - int r; 282 - void __user *uptr; 283 - unsigned long size; 284 - 285 - if (guestsrc + n < guestsrc) 286 - return -EFAULT; 287 - 288 - /* simple case: all within one segment table entry? 
*/ 289 - if ((guestsrc & PMD_MASK) == ((guestsrc+n) & PMD_MASK)) { 290 - uptr = (void __user *) gmap_fault(guestsrc, vcpu->arch.gmap); 291 - 292 - if (IS_ERR((void __force *) uptr)) 293 - return PTR_ERR((void __force *) uptr); 294 - 295 - r = copy_from_user(to, uptr, n); 296 - 297 - if (r) 298 - r = -EFAULT; 299 - 300 - goto out; 301 - } 302 - 303 - /* copy first segment */ 304 - uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap); 305 - 306 - if (IS_ERR((void __force *) uptr)) 307 - return PTR_ERR((void __force *) uptr); 308 - 309 - size = PMD_SIZE - (guestsrc & ~PMD_MASK); 310 - 311 - r = copy_from_user(to, uptr, size); 312 - 313 - if (r) { 314 - r = -EFAULT; 315 - goto out; 316 - } 317 - to += size; 318 - n -= size; 319 - guestsrc += size; 320 - 321 - /* copy full segments */ 322 - while (n >= PMD_SIZE) { 323 - uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap); 324 - 325 - if (IS_ERR((void __force *) uptr)) 326 - return PTR_ERR((void __force *) uptr); 327 - 328 - r = copy_from_user(to, uptr, PMD_SIZE); 329 - 330 - if (r) { 331 - r = -EFAULT; 332 - goto out; 333 - } 334 - to += PMD_SIZE; 335 - n -= PMD_SIZE; 336 - guestsrc += PMD_SIZE; 337 - } 338 - 339 - /* copy the tail segment */ 340 - if (n) { 341 - uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap); 342 - 343 - if (IS_ERR((void __force *) uptr)) 344 - return PTR_ERR((void __force *) uptr); 345 - 346 - r = copy_from_user(to, uptr, n); 347 - 348 - if (r) 349 - r = -EFAULT; 350 - } 351 - out: 352 - return r; 353 - } 354 - 355 - static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to, 356 - unsigned long guestsrc, 357 - unsigned long n) 358 - { 359 - return __copy_from_guest_fast(vcpu, to, guestsrc, n); 360 - } 361 - 362 - static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to, 363 - unsigned long guestsrc, unsigned long n) 364 - { 365 - unsigned long prefix = vcpu->arch.sie_block->prefix; 366 - 367 - if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * 
PAGE_SIZE)) 368 - goto slowpath; 369 - 370 - if ((guestsrc < prefix) && (guestsrc + n > prefix)) 371 - goto slowpath; 372 - 373 - if ((guestsrc < prefix + 2 * PAGE_SIZE) 374 - && (guestsrc + n > prefix + 2 * PAGE_SIZE)) 375 - goto slowpath; 376 - 377 - if (guestsrc < 2 * PAGE_SIZE) 378 - guestsrc += prefix; 379 - else if ((guestsrc >= prefix) && (guestsrc < prefix + 2 * PAGE_SIZE)) 380 - guestsrc -= prefix; 381 - 382 - return __copy_from_guest_fast(vcpu, to, guestsrc, n); 383 - slowpath: 384 - return __copy_from_guest_slow(vcpu, to, guestsrc, n); 385 - } 386 - #endif 103 + #endif /* __KVM_S390_GACCESS_H */
+7 -11
arch/s390/kvm/intercept.c
··· 43 43 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); 44 44 45 45 do { 46 - rc = get_guest_u64(vcpu, useraddr, 47 - &vcpu->arch.sie_block->gcr[reg]); 48 - if (rc == -EFAULT) { 49 - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 50 - break; 51 - } 46 + rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], 47 + (u64 __user *) useraddr); 48 + if (rc) 49 + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 52 50 useraddr += 8; 53 51 if (reg == reg3) 54 52 break; ··· 76 78 77 79 reg = reg1; 78 80 do { 79 - rc = get_guest_u32(vcpu, useraddr, &val); 80 - if (rc == -EFAULT) { 81 - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 82 - break; 83 - } 81 + rc = get_guest(vcpu, val, (u32 __user *) useraddr); 82 + if (rc) 83 + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 84 84 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; 85 85 vcpu->arch.sie_block->gcr[reg] |= val; 86 86 useraddr += 4;
+73 -172
arch/s390/kvm/interrupt.c
··· 180 180 struct kvm_s390_interrupt_info *inti) 181 181 { 182 182 const unsigned short table[] = { 2, 4, 4, 6 }; 183 - int rc, exception = 0; 183 + int rc = 0; 184 184 185 185 switch (inti->type) { 186 186 case KVM_S390_INT_EMERGENCY: ··· 188 188 vcpu->stat.deliver_emergency_signal++; 189 189 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 190 190 inti->emerg.code, 0); 191 - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); 192 - if (rc == -EFAULT) 193 - exception = 1; 194 - 195 - rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->emerg.code); 196 - if (rc == -EFAULT) 197 - exception = 1; 198 - 199 - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 200 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 201 - if (rc == -EFAULT) 202 - exception = 1; 203 - 204 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 205 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 206 - if (rc == -EFAULT) 207 - exception = 1; 191 + rc = put_guest(vcpu, 0x1201, (u16 __user *)__LC_EXT_INT_CODE); 192 + rc |= put_guest(vcpu, inti->emerg.code, 193 + (u16 __user *)__LC_EXT_CPU_ADDR); 194 + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 195 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 196 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 197 + __LC_EXT_NEW_PSW, sizeof(psw_t)); 208 198 break; 209 - 210 199 case KVM_S390_INT_EXTERNAL_CALL: 211 200 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); 212 201 vcpu->stat.deliver_external_call++; 213 202 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 214 203 inti->extcall.code, 0); 215 - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); 216 - if (rc == -EFAULT) 217 - exception = 1; 218 - 219 - rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->extcall.code); 220 - if (rc == -EFAULT) 221 - exception = 1; 222 - 223 - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 224 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 225 - if (rc == -EFAULT) 226 - exception = 1; 227 - 228 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 229 - 
__LC_EXT_NEW_PSW, sizeof(psw_t)); 230 - if (rc == -EFAULT) 231 - exception = 1; 204 + rc = put_guest(vcpu, 0x1202, (u16 __user *)__LC_EXT_INT_CODE); 205 + rc |= put_guest(vcpu, inti->extcall.code, 206 + (u16 __user *)__LC_EXT_CPU_ADDR); 207 + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 208 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 209 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 210 + __LC_EXT_NEW_PSW, sizeof(psw_t)); 232 211 break; 233 - 234 212 case KVM_S390_INT_SERVICE: 235 213 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", 236 214 inti->ext.ext_params); 237 215 vcpu->stat.deliver_service_signal++; 238 216 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 239 217 inti->ext.ext_params, 0); 240 - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); 241 - if (rc == -EFAULT) 242 - exception = 1; 243 - 244 - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 245 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 246 - if (rc == -EFAULT) 247 - exception = 1; 248 - 249 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 250 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 251 - if (rc == -EFAULT) 252 - exception = 1; 253 - 254 - rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); 255 - if (rc == -EFAULT) 256 - exception = 1; 218 + rc = put_guest(vcpu, 0x2401, (u16 __user *)__LC_EXT_INT_CODE); 219 + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 220 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 221 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 222 + __LC_EXT_NEW_PSW, sizeof(psw_t)); 223 + rc |= put_guest(vcpu, inti->ext.ext_params, 224 + (u32 __user *)__LC_EXT_PARAMS); 257 225 break; 258 - 259 226 case KVM_S390_INT_VIRTIO: 260 227 VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", 261 228 inti->ext.ext_params, inti->ext.ext_params2); ··· 230 263 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 231 264 inti->ext.ext_params, 232 265 inti->ext.ext_params2); 233 - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); 234 
- if (rc == -EFAULT) 235 - exception = 1; 236 - 237 - rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, 0x0d00); 238 - if (rc == -EFAULT) 239 - exception = 1; 240 - 241 - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 242 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 243 - if (rc == -EFAULT) 244 - exception = 1; 245 - 246 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 247 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 248 - if (rc == -EFAULT) 249 - exception = 1; 250 - 251 - rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); 252 - if (rc == -EFAULT) 253 - exception = 1; 254 - 255 - rc = put_guest_u64(vcpu, __LC_EXT_PARAMS2, 256 - inti->ext.ext_params2); 257 - if (rc == -EFAULT) 258 - exception = 1; 266 + rc = put_guest(vcpu, 0x2603, (u16 __user *)__LC_EXT_INT_CODE); 267 + rc |= put_guest(vcpu, 0x0d00, (u16 __user *)__LC_EXT_CPU_ADDR); 268 + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 269 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 270 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 271 + __LC_EXT_NEW_PSW, sizeof(psw_t)); 272 + rc |= put_guest(vcpu, inti->ext.ext_params, 273 + (u32 __user *)__LC_EXT_PARAMS); 274 + rc |= put_guest(vcpu, inti->ext.ext_params2, 275 + (u64 __user *)__LC_EXT_PARAMS2); 259 276 break; 260 - 261 277 case KVM_S390_SIGP_STOP: 262 278 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); 263 279 vcpu->stat.deliver_stop_signal++; ··· 263 313 vcpu->stat.deliver_restart_signal++; 264 314 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 265 315 0, 0); 266 - rc = copy_to_guest(vcpu, offsetof(struct _lowcore, 267 - restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 268 - if (rc == -EFAULT) 269 - exception = 1; 270 - 271 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 272 - offsetof(struct _lowcore, restart_psw), sizeof(psw_t)); 273 - if (rc == -EFAULT) 274 - exception = 1; 316 + rc = copy_to_guest(vcpu, 317 + offsetof(struct _lowcore, restart_old_psw), 318 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 
319 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 320 + offsetof(struct _lowcore, restart_psw), 321 + sizeof(psw_t)); 275 322 atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 276 323 break; 277 - 278 324 case KVM_S390_PROGRAM_INT: 279 325 VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x", 280 326 inti->pgm.code, ··· 278 332 vcpu->stat.deliver_program_int++; 279 333 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 280 334 inti->pgm.code, 0); 281 - rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); 282 - if (rc == -EFAULT) 283 - exception = 1; 284 - 285 - rc = put_guest_u16(vcpu, __LC_PGM_ILC, 286 - table[vcpu->arch.sie_block->ipa >> 14]); 287 - if (rc == -EFAULT) 288 - exception = 1; 289 - 290 - rc = copy_to_guest(vcpu, __LC_PGM_OLD_PSW, 291 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 292 - if (rc == -EFAULT) 293 - exception = 1; 294 - 295 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 296 - __LC_PGM_NEW_PSW, sizeof(psw_t)); 297 - if (rc == -EFAULT) 298 - exception = 1; 335 + rc = put_guest(vcpu, inti->pgm.code, (u16 __user *)__LC_PGM_INT_CODE); 336 + rc |= put_guest(vcpu, table[vcpu->arch.sie_block->ipa >> 14], 337 + (u16 __user *)__LC_PGM_ILC); 338 + rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW, 339 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 340 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 341 + __LC_PGM_NEW_PSW, sizeof(psw_t)); 299 342 break; 300 343 301 344 case KVM_S390_MCHK: ··· 293 358 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 294 359 inti->mchk.cr14, 295 360 inti->mchk.mcic); 296 - rc = kvm_s390_vcpu_store_status(vcpu, 297 - KVM_S390_STORE_STATUS_PREFIXED); 298 - if (rc == -EFAULT) 299 - exception = 1; 300 - 301 - rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic); 302 - if (rc == -EFAULT) 303 - exception = 1; 304 - 305 - rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW, 306 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 307 - if (rc == -EFAULT) 308 
- exception = 1; 309 - 310 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 311 - __LC_MCK_NEW_PSW, sizeof(psw_t)); 312 - if (rc == -EFAULT) 313 - exception = 1; 361 + rc = kvm_s390_vcpu_store_status(vcpu, 362 + KVM_S390_STORE_STATUS_PREFIXED); 363 + rc |= put_guest(vcpu, inti->mchk.mcic, (u64 __user *) __LC_MCCK_CODE); 364 + rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW, 365 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 366 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 367 + __LC_MCK_NEW_PSW, sizeof(psw_t)); 314 368 break; 315 369 316 370 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: ··· 312 388 vcpu->stat.deliver_io_int++; 313 389 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 314 390 param0, param1); 315 - rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID, 316 - inti->io.subchannel_id); 317 - if (rc == -EFAULT) 318 - exception = 1; 319 - 320 - rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR, 321 - inti->io.subchannel_nr); 322 - if (rc == -EFAULT) 323 - exception = 1; 324 - 325 - rc = put_guest_u32(vcpu, __LC_IO_INT_PARM, 326 - inti->io.io_int_parm); 327 - if (rc == -EFAULT) 328 - exception = 1; 329 - 330 - rc = put_guest_u32(vcpu, __LC_IO_INT_WORD, 331 - inti->io.io_int_word); 332 - if (rc == -EFAULT) 333 - exception = 1; 334 - 335 - rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW, 336 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 337 - if (rc == -EFAULT) 338 - exception = 1; 339 - 340 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 341 - __LC_IO_NEW_PSW, sizeof(psw_t)); 342 - if (rc == -EFAULT) 343 - exception = 1; 391 + rc = put_guest(vcpu, inti->io.subchannel_id, 392 + (u16 __user *) __LC_SUBCHANNEL_ID); 393 + rc |= put_guest(vcpu, inti->io.subchannel_nr, 394 + (u16 __user *) __LC_SUBCHANNEL_NR); 395 + rc |= put_guest(vcpu, inti->io.io_int_parm, 396 + (u32 __user *) __LC_IO_INT_PARM); 397 + rc |= put_guest(vcpu, inti->io.io_int_word, 398 + (u32 __user *) __LC_IO_INT_WORD); 399 + rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW, 400 + 
&vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 401 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 402 + __LC_IO_NEW_PSW, sizeof(psw_t)); 344 403 break; 345 404 } 346 405 default: 347 406 BUG(); 348 407 } 349 - if (exception) { 408 + if (rc) { 350 409 printk("kvm: The guest lowcore is not mapped during interrupt " 351 - "delivery, killing userspace\n"); 410 + "delivery, killing userspace\n"); 352 411 do_exit(SIGKILL); 353 412 } 354 413 } 355 414 356 415 static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) 357 416 { 358 - int rc, exception = 0; 417 + int rc; 359 418 360 419 if (psw_extint_disabled(vcpu)) 361 420 return 0; 362 421 if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul)) 363 422 return 0; 364 - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1004); 365 - if (rc == -EFAULT) 366 - exception = 1; 367 - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 368 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 369 - if (rc == -EFAULT) 370 - exception = 1; 371 - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 372 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 373 - if (rc == -EFAULT) 374 - exception = 1; 375 - if (exception) { 423 + rc = put_guest(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE); 424 + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 425 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 426 + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 427 + __LC_EXT_NEW_PSW, sizeof(psw_t)); 428 + if (rc) { 376 429 printk("kvm: The guest lowcore is not mapped during interrupt " 377 430 "delivery, killing userspace\n"); 378 431 do_exit(SIGKILL);
+22 -21
arch/s390/kvm/kvm-s390.c
··· 142 142 case KVM_CAP_ONE_REG: 143 143 case KVM_CAP_ENABLE_CAP: 144 144 case KVM_CAP_S390_CSS_SUPPORT: 145 + case KVM_CAP_IOEVENTFD: 145 146 r = 1; 146 147 break; 147 148 case KVM_CAP_NR_VCPUS: 148 149 case KVM_CAP_MAX_VCPUS: 149 150 r = KVM_MAX_VCPUS; 151 + break; 152 + case KVM_CAP_NR_MEMSLOTS: 153 + r = KVM_USER_MEM_SLOTS; 150 154 break; 151 155 case KVM_CAP_S390_COW: 152 156 r = MACHINE_HAS_ESOP; ··· 636 632 } else { 637 633 VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); 638 634 trace_kvm_s390_sie_fault(vcpu); 639 - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 640 - rc = 0; 635 + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 641 636 } 642 637 } 643 638 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", ··· 977 974 /* Section: memory related */ 978 975 int kvm_arch_prepare_memory_region(struct kvm *kvm, 979 976 struct kvm_memory_slot *memslot, 980 - struct kvm_memory_slot old, 981 977 struct kvm_userspace_memory_region *mem, 982 - bool user_alloc) 978 + enum kvm_mr_change change) 983 979 { 984 - /* A few sanity checks. We can have exactly one memory slot which has 985 - to start at guest virtual zero and which has to be located at a 986 - page boundary in userland and which has to end at a page boundary. 987 - The memory in userland is ok to be fragmented into various different 988 - vmas. It is okay to mmap() and munmap() stuff in this slot after 989 - doing this call at any time */ 990 - 991 - if (mem->slot) 992 - return -EINVAL; 993 - 994 - if (mem->guest_phys_addr) 995 - return -EINVAL; 980 + /* A few sanity checks. We can have memory slots which have to be 981 + located/ended at a segment boundary (1MB). The memory in userland is 982 + ok to be fragmented into various different vmas. 
It is okay to mmap() 983 + and munmap() stuff in this slot after doing this call at any time */ 996 984 997 985 if (mem->userspace_addr & 0xffffful) 998 986 return -EINVAL; ··· 991 997 if (mem->memory_size & 0xffffful) 992 998 return -EINVAL; 993 999 994 - if (!user_alloc) 995 - return -EINVAL; 996 - 997 1000 return 0; 998 1001 } 999 1002 1000 1003 void kvm_arch_commit_memory_region(struct kvm *kvm, 1001 1004 struct kvm_userspace_memory_region *mem, 1002 - struct kvm_memory_slot old, 1003 - bool user_alloc) 1005 + const struct kvm_memory_slot *old, 1006 + enum kvm_mr_change change) 1004 1007 { 1005 1008 int rc; 1006 1009 1010 + /* If the basics of the memslot do not change, we do not want 1011 + * to update the gmap. Every update causes several unnecessary 1012 + * segment translation exceptions. This is usually handled just 1013 + * fine by the normal fault handler + gmap, but it will also 1014 + * cause faults on the prefix page of running guest CPUs. 1015 + */ 1016 + if (old->userspace_addr == mem->userspace_addr && 1017 + old->base_gfn * PAGE_SIZE == mem->guest_phys_addr && 1018 + old->npages * PAGE_SIZE == mem->memory_size) 1019 + return; 1007 1020 1008 1021 rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, 1009 1022 mem->guest_phys_addr, mem->memory_size);
+6 -6
arch/s390/kvm/kvm-s390.h
··· 110 110 void kvm_s390_tasklet(unsigned long parm); 111 111 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); 112 112 void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu); 113 - int kvm_s390_inject_vm(struct kvm *kvm, 114 - struct kvm_s390_interrupt *s390int); 115 - int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 116 - struct kvm_s390_interrupt *s390int); 117 - int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 118 - int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); 113 + int __must_check kvm_s390_inject_vm(struct kvm *kvm, 114 + struct kvm_s390_interrupt *s390int); 115 + int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 116 + struct kvm_s390_interrupt *s390int); 117 + int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 118 + int __must_check kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); 119 119 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, 120 120 u64 cr6, u64 schid); 121 121
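The kvm-s390.h hunk annotates the injection helpers with `__must_check`, which pairs with the callers (for example the lctl path in intercept.c) now returning the injection result instead of ignoring it. In the kernel, `__must_check` expands to GCC's `warn_unused_result` attribute, so a caller that silently drops the return value gets a compiler warning. A minimal sketch of the mechanism, with `inject_program_int` as a hypothetical stand-in:

```c
#include <assert.h>
#include <errno.h>

/* Kernel-style spelling of GCC/Clang's warn_unused_result attribute. */
#define __must_check __attribute__((warn_unused_result))

/*
 * Hypothetical injection helper: with -Wunused-result, calling this and
 * discarding the return value produces a warning, which is the point of
 * annotating kvm_s390_inject_program_int() and friends.
 */
static __must_check int inject_program_int(int code)
{
    return code ? 0 : -EINVAL;
}
```

The assertion-style usage below consumes the return value, so it compiles cleanly; only a bare `inject_program_int(7);` statement would trip the warning.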
+109 -161
arch/s390/kvm/priv.c
···
 #include <linux/kvm.h>
 #include <linux/gfp.h>
 #include <linux/errno.h>
+#include <linux/compat.h>
+#include <asm/asm-offsets.h>
 #include <asm/current.h>
 #include <asm/debug.h>
 #include <asm/ebcdic.h>
···
 	operand2 = kvm_s390_get_base_disp_s(vcpu);

 	/* must be word boundary */
-	if (operand2 & 3) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
+	if (operand2 & 3)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);

 	/* get the value */
-	if (get_guest_u32(vcpu, operand2, &address)) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out;
-	}
+	if (get_guest(vcpu, address, (u32 __user *) operand2))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

 	address = address & 0x7fffe000u;

 	/* make sure that the new value is valid memory */
 	if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
-	    (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out;
-	}
+	    (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

 	kvm_s390_set_prefix(vcpu, address);

 	VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
 	trace_kvm_s390_handle_prefix(vcpu, 1, address);
-out:
 	return 0;
 }
···
 	operand2 = kvm_s390_get_base_disp_s(vcpu);

 	/* must be word boundary */
-	if (operand2 & 3) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
+	if (operand2 & 3)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);

 	address = vcpu->arch.sie_block->prefix;
 	address = address & 0x7fffe000u;

 	/* get the value */
-	if (put_guest_u32(vcpu, operand2, address)) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out;
-	}
+	if (put_guest(vcpu, address, (u32 __user *)operand2))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

 	VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
 	trace_kvm_s390_handle_prefix(vcpu, 0, address);
-out:
 	return 0;
 }

 static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 {
 	u64 useraddr;
-	int rc;

 	vcpu->stat.instruction_stap++;

 	useraddr = kvm_s390_get_base_disp_s(vcpu);

-	if (useraddr & 1) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
+	if (useraddr & 1)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);

-	rc = put_guest_u16(vcpu, useraddr, vcpu->vcpu_id);
-	if (rc == -EFAULT) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out;
-	}
+	if (put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

 	VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
 	trace_kvm_s390_handle_stap(vcpu, useraddr);
-out:
 	return 0;
 }
···
 static int handle_tpi(struct kvm_vcpu *vcpu)
 {
-	u64 addr;
 	struct kvm_s390_interrupt_info *inti;
+	u64 addr;
 	int cc;

 	addr = kvm_s390_get_base_disp_s(vcpu);
-
+	if (addr & 3)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	cc = 0;
 	inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0);
-	if (inti) {
-		if (addr) {
-			/*
-			 * Store the two-word I/O interruption code into the
-			 * provided area.
-			 */
-			put_guest_u16(vcpu, addr, inti->io.subchannel_id);
-			put_guest_u16(vcpu, addr + 2, inti->io.subchannel_nr);
-			put_guest_u32(vcpu, addr + 4, inti->io.io_int_parm);
-		} else {
-			/*
-			 * Store the three-word I/O interruption code into
-			 * the appropriate lowcore area.
-			 */
-			put_guest_u16(vcpu, 184, inti->io.subchannel_id);
-			put_guest_u16(vcpu, 186, inti->io.subchannel_nr);
-			put_guest_u32(vcpu, 188, inti->io.io_int_parm);
-			put_guest_u32(vcpu, 192, inti->io.io_int_word);
-		}
-		cc = 1;
-	} else
-		cc = 0;
+	if (!inti)
+		goto no_interrupt;
+	cc = 1;
+	if (addr) {
+		/*
+		 * Store the two-word I/O interruption code into the
+		 * provided area.
+		 */
+		put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr);
+		put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2));
+		put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4));
+	} else {
+		/*
+		 * Store the three-word I/O interruption code into
+		 * the appropriate lowcore area.
+		 */
+		put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) __LC_SUBCHANNEL_ID);
+		put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) __LC_SUBCHANNEL_NR);
+		put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) __LC_IO_INT_PARM);
+		put_guest(vcpu, inti->io.io_int_word, (u32 __user *) __LC_IO_INT_WORD);
+	}
 	kfree(inti);
+no_interrupt:
 	/* Set condition code and we're done. */
 	vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
 	vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
···
 	rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
 			   &facility_list, sizeof(facility_list));
-	if (rc == -EFAULT)
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-	else {
-		VCPU_EVENT(vcpu, 5, "store facility list value %x",
-			   facility_list);
-		trace_kvm_s390_handle_stfl(vcpu, facility_list);
-	}
+	if (rc)
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list);
+	trace_kvm_s390_handle_stfl(vcpu, facility_list);
 	return 0;
 }
···
 #define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA)
 #define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL
-#define PSW_ADDR_24 0x00000000000fffffUL
+#define PSW_ADDR_24 0x0000000000ffffffUL
 #define PSW_ADDR_31 0x000000007fffffffUL
+
+static int is_valid_psw(psw_t *psw) {
+	if (psw->mask & PSW_MASK_UNASSIGNED)
+		return 0;
+	if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) {
+		if (psw->addr & ~PSW_ADDR_31)
+			return 0;
+	}
+	if (!(psw->mask & PSW_MASK_ADDR_MODE) && (psw->addr & ~PSW_ADDR_24))
+		return 0;
+	if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA)
+		return 0;
+	return 1;
+}

 int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
 {
-	u64 addr;
+	psw_t *gpsw = &vcpu->arch.sie_block->gpsw;
 	psw_compat_t new_psw;
+	u64 addr;

-	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+	if (gpsw->mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu,
 						   PGM_PRIVILEGED_OPERATION);
-
 	addr = kvm_s390_get_base_disp_s(vcpu);
-
-	if (addr & 7) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
-
-	if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out;
-	}
-
-	if (!(new_psw.mask & PSW32_MASK_BASE)) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
-
-	vcpu->arch.sie_block->gpsw.mask =
-		(new_psw.mask & ~PSW32_MASK_BASE) << 32;
-	vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
-
-	if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
-	    (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
-	     (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
-	    ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
-	     PSW_MASK_EA)) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
-
+	if (addr & 7)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	if (!(new_psw.mask & PSW32_MASK_BASE))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32;
+	gpsw->mask |= new_psw.addr & PSW32_ADDR_AMODE;
+	gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE;
+	if (!is_valid_psw(gpsw))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 	handle_new_psw(vcpu);
-out:
 	return 0;
 }

 static int handle_lpswe(struct kvm_vcpu *vcpu)
 {
-	u64 addr;
 	psw_t new_psw;
+	u64 addr;

 	addr = kvm_s390_get_base_disp_s(vcpu);
-
-	if (addr & 7) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
-
-	if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out;
-	}
-
-	vcpu->arch.sie_block->gpsw.mask = new_psw.mask;
-	vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
-
-	if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
-	    (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
-	      PSW_MASK_BA) &&
-	     (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) ||
-	    (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
-	     (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
-	    ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
-	     PSW_MASK_EA)) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
-
+	if (addr & 7)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	vcpu->arch.sie_block->gpsw = new_psw;
+	if (!is_valid_psw(&vcpu->arch.sie_block->gpsw))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 	handle_new_psw(vcpu);
-out:
 	return 0;
 }

 static int handle_stidp(struct kvm_vcpu *vcpu)
 {
 	u64 operand2;
-	int rc;

 	vcpu->stat.instruction_stidp++;

 	operand2 = kvm_s390_get_base_disp_s(vcpu);

-	if (operand2 & 7) {
-		kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-		goto out;
-	}
+	if (operand2 & 7)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);

-	rc = put_guest_u64(vcpu, operand2, vcpu->arch.stidp_data);
-	if (rc == -EFAULT) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out;
-	}
+	if (put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2))
+		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

 	VCPU_EVENT(vcpu, 5, "%s", "store cpu id");
-out:
 	return 0;
 }
···
 	int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
 	int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
 	int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
+	unsigned long mem = 0;
 	u64 operand2;
-	unsigned long mem;
+	int rc = 0;

 	vcpu->stat.instruction_stsi++;
 	VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
···
 	case 2:
 		mem = get_zeroed_page(GFP_KERNEL);
 		if (!mem)
-			goto out_fail;
+			goto out_no_data;
 		if (stsi((void *) mem, fc, sel1, sel2))
-			goto out_mem;
+			goto out_no_data;
 		break;
 	case 3:
 		if (sel1 != 2 || sel2 != 2)
-			goto out_fail;
+			goto out_no_data;
 		mem = get_zeroed_page(GFP_KERNEL);
 		if (!mem)
-			goto out_fail;
+			goto out_no_data;
 		handle_stsi_3_2_2(vcpu, (void *) mem);
 		break;
 	default:
-		goto out_fail;
+		goto out_no_data;
 	}

 	if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
-		kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		goto out_mem;
+		rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+		goto out_exception;
 	}
 	trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
 	free_page(mem);
 	vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
 	vcpu->run->s.regs.gprs[0] = 0;
 	return 0;
-out_mem:
-	free_page(mem);
-out_fail:
+out_no_data:
 	/* condition code 3 */
 	vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
-	return 0;
+out_exception:
+	free_page(mem);
+	return rc;
 }

 static const intercept_handler_t b2_handlers[256] = {
···
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
 		return -EOPNOTSUPP;

-
-	/* we must resolve the address without holding the mmap semaphore.
-	 * This is ok since the userspace hypervisor is not supposed to change
-	 * the mapping while the guest queries the memory. Otherwise the guest
-	 * might crash or get wrong info anyway. */
-	user_address = (unsigned long) __guestaddr_to_user(vcpu, address1);
-
 	down_read(&current->mm->mmap_sem);
+	user_address = __gmap_translate(address1, vcpu->arch.gmap);
+	if (IS_ERR_VALUE(user_address))
+		goto out_inject;
 	vma = find_vma(current->mm, user_address);
-	if (!vma) {
-		up_read(&current->mm->mmap_sem);
-		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-	}
-
+	if (!vma)
+		goto out_inject;
 	vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
 	if (!(vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_READ))
 		vcpu->arch.sie_block->gpsw.mask |= (1ul << 44);
···
 	up_read(&current->mm->mmap_sem);
 	return 0;
+
+out_inject:
+	up_read(&current->mm->mmap_sem);
+	return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 }

 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu)
+4
arch/x86/include/asm/entry_arch.h
···
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)

+#ifdef CONFIG_HAVE_KVM
+BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
+#endif
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
+3
arch/x86/include/asm/hardirq.h
···
 	unsigned int irq_spurious_count;
 	unsigned int icr_read_retry_count;
 #endif
+#ifdef CONFIG_HAVE_KVM
+	unsigned int kvm_posted_intr_ipis;
+#endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
 	unsigned int apic_irq_work_irqs;
+1
arch/x86/include/asm/hw_irq.h
···
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
+extern void kvm_posted_intr_ipi(void);
 extern void error_interrupt(void);
 extern void irq_work_interrupt(void);
+5
arch/x86/include/asm/irq_vectors.h
···
 */
 #define X86_PLATFORM_IPI_VECTOR		0xf7

+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR		0xf2
+#endif
+
 /*
  * IRQ work vector:
  */
+16 -10
arch/x86/include/asm/kvm_host.h
···
 #include <asm/msr-index.h>
 #include <asm/asm.h>

-#define KVM_MAX_VCPUS 254
+#define KVM_MAX_VCPUS 255
 #define KVM_SOFT_MAX_VCPUS 160
 #define KVM_USER_MEM_SLOTS 125
 /* memory slots that are not exposed to userspace */
···
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
+#define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS

 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
···
 #define KVM_NR_VAR_MTRR 8

 #define ASYNC_PF_PER_VCPU 64
-
-extern raw_spinlock_t kvm_lock;
-extern struct list_head vm_list;

 struct kvm_vcpu;
 struct kvm;
···
 #endif

 	int write_flooding_count;
+	bool mmio_cached;
 };

 struct kvm_pio_request {
···
 	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int mp_state;
-	int sipi_vector;
 	u64 ia32_misc_enable_msr;
 	bool tpr_access_reporting;
···
 	/* Create, but do not attach this VCPU */
 	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
-	int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+	void (*vcpu_reset)(struct kvm_vcpu *vcpu);

 	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
···
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
 	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
-	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
-	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+	int (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+	int (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 	int (*vm_has_apicv)(struct kvm *kvm);
 	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
 	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+	void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
···
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage);
+	void (*handle_external_intr)(struct kvm_vcpu *vcpu);
 };

 struct kvm_arch_async_pf {
···
 				     struct kvm_memory_slot *slot,
 				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
+void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
···
 #define EMULTYPE_TRAP_UD	    (1 << 1)
 #define EMULTYPE_SKIP		    (1 << 2)
 #define EMULTYPE_RETRY		    (1 << 3)
+#define EMULTYPE_NO_REEXECUTE	    (1 << 4)
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
 			    int emulation_type, void *insn, int insn_len);
···
 }

 void kvm_enable_efer_bits(u64);
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
···
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);

 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 		    int reason, bool has_error_code, u32 error_code);
···
 * Trap the fault and ignore the instruction if that happens.
 */
 asmlinkage void kvm_spurious_fault(void);
-extern bool kvm_rebooting;

 #define ____kvm_handle_fault_on_reboot(insn, cleanup_insn)	\
 	"666: " insn "\n\t" \
···
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu);

 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
···
 void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
 bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
 void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
 void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
+18
arch/x86/include/asm/vmx.h
···
 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
+#define SECONDARY_EXEC_SHADOW_VMCS              0x00004000


 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+#define PIN_BASED_VMX_PREEMPTION_TIMER          0x00000040
+#define PIN_BASED_POSTED_INTR                   0x00000080
+
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR	0x00000016

 #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
···
 #define VM_EXIT_LOAD_IA32_EFER                  0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000

+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR	0x00036dff
+
 #define VM_ENTRY_LOAD_DEBUG_CONTROLS            0x00000002
 #define VM_ENTRY_IA32E_MODE                     0x00000200
 #define VM_ENTRY_SMM                            0x00000400
···
 #define VM_ENTRY_LOAD_IA32_PAT			0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER                 0x00008000

+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR	0x000011ff
+
+#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK	0x0000001f
+#define VMX_MISC_SAVE_EFER_LMA			0x00000020
+
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID            = 0x00000000,
+	POSTED_INTR_NV                  = 0x00000002,
 	GUEST_ES_SELECTOR               = 0x00000800,
 	GUEST_CS_SELECTOR               = 0x00000802,
 	GUEST_SS_SELECTOR               = 0x00000804,
···
 	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
 	APIC_ACCESS_ADDR		= 0x00002014,
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+	POSTED_INTR_DESC_ADDR           = 0x00002016,
+	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
 	EOI_EXIT_BITMAP0                = 0x0000201c,
···
 	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
 	EOI_EXIT_BITMAP3                = 0x00002022,
 	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+	VMREAD_BITMAP                   = 0x00002026,
+	VMWRITE_BITMAP                  = 0x00002028,
 	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
 	VMCS_LINK_POINTER               = 0x00002800,
···
 	GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
 	GUEST_ACTIVITY_STATE            = 0X00004826,
 	GUEST_SYSENTER_CS               = 0x0000482A,
+	VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
 	HOST_IA32_SYSENTER_CS           = 0x00004c00,
 	CR0_GUEST_HOST_MASK             = 0x00006000,
 	CR4_GUEST_HOST_MASK             = 0x00006002,
-1
arch/x86/include/uapi/asm/kvm.h
···
 #define __KVM_HAVE_PIT
 #define __KVM_HAVE_IOAPIC
 #define __KVM_HAVE_IRQ_LINE
-#define __KVM_HAVE_DEVICE_ASSIGNMENT
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
+2
arch/x86/include/uapi/asm/msr-index.h
···
 #define VMX_BASIC_MEM_TYPE_WB	6LLU
 #define VMX_BASIC_INOUT		0x0040000000000000LLU

+/* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
 /* AMD-V MSRs */

 #define MSR_VM_CR                       0xc0010114
+3 -2
arch/x86/include/uapi/asm/vmx.h
···
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
 #define EXIT_REASON_APIC_WRITE          56
···
 	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
 	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
 	{ EXIT_REASON_INVD,                  "INVD" }, \
-	{ EXIT_REASON_INVPCID,               "INVPCID" }
-
+	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
+	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }

 #endif /* _UAPIVMX_H */
+5
arch/x86/kernel/entry_64.S
···
 apicinterrupt X86_PLATFORM_IPI_VECTOR \
 	x86_platform_ipi smp_x86_platform_ipi

+#ifdef CONFIG_HAVE_KVM
+apicinterrupt POSTED_INTR_VECTOR \
+	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+#endif
+
 apicinterrupt THRESHOLD_APIC_VECTOR \
 	threshold_interrupt smp_threshold_interrupt
 apicinterrupt THERMAL_APIC_VECTOR \
+22
arch/x86/kernel/irq.c
···
 	set_irq_regs(old_regs);
 }

+#ifdef CONFIG_HAVE_KVM
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	irq_enter();
+
+	exit_idle();
+
+	inc_irq_stat(kvm_posted_intr_ipis);
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+#endif
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);

 #ifdef CONFIG_HOTPLUG_CPU
+4
arch/x86/kernel/irqinit.c
···
 	/* IPI for X86 platform specific use */
 	alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
+#ifdef CONFIG_HAVE_KVM
+	/* IPI for KVM to deliver posted interrupt */
+	alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+#endif

 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+8 -1
arch/x86/kernel/kvmclock.c
···
 {
 	int cpu = smp_processor_id();
 	int low, high, ret;
-	struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
+	struct pvclock_vcpu_time_info *src;

+	if (!hv_clock)
+		return 0;
+
+	src = &hv_clock[cpu].pvti;
 	low = (int)slow_virt_to_phys(src) | 1;
 	high = ((u64)slow_virt_to_phys(src) >> 32);
 	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
···
 	u8 flags;
 	struct pvclock_vcpu_time_info *vcpu_time;
 	unsigned int size;
+
+	if (!hv_clock)
+		return 0;

 	size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+12 -2
arch/x86/kvm/Kconfig
···
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM
 	depends on HIGH_RES_TIMERS
-	# for device assignment:
-	depends on PCI
 	# for TASKSTATS/TASK_DELAY_ACCT:
 	depends on NET
 	select PREEMPT_NOTIFIERS
 	select MMU_NOTIFIER
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
 	select KVM_ASYNC_PF
···
 	---help---
 	  This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	  audit KVM MMU at runtime.
+
+config KVM_DEVICE_ASSIGNMENT
+	bool "KVM legacy PCI device assignment support"
+	depends on KVM && PCI && IOMMU_API
+	default y
+	---help---
+	  Provide support for legacy PCI device assignment through KVM.  The
+	  kernel now also supports a full featured userspace device driver
+	  framework through VFIO, which supersedes much of this support.
+
+	  If unsure, say Y.

 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
+3 -2
arch/x86/kvm/Makefile
···
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
-kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
+				irqchip.o)
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(addprefix ../../../virt/kvm/, \
+				assigned-dev.o iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)

 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+23 -8
arch/x86/kvm/emulate.c
··· 132 132 #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 133 133 #define No64 (1<<28) 134 134 #define PageTable (1 << 29) /* instruction used to write page table */ 135 + #define NotImpl (1 << 30) /* instruction is not implemented */ 135 136 /* Source 2 operand type */ 136 - #define Src2Shift (30) 137 + #define Src2Shift (31) 137 138 #define Src2None (OpNone << Src2Shift) 138 139 #define Src2CL (OpCL << Src2Shift) 139 140 #define Src2ImmByte (OpImmByte << Src2Shift) ··· 1579 1578 1580 1579 memset(&seg_desc, 0, sizeof seg_desc); 1581 1580 1582 - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) 1583 - || ctxt->mode == X86EMUL_MODE_REAL) { 1584 - /* set real mode segment descriptor */ 1581 + if (ctxt->mode == X86EMUL_MODE_REAL) { 1582 + /* set real mode segment descriptor (keep limit etc. for 1583 + * unreal mode) */ 1585 1584 ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); 1586 1585 set_desc_base(&seg_desc, selector << 4); 1586 + goto load; 1587 + } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) { 1588 + /* VM86 needs a clean new segment descriptor */ 1589 + set_desc_base(&seg_desc, selector << 4); 1590 + set_desc_limit(&seg_desc, 0xffff); 1591 + seg_desc.type = 3; 1592 + seg_desc.p = 1; 1593 + seg_desc.s = 1; 1594 + seg_desc.dpl = 3; 1587 1595 goto load; 1588 1596 } 1589 1597 ··· 3625 3615 #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } 3626 3616 #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ 3627 3617 .check_perm = (_p) } 3628 - #define N D(0) 3618 + #define N D(NotImpl) 3629 3619 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3630 3620 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } 3631 3621 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } ··· 3723 3713 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), 3724 3714 I(SrcMem | Stack, em_grp45), 3725 3715 I(SrcMemFAddr | 
ImplicitOps, em_grp45), 3726 - I(SrcMem | Stack, em_grp45), N, 3716 + I(SrcMem | Stack, em_grp45), D(Undefined), 3727 3717 }; 3728 3718 3729 3719 static const struct opcode group6[] = { ··· 4172 4162 break; 4173 4163 case OpMem8: 4174 4164 ctxt->memop.bytes = 1; 4165 + if (ctxt->memop.type == OP_REG) { 4166 + ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1); 4167 + fetch_register_operand(&ctxt->memop); 4168 + } 4175 4169 goto mem_common; 4176 4170 case OpMem16: 4177 4171 ctxt->memop.bytes = 2; ··· 4387 4373 ctxt->intercept = opcode.intercept; 4388 4374 4389 4375 /* Unrecognised? */ 4390 - if (ctxt->d == 0 || (ctxt->d & Undefined)) 4376 + if (ctxt->d == 0 || (ctxt->d & NotImpl)) 4391 4377 return EMULATION_FAILED; 4392 4378 4393 4379 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) ··· 4525 4511 4526 4512 ctxt->mem_read.pos = 0; 4527 4513 4528 - if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { 4514 + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || 4515 + (ctxt->d & Undefined)) { 4529 4516 rc = emulate_ud(ctxt); 4530 4517 goto done; 4531 4518 }
+2 -2
arch/x86/kvm/i8254.c
···
 	}
 	spin_unlock(&ps->inject_lock);
 	if (inject) {
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);

 		/*
 		 * Provides NMI watchdog support via Virtual Wire mode.
+107 -82
arch/x86/kvm/lapic.c
··· 94 94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 95 95 } 96 96 97 + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) 98 + { 99 + struct kvm_lapic *apic = vcpu->arch.apic; 100 + 101 + return apic_test_vector(vector, apic->regs + APIC_ISR) || 102 + apic_test_vector(vector, apic->regs + APIC_IRR); 103 + } 104 + 97 105 static inline void apic_set_vector(int vec, void *bitmap) 98 106 { 99 107 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); ··· 151 143 static inline int kvm_apic_id(struct kvm_lapic *apic) 152 144 { 153 145 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 154 - } 155 - 156 - void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 157 - struct kvm_lapic_irq *irq, 158 - u64 *eoi_exit_bitmap) 159 - { 160 - struct kvm_lapic **dst; 161 - struct kvm_apic_map *map; 162 - unsigned long bitmap = 1; 163 - int i; 164 - 165 - rcu_read_lock(); 166 - map = rcu_dereference(vcpu->kvm->arch.apic_map); 167 - 168 - if (unlikely(!map)) { 169 - __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); 170 - goto out; 171 - } 172 - 173 - if (irq->dest_mode == 0) { /* physical mode */ 174 - if (irq->delivery_mode == APIC_DM_LOWEST || 175 - irq->dest_id == 0xff) { 176 - __set_bit(irq->vector, 177 - (unsigned long *)eoi_exit_bitmap); 178 - goto out; 179 - } 180 - dst = &map->phys_map[irq->dest_id & 0xff]; 181 - } else { 182 - u32 mda = irq->dest_id << (32 - map->ldr_bits); 183 - 184 - dst = map->logical_map[apic_cluster_id(map, mda)]; 185 - 186 - bitmap = apic_logical_id(map, mda); 187 - } 188 - 189 - for_each_set_bit(i, &bitmap, 16) { 190 - if (!dst[i]) 191 - continue; 192 - if (dst[i]->vcpu == vcpu) { 193 - __set_bit(irq->vector, 194 - (unsigned long *)eoi_exit_bitmap); 195 - break; 196 - } 197 - } 198 - 199 - out: 200 - rcu_read_unlock(); 201 146 } 202 147 203 148 static void recalculate_apic_map(struct kvm *kvm) ··· 217 256 if (old) 218 257 kfree_rcu(old, rcu); 219 258 220 - kvm_ioapic_make_eoibitmap_request(kvm); 259 + 
kvm_vcpu_request_scan_ioapic(kvm); 221 260 } 222 261 223 262 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) ··· 318 357 return count; 319 358 } 320 359 360 + void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) 361 + { 362 + u32 i, pir_val; 363 + struct kvm_lapic *apic = vcpu->arch.apic; 364 + 365 + for (i = 0; i <= 7; i++) { 366 + pir_val = xchg(&pir[i], 0); 367 + if (pir_val) 368 + *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; 369 + } 370 + } 371 + EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 372 + 321 373 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 322 374 { 323 375 apic->irr_pending = true; ··· 353 379 if (!apic->irr_pending) 354 380 return -1; 355 381 382 + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); 356 383 result = apic_search_irr(apic); 357 384 ASSERT(result == -1 || result >= 16); 358 385 ··· 406 431 } 407 432 408 433 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 409 - int vector, int level, int trig_mode); 434 + int vector, int level, int trig_mode, 435 + unsigned long *dest_map); 410 436 411 - int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) 437 + int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 438 + unsigned long *dest_map) 412 439 { 413 440 struct kvm_lapic *apic = vcpu->arch.apic; 414 441 415 442 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, 416 - irq->level, irq->trig_mode); 443 + irq->level, irq->trig_mode, dest_map); 417 444 } 418 445 419 446 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) ··· 480 503 ASSERT(result == -1 || result >= 16); 481 504 482 505 return result; 506 + } 507 + 508 + void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) 509 + { 510 + struct kvm_lapic *apic = vcpu->arch.apic; 511 + int i; 512 + 513 + for (i = 0; i < 8; i++) 514 + apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); 483 515 } 484 516 485 517 static void apic_update_ppr(struct kvm_lapic *apic) ··· 597 611 } 
598 612 599 613 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 600 - struct kvm_lapic_irq *irq, int *r) 614 + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) 601 615 { 602 616 struct kvm_apic_map *map; 603 617 unsigned long bitmap = 1; ··· 608 622 *r = -1; 609 623 610 624 if (irq->shorthand == APIC_DEST_SELF) { 611 - *r = kvm_apic_set_irq(src->vcpu, irq); 625 + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); 612 626 return true; 613 627 } 614 628 ··· 653 667 continue; 654 668 if (*r < 0) 655 669 *r = 0; 656 - *r += kvm_apic_set_irq(dst[i]->vcpu, irq); 670 + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); 657 671 } 658 672 659 673 ret = true; ··· 667 681 * Return 1 if successfully added and 0 if discarded. 668 682 */ 669 683 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 670 - int vector, int level, int trig_mode) 684 + int vector, int level, int trig_mode, 685 + unsigned long *dest_map) 671 686 { 672 687 int result = 0; 673 688 struct kvm_vcpu *vcpu = apic->vcpu; ··· 681 694 if (unlikely(!apic_enabled(apic))) 682 695 break; 683 696 684 - if (trig_mode) { 685 - apic_debug("level trig mode for vector %d", vector); 686 - apic_set_vector(vector, apic->regs + APIC_TMR); 687 - } else 688 - apic_clear_vector(vector, apic->regs + APIC_TMR); 697 + if (dest_map) 698 + __set_bit(vcpu->vcpu_id, dest_map); 689 699 690 - result = !apic_test_and_set_irr(vector, apic); 691 - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 692 - trig_mode, vector, !result); 693 - if (!result) { 694 - if (trig_mode) 695 - apic_debug("level trig mode repeatedly for " 696 - "vector %d", vector); 697 - break; 700 + if (kvm_x86_ops->deliver_posted_interrupt) { 701 + result = 1; 702 + kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 703 + } else { 704 + result = !apic_test_and_set_irr(vector, apic); 705 + 706 + if (!result) { 707 + if (trig_mode) 708 + apic_debug("level trig mode repeatedly " 709 + "for vector %d", 
vector); 710 + goto out; 711 + } 712 + 713 + kvm_make_request(KVM_REQ_EVENT, vcpu); 714 + kvm_vcpu_kick(vcpu); 698 715 } 699 - 700 - kvm_make_request(KVM_REQ_EVENT, vcpu); 701 - kvm_vcpu_kick(vcpu); 716 + out: 717 + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 718 + trig_mode, vector, !result); 702 719 break; 703 720 704 721 case APIC_DM_REMRD: ··· 722 731 case APIC_DM_INIT: 723 732 if (!trig_mode || level) { 724 733 result = 1; 725 - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 734 + /* assumes that there are only KVM_APIC_INIT/SIPI */ 735 + apic->pending_events = (1UL << KVM_APIC_INIT); 736 + /* make sure pending_events is visible before sending 737 + * the request */ 738 + smp_wmb(); 726 739 kvm_make_request(KVM_REQ_EVENT, vcpu); 727 740 kvm_vcpu_kick(vcpu); 728 741 } else { ··· 738 743 case APIC_DM_STARTUP: 739 744 apic_debug("SIPI to vcpu %d vector 0x%02x\n", 740 745 vcpu->vcpu_id, vector); 741 - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 742 - result = 1; 743 - vcpu->arch.sipi_vector = vector; 744 - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 745 - kvm_make_request(KVM_REQ_EVENT, vcpu); 746 - kvm_vcpu_kick(vcpu); 747 - } 746 + result = 1; 747 + apic->sipi_vector = vector; 748 + /* make sure sipi_vector is visible for the receiver */ 749 + smp_wmb(); 750 + set_bit(KVM_APIC_SIPI, &apic->pending_events); 751 + kvm_make_request(KVM_REQ_EVENT, vcpu); 752 + kvm_vcpu_kick(vcpu); 748 753 break; 749 754 750 755 case APIC_DM_EXTINT: ··· 777 782 trigger_mode = IOAPIC_LEVEL_TRIG; 778 783 else 779 784 trigger_mode = IOAPIC_EDGE_TRIG; 780 - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 785 + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); 781 786 } 782 787 } 783 788 ··· 843 848 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 844 849 irq.vector); 845 850 846 - kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 851 + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); 847 852 } 848 853 849 
854 static u32 apic_get_tmcct(struct kvm_lapic *apic) ··· 1479 1484 vector = reg & APIC_VECTOR_MASK; 1480 1485 mode = reg & APIC_MODE_MASK; 1481 1486 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; 1482 - return __apic_accept_irq(apic, mode, vector, 1, trig_mode); 1487 + return __apic_accept_irq(apic, mode, vector, 1, trig_mode, 1488 + NULL); 1483 1489 } 1484 1490 return 0; 1485 1491 } ··· 1650 1654 apic->highest_isr_cache = -1; 1651 1655 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); 1652 1656 kvm_make_request(KVM_REQ_EVENT, vcpu); 1657 + kvm_rtc_eoi_tracking_restore_one(vcpu); 1653 1658 } 1654 1659 1655 1660 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) ··· 1855 1858 return 0; 1856 1859 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, 1857 1860 addr, sizeof(u8)); 1861 + } 1862 + 1863 + void kvm_apic_accept_events(struct kvm_vcpu *vcpu) 1864 + { 1865 + struct kvm_lapic *apic = vcpu->arch.apic; 1866 + unsigned int sipi_vector; 1867 + 1868 + if (!kvm_vcpu_has_lapic(vcpu)) 1869 + return; 1870 + 1871 + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { 1872 + kvm_lapic_reset(vcpu); 1873 + kvm_vcpu_reset(vcpu); 1874 + if (kvm_vcpu_is_bsp(apic->vcpu)) 1875 + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 1876 + else 1877 + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 1878 + } 1879 + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && 1880 + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 1881 + /* evaluate pending_events before reading the vector */ 1882 + smp_rmb(); 1883 + sipi_vector = apic->sipi_vector; 1884 + pr_debug("vcpu %d received sipi with vector # %x\n", 1885 + vcpu->vcpu_id, sipi_vector); 1886 + kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); 1887 + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 1888 + } 1858 1889 } 1859 1890 1860 1891 void kvm_lapic_init(void)
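Aside on the `kvm_apic_update_irr()` hunk above: posted interrupts land in an eight-word PIR bitmap, and each word is atomically fetched-and-cleared (`xchg()` in the kernel) before being OR-ed into the IRR so a concurrent sender can never lose a bit. A minimal userspace model of that drain loop, with illustrative names (only the 8×32-bit layout is taken from the patch):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Toy model of kvm_apic_update_irr(): the 256 possible vectors live
 * in eight 32-bit words.  Each PIR word is atomically swapped with 0
 * (the kernel uses xchg()) and OR-ed into the IRR, so bits set by a
 * concurrent sender after the swap stay in the PIR for the next pass.
 */
static void pir_to_irr(uint32_t pir[8], uint32_t irr[8])
{
    for (int i = 0; i < 8; i++) {
        /* __atomic_exchange_n plays the role of the kernel's xchg() */
        uint32_t val = __atomic_exchange_n(&pir[i], 0, __ATOMIC_SEQ_CST);
        if (val)
            irr[i] |= val;
    }
}
```

Vector `v` lives in word `v / 32`, bit `v % 32`; after the drain the PIR word is empty and the IRR bit is set.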
+17 -5
arch/x86/kvm/lapic.h
··· 5 5 6 6 #include <linux/kvm_host.h> 7 7 8 + #define KVM_APIC_INIT 0 9 + #define KVM_APIC_SIPI 1 10 + 8 11 struct kvm_timer { 9 12 struct hrtimer timer; 10 13 s64 period; /* unit: ns */ ··· 35 32 void *regs; 36 33 gpa_t vapic_addr; 37 34 struct page *vapic_page; 35 + unsigned long pending_events; 36 + unsigned int sipi_vector; 38 37 }; 39 38 int kvm_create_lapic(struct kvm_vcpu *vcpu); 40 39 void kvm_free_lapic(struct kvm_vcpu *vcpu); ··· 44 39 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); 45 40 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); 46 41 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); 42 + void kvm_apic_accept_events(struct kvm_vcpu *vcpu); 47 43 void kvm_lapic_reset(struct kvm_vcpu *vcpu); 48 44 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 49 45 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); ··· 53 47 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 54 48 void kvm_apic_set_version(struct kvm_vcpu *vcpu); 55 49 50 + void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); 51 + void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); 56 52 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 57 53 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 58 - int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); 54 + int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 55 + unsigned long *dest_map); 59 56 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 60 57 61 58 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 62 - struct kvm_lapic_irq *irq, int *r); 59 + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); 63 60 64 61 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 65 62 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); ··· 163 154 return ldr & map->lid_mask; 164 155 } 165 156 166 - void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 167 - struct kvm_lapic_irq *irq, 168 - u64 *eoi_bitmap); 157 + 
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) 158 + { 159 + return vcpu->arch.apic->pending_events; 160 + } 161 + 162 + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); 169 163 170 164 #endif
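The `pending_events` / `sipi_vector` fields added above replace the old `mp_state` handshake: the sender publishes the vector, then sets the `KVM_APIC_SIPI` bit after an `smp_wmb()`, and `kvm_apic_accept_events()` pairs that with `smp_rmb()` before reading the vector. A C11-atomics sketch of that ordering contract (the flag indices are the kernel's, everything else is illustrative):

```c
#include <assert.h>
#include <stdatomic.h>

#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1

struct lapic_model {
    unsigned int sipi_vector;       /* payload, published first */
    atomic_ulong pending_events;    /* flag bits, set last */
};

/* Sender side: write the vector, then set the bit with release
 * semantics (smp_wmb() + set_bit() in the kernel). */
static void send_sipi(struct lapic_model *apic, unsigned int vector)
{
    apic->sipi_vector = vector;
    atomic_fetch_or_explicit(&apic->pending_events,
                             1UL << KVM_APIC_SIPI, memory_order_release);
}

/* Receiver side: test-and-clear the bit with acquire semantics
 * (test_and_clear_bit() + smp_rmb()), only then read the vector.
 * Returns -1 if no SIPI was pending. */
static int accept_sipi(struct lapic_model *apic)
{
    unsigned long old = atomic_fetch_and_explicit(
        &apic->pending_events, ~(1UL << KVM_APIC_SIPI),
        memory_order_acquire);
    if (!(old & (1UL << KVM_APIC_SIPI)))
        return -1;
    return (int)apic->sipi_vector;
}
```

The release/acquire pair guarantees the receiver never observes the bit without also observing the vector written before it.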
+61 -47
arch/x86/kvm/mmu.c
··· 199 199 200 200 static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) 201 201 { 202 + struct kvm_mmu_page *sp = page_header(__pa(sptep)); 203 + 202 204 access &= ACC_WRITE_MASK | ACC_USER_MASK; 203 205 206 + sp->mmio_cached = true; 204 207 trace_mark_mmio_spte(sptep, gfn, access); 205 208 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); 206 209 } ··· 1505 1502 u64 *parent_pte, int direct) 1506 1503 { 1507 1504 struct kvm_mmu_page *sp; 1505 + 1508 1506 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 1509 1507 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1510 1508 if (!direct) ··· 1648 1644 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1649 1645 struct list_head *invalid_list); 1650 1646 1651 - #define for_each_gfn_sp(kvm, sp, gfn) \ 1652 - hlist_for_each_entry(sp, \ 1653 - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1654 - if ((sp)->gfn != (gfn)) {} else 1647 + #define for_each_gfn_sp(_kvm, _sp, _gfn) \ 1648 + hlist_for_each_entry(_sp, \ 1649 + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ 1650 + if ((_sp)->gfn != (_gfn)) {} else 1655 1651 1656 - #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \ 1657 - hlist_for_each_entry(sp, \ 1658 - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1659 - if ((sp)->gfn != (gfn) || (sp)->role.direct || \ 1660 - (sp)->role.invalid) {} else 1652 + #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ 1653 + for_each_gfn_sp(_kvm, _sp, _gfn) \ 1654 + if ((_sp)->role.direct || (_sp)->role.invalid) {} else 1661 1655 1662 1656 /* @sp->gfn should be write-protected at the call site */ 1663 1657 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, ··· 2091 2089 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2092 2090 struct list_head *invalid_list) 2093 2091 { 2094 - struct kvm_mmu_page *sp; 2092 + struct kvm_mmu_page *sp, *nsp; 2095 2093 2096 2094 if 
(list_empty(invalid_list)) 2097 2095 return; ··· 2108 2106 */ 2109 2107 kvm_flush_remote_tlbs(kvm); 2110 2108 2111 - do { 2112 - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2109 + list_for_each_entry_safe(sp, nsp, invalid_list, link) { 2113 2110 WARN_ON(!sp->role.invalid || sp->root_count); 2114 2111 kvm_mmu_free_page(sp); 2115 - } while (!list_empty(invalid_list)); 2112 + } 2113 + } 2114 + 2115 + static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, 2116 + struct list_head *invalid_list) 2117 + { 2118 + struct kvm_mmu_page *sp; 2119 + 2120 + if (list_empty(&kvm->arch.active_mmu_pages)) 2121 + return false; 2122 + 2123 + sp = list_entry(kvm->arch.active_mmu_pages.prev, 2124 + struct kvm_mmu_page, link); 2125 + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2126 + 2127 + return true; 2116 2128 } 2117 2129 2118 2130 /* ··· 2136 2120 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) 2137 2121 { 2138 2122 LIST_HEAD(invalid_list); 2139 - /* 2140 - * If we set the number of mmu pages to be smaller be than the 2141 - * number of actived pages , we must to free some mmu pages before we 2142 - * change the value 2143 - */ 2144 2123 2145 2124 spin_lock(&kvm->mmu_lock); 2146 2125 2147 2126 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2148 - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && 2149 - !list_empty(&kvm->arch.active_mmu_pages)) { 2150 - struct kvm_mmu_page *page; 2127 + /* Need to free some mmu pages to achieve the goal. 
*/ 2128 + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) 2129 + if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) 2130 + break; 2151 2131 2152 - page = container_of(kvm->arch.active_mmu_pages.prev, 2153 - struct kvm_mmu_page, link); 2154 - kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); 2155 - } 2156 2132 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2157 2133 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 2158 2134 } ··· 2802 2794 2803 2795 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2804 2796 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2797 + static void make_mmu_pages_available(struct kvm_vcpu *vcpu); 2805 2798 2806 2799 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 2807 2800 gfn_t gfn, bool prefault) ··· 2844 2835 spin_lock(&vcpu->kvm->mmu_lock); 2845 2836 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 2846 2837 goto out_unlock; 2847 - kvm_mmu_free_some_pages(vcpu); 2838 + make_mmu_pages_available(vcpu); 2848 2839 if (likely(!force_pt_level)) 2849 2840 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 2850 2841 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, ··· 2922 2913 2923 2914 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2924 2915 spin_lock(&vcpu->kvm->mmu_lock); 2925 - kvm_mmu_free_some_pages(vcpu); 2916 + make_mmu_pages_available(vcpu); 2926 2917 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 2927 2918 1, ACC_ALL, NULL); 2928 2919 ++sp->root_count; ··· 2934 2925 2935 2926 ASSERT(!VALID_PAGE(root)); 2936 2927 spin_lock(&vcpu->kvm->mmu_lock); 2937 - kvm_mmu_free_some_pages(vcpu); 2928 + make_mmu_pages_available(vcpu); 2938 2929 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 2939 2930 i << 30, 2940 2931 PT32_ROOT_LEVEL, 1, ACC_ALL, ··· 2973 2964 ASSERT(!VALID_PAGE(root)); 2974 2965 2975 2966 spin_lock(&vcpu->kvm->mmu_lock); 2976 - kvm_mmu_free_some_pages(vcpu); 2967 + make_mmu_pages_available(vcpu); 2977 2968 sp = kvm_mmu_get_page(vcpu, 
root_gfn, 0, PT64_ROOT_LEVEL, 2978 2969 0, ACC_ALL, NULL); 2979 2970 root = __pa(sp->spt); ··· 3007 2998 return 1; 3008 2999 } 3009 3000 spin_lock(&vcpu->kvm->mmu_lock); 3010 - kvm_mmu_free_some_pages(vcpu); 3001 + make_mmu_pages_available(vcpu); 3011 3002 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 3012 3003 PT32_ROOT_LEVEL, 0, 3013 3004 ACC_ALL, NULL); ··· 3313 3304 spin_lock(&vcpu->kvm->mmu_lock); 3314 3305 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3315 3306 goto out_unlock; 3316 - kvm_mmu_free_some_pages(vcpu); 3307 + make_mmu_pages_available(vcpu); 3317 3308 if (likely(!force_pt_level)) 3318 3309 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3319 3310 r = __direct_map(vcpu, gpa, write, map_writable, ··· 4015 4006 } 4016 4007 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 4017 4008 4018 - void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 4009 + static void make_mmu_pages_available(struct kvm_vcpu *vcpu) 4019 4010 { 4020 4011 LIST_HEAD(invalid_list); 4021 4012 4022 - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && 4023 - !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 4024 - struct kvm_mmu_page *sp; 4013 + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) 4014 + return; 4025 4015 4026 - sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 4027 - struct kvm_mmu_page, link); 4028 - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 4016 + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { 4017 + if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) 4018 + break; 4019 + 4029 4020 ++vcpu->kvm->stat.mmu_recycled; 4030 4021 } 4031 4022 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); ··· 4194 4185 spin_unlock(&kvm->mmu_lock); 4195 4186 } 4196 4187 4197 - static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 4198 - struct list_head *invalid_list) 4188 + void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) 4199 4189 { 4200 - struct kvm_mmu_page *page; 4190 + struct kvm_mmu_page 
*sp, *node; 4191 + LIST_HEAD(invalid_list); 4201 4192 4202 - if (list_empty(&kvm->arch.active_mmu_pages)) 4203 - return; 4193 + spin_lock(&kvm->mmu_lock); 4194 + restart: 4195 + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { 4196 + if (!sp->mmio_cached) 4197 + continue; 4198 + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) 4199 + goto restart; 4200 + } 4204 4201 4205 - page = container_of(kvm->arch.active_mmu_pages.prev, 4206 - struct kvm_mmu_page, link); 4207 - kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 4202 + kvm_mmu_commit_zap_page(kvm, &invalid_list); 4203 + spin_unlock(&kvm->mmu_lock); 4208 4204 } 4209 4205 4210 4206 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) ··· 4246 4232 idx = srcu_read_lock(&kvm->srcu); 4247 4233 spin_lock(&kvm->mmu_lock); 4248 4234 4249 - kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); 4235 + prepare_zap_oldest_mmu_page(kvm, &invalid_list); 4250 4236 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4251 4237 4252 4238 spin_unlock(&kvm->mmu_lock);
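The mmu.c refactor above funnels both reclaim paths (`kvm_mmu_change_mmu_pages()` and the shrinker) through one helper, `prepare_zap_oldest_mmu_page()`, which takes the least-recently-used page off the active list and returns `false` once the list is empty. A small model of that shared loop-termination rule, with an array standing in for the kernel's `list_head` (all names and sizes here are illustrative):

```c
#include <assert.h>
#include <stdbool.h>

#define MAX_PAGES 8

struct mmu_model {
    int active[MAX_PAGES];   /* active[0] is the oldest page id */
    int n_active;
    int zapped[MAX_PAGES];   /* stands in for the invalid_list */
    int n_zapped;
};

/* Move the oldest active page to the zap list; false when empty. */
static bool zap_oldest(struct mmu_model *m)
{
    if (m->n_active == 0)
        return false;

    m->zapped[m->n_zapped++] = m->active[0];
    for (int i = 1; i < m->n_active; i++)
        m->active[i - 1] = m->active[i];
    m->n_active--;
    return true;
}

/* Shrink until at most 'goal' pages remain, the way the rewritten
 * kvm_mmu_change_mmu_pages() loop does. */
static void shrink_to(struct mmu_model *m, int goal)
{
    while (m->n_active > goal)
        if (!zap_oldest(m))
            break;
}
```

Both callers stop either at their goal or when the helper reports an empty list, so the empty-list check no longer has to be duplicated at each call site.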
+4 -7
arch/x86/kvm/mmu.h
··· 57 57 58 58 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 59 59 { 60 - return kvm->arch.n_max_mmu_pages - 61 - kvm->arch.n_used_mmu_pages; 62 - } 60 + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) 61 + return kvm->arch.n_max_mmu_pages - 62 + kvm->arch.n_used_mmu_pages; 63 63 64 - static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 65 - { 66 - if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) 67 - __kvm_mmu_free_some_pages(vcpu); 64 + return 0; 68 65 } 69 66 70 67 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
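The mmu.h hunk above is an unsigned-underflow guard: `n_used_mmu_pages` can transiently exceed `n_max_mmu_pages`, and the old unconditional subtraction would then wrap to a huge value instead of going negative. A minimal illustration (field names mirror the kernel's, the struct itself is not kernel code):

```c
#include <assert.h>

struct mmu_counts {
    unsigned int n_max_mmu_pages;
    unsigned int n_used_mmu_pages;
};

/* Guarded form from the patch: clamp to 0 instead of wrapping. */
static unsigned int available_pages(const struct mmu_counts *c)
{
    if (c->n_max_mmu_pages > c->n_used_mmu_pages)
        return c->n_max_mmu_pages - c->n_used_mmu_pages;
    return 0;   /* the raw subtraction would wrap near 2^32 here */
}
```

Without the guard, a caller comparing the result against `KVM_MIN_FREE_MMU_PAGES` would conclude billions of pages are free exactly when the cache is over budget.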
+1 -1
arch/x86/kvm/paging_tmpl.h
··· 627 627 goto out_unlock; 628 628 629 629 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 630 - kvm_mmu_free_some_pages(vcpu); 630 + make_mmu_pages_available(vcpu); 631 631 if (!force_pt_level) 632 632 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 633 633 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
+11 -3
arch/x86/kvm/pmu.c
··· 360 360 return 1; 361 361 } 362 362 363 - int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) 363 + int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 364 364 { 365 365 struct kvm_pmu *pmu = &vcpu->arch.pmu; 366 366 struct kvm_pmc *pmc; 367 + u32 index = msr_info->index; 368 + u64 data = msr_info->data; 367 369 368 370 switch (index) { 369 371 case MSR_CORE_PERF_FIXED_CTR_CTRL: ··· 377 375 } 378 376 break; 379 377 case MSR_CORE_PERF_GLOBAL_STATUS: 378 + if (msr_info->host_initiated) { 379 + pmu->global_status = data; 380 + return 0; 381 + } 380 382 break; /* RO MSR */ 381 383 case MSR_CORE_PERF_GLOBAL_CTRL: 382 384 if (pmu->global_ctrl == data) ··· 392 386 break; 393 387 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 394 388 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { 395 - pmu->global_status &= ~data; 389 + if (!msr_info->host_initiated) 390 + pmu->global_status &= ~data; 396 391 pmu->global_ovf_ctrl = data; 397 392 return 0; 398 393 } ··· 401 394 default: 402 395 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || 403 396 (pmc = get_fixed_pmc(pmu, index))) { 404 - data = (s64)(s32)data; 397 + if (!msr_info->host_initiated) 398 + data = (s64)(s32)data; 405 399 pmc->counter += data - read_pmc(pmc); 406 400 return 0; 407 401 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
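The pmu.c change above threads a `host_initiated` flag through `kvm_pmu_set_msr()` so that migration restore can write MSRs that are read-only from the guest's point of view (e.g. `MSR_CORE_PERF_GLOBAL_STATUS`). A stripped-down model of that dispatch, handling only the GLOBAL_STATUS case (the struct and return convention are illustrative, not the kernel's):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PERF_GLOBAL_STATUS 0x38e   /* IA32_PERF_GLOBAL_STATUS index */

struct pmu_model {
    uint64_t global_status;
};

/* Returns 0 on success, 1 if the write is rejected (guest #GP). */
static int pmu_set_msr(struct pmu_model *pmu, uint32_t index,
                       uint64_t data, bool host_initiated)
{
    switch (index) {
    case PERF_GLOBAL_STATUS:
        if (host_initiated) {
            pmu->global_status = data;   /* restoring saved state */
            return 0;
        }
        return 1;                        /* RO MSR for the guest */
    default:
        return 1;
    }
}
```

The same flag also explains the other hunks: host-initiated counter writes skip the sign-extension a guest write gets, because the host supplies the full saved 64-bit value.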
+20 -20
arch/x86/kvm/svm.c
··· 1131 1131 init_seg(&save->gs); 1132 1132 1133 1133 save->cs.selector = 0xf000; 1134 + save->cs.base = 0xffff0000; 1134 1135 /* Executable/Readable Code Segment */ 1135 1136 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | 1136 1137 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; 1137 1138 save->cs.limit = 0xffff; 1138 - /* 1139 - * cs.base should really be 0xffff0000, but vmx can't handle that, so 1140 - * be consistent with it. 1141 - * 1142 - * Replace when we have real mode working for vmx. 1143 - */ 1144 - save->cs.base = 0xf0000; 1145 1139 1146 1140 save->gdtr.limit = 0xffff; 1147 1141 save->idtr.limit = 0xffff; ··· 1185 1191 enable_gif(svm); 1186 1192 } 1187 1193 1188 - static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1194 + static void svm_vcpu_reset(struct kvm_vcpu *vcpu) 1189 1195 { 1190 1196 struct vcpu_svm *svm = to_svm(vcpu); 1191 1197 u32 dummy; ··· 1193 1199 1194 1200 init_vmcb(svm); 1195 1201 1196 - if (!kvm_vcpu_is_bsp(vcpu)) { 1197 - kvm_rip_write(vcpu, 0); 1198 - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 1199 - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 1200 - } 1201 - 1202 1202 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1203 1203 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1204 - 1205 - return 0; 1206 1204 } 1207 1205 1208 1206 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) ··· 3473 3487 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3474 3488 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 3475 3489 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 3476 - printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 3490 + printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 3477 3491 "exit_code 0x%x\n", 3478 3492 __func__, svm->vmcb->control.exit_int_info, 3479 3493 exit_code); ··· 3577 3591 return; 3578 3592 } 3579 3593 3594 + static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) 3595 + { 3596 + return; 3597 + } 3598 + 3580 3599 static int 
svm_nmi_allowed(struct kvm_vcpu *vcpu) 3581 3600 { 3582 3601 struct vcpu_svm *svm = to_svm(vcpu); ··· 3632 3641 return ret; 3633 3642 } 3634 3643 3635 - static void enable_irq_window(struct kvm_vcpu *vcpu) 3644 + static int enable_irq_window(struct kvm_vcpu *vcpu) 3636 3645 { 3637 3646 struct vcpu_svm *svm = to_svm(vcpu); 3638 3647 ··· 3646 3655 svm_set_vintr(svm); 3647 3656 svm_inject_irq(svm, 0x0); 3648 3657 } 3658 + return 0; 3649 3659 } 3650 3660 3651 - static void enable_nmi_window(struct kvm_vcpu *vcpu) 3661 + static int enable_nmi_window(struct kvm_vcpu *vcpu) 3652 3662 { 3653 3663 struct vcpu_svm *svm = to_svm(vcpu); 3654 3664 3655 3665 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) 3656 3666 == HF_NMI_MASK) 3657 - return; /* IRET will cause a vm exit */ 3667 + return 0; /* IRET will cause a vm exit */ 3658 3668 3659 3669 /* 3660 3670 * Something prevents NMI from been injected. Single step over possible ··· 3664 3672 svm->nmi_singlestep = true; 3665 3673 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3666 3674 update_db_bp_intercept(vcpu); 3675 + return 0; 3667 3676 } 3668 3677 3669 3678 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) ··· 4240 4247 return ret; 4241 4248 } 4242 4249 4250 + static void svm_handle_external_intr(struct kvm_vcpu *vcpu) 4251 + { 4252 + local_irq_enable(); 4253 + } 4254 + 4243 4255 static struct kvm_x86_ops svm_x86_ops = { 4244 4256 .cpu_has_kvm_support = has_svm, 4245 4257 .disabled_by_bios = is_disabled, ··· 4312 4314 .vm_has_apicv = svm_vm_has_apicv, 4313 4315 .load_eoi_exitmap = svm_load_eoi_exitmap, 4314 4316 .hwapic_isr_update = svm_hwapic_isr_update, 4317 + .sync_pir_to_irr = svm_sync_pir_to_irr, 4315 4318 4316 4319 .set_tss_addr = svm_set_tss_addr, 4317 4320 .get_tdp_level = get_npt_level, ··· 4341 4342 .set_tdp_cr3 = set_tdp_cr3, 4342 4343 4343 4344 .check_intercept = svm_check_intercept, 4345 + .handle_external_intr = svm_handle_external_intr, 4344 4346 }; 4345 4347 4346 4348 
static int __init svm_init(void)
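The svm.c hunk above drops the old `cs.base = 0xf0000` workaround and uses the architectural reset value `0xffff0000`. With CS.selector = 0xf000, CS.base = 0xffff0000 and RIP = 0xfff0, the first instruction fetch lands at the x86 reset vector, 16 bytes below 4 GiB, rather than in the legacy BIOS shadow region. A trivial check of that real-mode address arithmetic:

```c
#include <assert.h>
#include <stdint.h>

/* real-mode linear address = segment base + offset */
static uint32_t first_fetch(uint32_t cs_base, uint16_t rip)
{
    return cs_base + rip;
}
```

Note the cached CS base at reset is deliberately inconsistent with `selector << 4`; it stays valid until the guest's first far jump reloads CS.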
+834 -243
arch/x86/kvm/vmx.c
··· 84 84 static bool __read_mostly fasteoi = 1; 85 85 module_param(fasteoi, bool, S_IRUGO); 86 86 87 - static bool __read_mostly enable_apicv_reg_vid; 87 + static bool __read_mostly enable_apicv = 1; 88 + module_param(enable_apicv, bool, S_IRUGO); 88 89 90 + static bool __read_mostly enable_shadow_vmcs = 1; 91 + module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 89 92 /* 90 93 * If nested=1, nested virtualization is supported, i.e., guests may use 91 94 * VMX and be a hypervisor for its own guests. If nested=0, guests may not ··· 301 298 u32 guest_activity_state; 302 299 u32 guest_sysenter_cs; 303 300 u32 host_ia32_sysenter_cs; 304 - u32 padding32[8]; /* room for future expansion */ 301 + u32 vmx_preemption_timer_value; 302 + u32 padding32[7]; /* room for future expansion */ 305 303 u16 virtual_processor_id; 306 304 u16 guest_es_selector; 307 305 u16 guest_cs_selector; ··· 355 351 /* The host-usable pointer to the above */ 356 352 struct page *current_vmcs12_page; 357 353 struct vmcs12 *current_vmcs12; 354 + struct vmcs *current_shadow_vmcs; 355 + /* 356 + * Indicates if the shadow vmcs must be updated with the 357 + * data hold by vmcs12 358 + */ 359 + bool sync_shadow_vmcs; 358 360 359 361 /* vmcs02_list cache of VMCSs recently used to run L2 guests */ 360 362 struct list_head vmcs02_pool; ··· 375 365 struct page *apic_access_page; 376 366 }; 377 367 368 + #define POSTED_INTR_ON 0 369 + /* Posted-Interrupt Descriptor */ 370 + struct pi_desc { 371 + u32 pir[8]; /* Posted interrupt requested */ 372 + u32 control; /* bit 0 of control is outstanding notification bit */ 373 + u32 rsvd[7]; 374 + } __aligned(64); 375 + 376 + static bool pi_test_and_set_on(struct pi_desc *pi_desc) 377 + { 378 + return test_and_set_bit(POSTED_INTR_ON, 379 + (unsigned long *)&pi_desc->control); 380 + } 381 + 382 + static bool pi_test_and_clear_on(struct pi_desc *pi_desc) 383 + { 384 + return test_and_clear_bit(POSTED_INTR_ON, 385 + (unsigned long 
*)&pi_desc->control); 386 + } 387 + 388 + static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) 389 + { 390 + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); 391 + } 392 + 378 393 struct vcpu_vmx { 379 394 struct kvm_vcpu vcpu; 380 395 unsigned long host_rsp; ··· 412 377 struct shared_msr_entry *guest_msrs; 413 378 int nmsrs; 414 379 int save_nmsrs; 380 + unsigned long host_idt_base; 415 381 #ifdef CONFIG_X86_64 416 382 u64 msr_host_kernel_gs_base; 417 383 u64 msr_guest_kernel_gs_base; ··· 464 428 465 429 bool rdtscp_enabled; 466 430 431 + /* Posted interrupt descriptor */ 432 + struct pi_desc pi_desc; 433 + 467 434 /* Support for a guest hypervisor (nested VMX) */ 468 435 struct nested_vmx nested; 469 436 }; ··· 489 450 #define FIELD(number, name) [number] = VMCS12_OFFSET(name) 490 451 #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 491 452 [number##_HIGH] = VMCS12_OFFSET(name)+4 453 + 454 + 455 + static const unsigned long shadow_read_only_fields[] = { 456 + /* 457 + * We do NOT shadow fields that are modified when L0 458 + * traps and emulates any vmx instruction (e.g. VMPTRLD, 459 + * VMXON...) executed by L1. 460 + * For example, VM_INSTRUCTION_ERROR is read 461 + * by L1 if a vmx instruction fails (part of the error path). 462 + * Note the code assumes this logic. If for some reason 463 + * we start shadowing these fields then we need to 464 + * force a shadow sync when L0 emulates vmx instructions 465 + * (e.g. 
force a sync if VM_INSTRUCTION_ERROR is modified 466 + * by nested_vmx_failValid) 467 + */ 468 + VM_EXIT_REASON, 469 + VM_EXIT_INTR_INFO, 470 + VM_EXIT_INSTRUCTION_LEN, 471 + IDT_VECTORING_INFO_FIELD, 472 + IDT_VECTORING_ERROR_CODE, 473 + VM_EXIT_INTR_ERROR_CODE, 474 + EXIT_QUALIFICATION, 475 + GUEST_LINEAR_ADDRESS, 476 + GUEST_PHYSICAL_ADDRESS 477 + }; 478 + static const int max_shadow_read_only_fields = 479 + ARRAY_SIZE(shadow_read_only_fields); 480 + 481 + static const unsigned long shadow_read_write_fields[] = { 482 + GUEST_RIP, 483 + GUEST_RSP, 484 + GUEST_CR0, 485 + GUEST_CR3, 486 + GUEST_CR4, 487 + GUEST_INTERRUPTIBILITY_INFO, 488 + GUEST_RFLAGS, 489 + GUEST_CS_SELECTOR, 490 + GUEST_CS_AR_BYTES, 491 + GUEST_CS_LIMIT, 492 + GUEST_CS_BASE, 493 + GUEST_ES_BASE, 494 + CR0_GUEST_HOST_MASK, 495 + CR0_READ_SHADOW, 496 + CR4_READ_SHADOW, 497 + TSC_OFFSET, 498 + EXCEPTION_BITMAP, 499 + CPU_BASED_VM_EXEC_CONTROL, 500 + VM_ENTRY_EXCEPTION_ERROR_CODE, 501 + VM_ENTRY_INTR_INFO_FIELD, 502 + VM_ENTRY_INSTRUCTION_LEN, 503 + VM_ENTRY_EXCEPTION_ERROR_CODE, 504 + HOST_FS_BASE, 505 + HOST_GS_BASE, 506 + HOST_FS_SELECTOR, 507 + HOST_GS_SELECTOR 508 + }; 509 + static const int max_shadow_read_write_fields = 510 + ARRAY_SIZE(shadow_read_write_fields); 492 511 493 512 static const unsigned short vmcs_field_to_offset_table[] = { 494 513 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), ··· 634 537 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), 635 538 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), 636 539 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), 540 + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), 637 541 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), 638 542 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), 639 543 FIELD(CR0_READ_SHADOW, cr0_read_shadow), ··· 722 624 struct kvm_segment *var, int seg); 723 625 static bool guest_state_valid(struct kvm_vcpu *vcpu); 724 626 static u32 vmx_segment_access_rights(struct kvm_segment *var); 627 + static void 
vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); 628 + static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); 629 + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); 725 630 726 631 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 727 632 static DEFINE_PER_CPU(struct vmcs *, current_vmcs); ··· 741 640 static unsigned long *vmx_msr_bitmap_longmode; 742 641 static unsigned long *vmx_msr_bitmap_legacy_x2apic; 743 642 static unsigned long *vmx_msr_bitmap_longmode_x2apic; 643 + static unsigned long *vmx_vmread_bitmap; 644 + static unsigned long *vmx_vmwrite_bitmap; 744 645 745 646 static bool cpu_has_load_ia32_efer; 746 647 static bool cpu_has_load_perf_global_ctrl; ··· 885 782 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 886 783 } 887 784 785 + static inline bool cpu_has_vmx_posted_intr(void) 786 + { 787 + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; 788 + } 789 + 790 + static inline bool cpu_has_vmx_apicv(void) 791 + { 792 + return cpu_has_vmx_apic_register_virt() && 793 + cpu_has_vmx_virtual_intr_delivery() && 794 + cpu_has_vmx_posted_intr(); 795 + } 796 + 888 797 static inline bool cpu_has_vmx_flexpriority(void) 889 798 { 890 799 return cpu_has_vmx_tpr_shadow() && ··· 1008 893 { 1009 894 return vmcs_config.cpu_based_2nd_exec_ctrl & 1010 895 SECONDARY_EXEC_WBINVD_EXITING; 896 + } 897 + 898 + static inline bool cpu_has_vmx_shadow_vmcs(void) 899 + { 900 + u64 vmx_msr; 901 + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); 902 + /* check if the cpu supports writing r/o exit information fields */ 903 + if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) 904 + return false; 905 + 906 + return vmcs_config.cpu_based_2nd_exec_ctrl & 907 + SECONDARY_EXEC_SHADOW_VMCS; 1011 908 } 1012 909 1013 910 static inline bool report_flexpriority(void) ··· 1917 1790 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1918 1791 1919 1792 if (nr == PF_VECTOR && is_guest_mode(vcpu) && 1920 - nested_pf_handled(vcpu)) 1793 + !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) 
1921 1794 return; 1922 1795 1923 1796 if (has_error_code) { ··· 2149 2022 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; 2150 2023 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2151 2024 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2025 + static u32 nested_vmx_misc_low, nested_vmx_misc_high; 2152 2026 static __init void nested_vmx_setup_ctls_msrs(void) 2153 2027 { 2154 2028 /* ··· 2168 2040 */ 2169 2041 2170 2042 /* pin-based controls */ 2043 + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2044 + nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); 2171 2045 /* 2172 2046 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is 2173 2047 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. 2174 2048 */ 2175 - nested_vmx_pinbased_ctls_low = 0x16 ; 2176 - nested_vmx_pinbased_ctls_high = 0x16 | 2177 - PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | 2178 - PIN_BASED_VIRTUAL_NMIS; 2049 + nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2050 + nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | 2051 + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | 2052 + PIN_BASED_VMX_PREEMPTION_TIMER; 2053 + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2179 2054 2180 - /* exit controls */ 2181 - nested_vmx_exit_ctls_low = 0; 2055 + /* 2056 + * Exit controls 2057 + * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and 2058 + * 17 must be 1. 2059 + */ 2060 + nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2182 2061 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. 
*/ 2183 2062 #ifdef CONFIG_X86_64 2184 2063 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2185 2064 #else 2186 2065 nested_vmx_exit_ctls_high = 0; 2187 2066 #endif 2067 + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2188 2068 2189 2069 /* entry controls */ 2190 2070 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2191 2071 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); 2192 - nested_vmx_entry_ctls_low = 0; 2072 + /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ 2073 + nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2193 2074 nested_vmx_entry_ctls_high &= 2194 2075 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2076 + nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2195 2077 2196 2078 /* cpu-based controls */ 2197 2079 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ··· 2218 2080 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 2219 2081 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 2220 2082 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | 2083 + CPU_BASED_PAUSE_EXITING | 2221 2084 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2222 2085 /* 2223 2086 * We can allow some features even when not supported by the ··· 2233 2094 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); 2234 2095 nested_vmx_secondary_ctls_low = 0; 2235 2096 nested_vmx_secondary_ctls_high &= 2236 - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2097 + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2098 + SECONDARY_EXEC_WBINVD_EXITING; 2099 + 2100 + /* miscellaneous data */ 2101 + rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2102 + nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | 2103 + VMX_MISC_SAVE_EFER_LMA; 2104 + nested_vmx_misc_high = 0; 2237 2105 } 2238 2106 2239 2107 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) ··· 2311 2165 nested_vmx_entry_ctls_high); 2312 2166 break; 2313 2167 case MSR_IA32_VMX_MISC: 2314 - *pdata = 0; 2168 + *pdata = 
vmx_control_msr(nested_vmx_misc_low, 2169 + nested_vmx_misc_high); 2315 2170 break; 2316 2171 /* 2317 2172 * These MSRs specify bits which the guest must keep fixed (on or off) ··· 2676 2529 u32 _vmexit_control = 0; 2677 2530 u32 _vmentry_control = 0; 2678 2531 2679 - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 2680 - opt = PIN_BASED_VIRTUAL_NMIS; 2681 - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 2682 - &_pin_based_exec_control) < 0) 2683 - return -EIO; 2684 - 2685 2532 min = CPU_BASED_HLT_EXITING | 2686 2533 #ifdef CONFIG_X86_64 2687 2534 CPU_BASED_CR8_LOAD_EXITING | ··· 2714 2573 SECONDARY_EXEC_RDTSCP | 2715 2574 SECONDARY_EXEC_ENABLE_INVPCID | 2716 2575 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2717 - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 2576 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2577 + SECONDARY_EXEC_SHADOW_VMCS; 2718 2578 if (adjust_vmx_controls(min2, opt2, 2719 2579 MSR_IA32_VMX_PROCBASED_CTLS2, 2720 2580 &_cpu_based_2nd_exec_control) < 0) ··· 2747 2605 #ifdef CONFIG_X86_64 2748 2606 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2749 2607 #endif 2750 - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; 2608 + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | 2609 + VM_EXIT_ACK_INTR_ON_EXIT; 2751 2610 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 2752 2611 &_vmexit_control) < 0) 2753 2612 return -EIO; 2613 + 2614 + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 2615 + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; 2616 + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 2617 + &_pin_based_exec_control) < 0) 2618 + return -EIO; 2619 + 2620 + if (!(_cpu_based_2nd_exec_control & 2621 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || 2622 + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) 2623 + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2754 2624 2755 2625 min = 0; 2756 2626 opt = VM_ENTRY_LOAD_IA32_PAT; ··· 2916 2762 2917 2763 if (!cpu_has_vmx_vpid()) 2918 2764 enable_vpid = 0; 2765 + if 
(!cpu_has_vmx_shadow_vmcs()) 2766 + enable_shadow_vmcs = 0; 2919 2767 2920 2768 if (!cpu_has_vmx_ept() || 2921 2769 !cpu_has_vmx_ept_4levels()) { ··· 2944 2788 if (!cpu_has_vmx_ple()) 2945 2789 ple_gap = 0; 2946 2790 2947 - if (!cpu_has_vmx_apic_register_virt() || 2948 - !cpu_has_vmx_virtual_intr_delivery()) 2949 - enable_apicv_reg_vid = 0; 2791 + if (!cpu_has_vmx_apicv()) 2792 + enable_apicv = 0; 2950 2793 2951 - if (enable_apicv_reg_vid) 2794 + if (enable_apicv) 2952 2795 kvm_x86_ops->update_cr8_intercept = NULL; 2953 - else 2796 + else { 2954 2797 kvm_x86_ops->hwapic_irr_update = NULL; 2798 + kvm_x86_ops->deliver_posted_interrupt = NULL; 2799 + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; 2800 + } 2955 2801 2956 2802 if (nested) 2957 2803 nested_vmx_setup_ctls_msrs(); ··· 3034 2876 vmx->cpl = 0; 3035 2877 } 3036 2878 3037 - static gva_t rmode_tss_base(struct kvm *kvm) 3038 - { 3039 - if (!kvm->arch.tss_addr) { 3040 - struct kvm_memslots *slots; 3041 - struct kvm_memory_slot *slot; 3042 - gfn_t base_gfn; 3043 - 3044 - slots = kvm_memslots(kvm); 3045 - slot = id_to_memslot(slots, 0); 3046 - base_gfn = slot->base_gfn + slot->npages - 3; 3047 - 3048 - return base_gfn << PAGE_SHIFT; 3049 - } 3050 - return kvm->arch.tss_addr; 3051 - } 3052 - 3053 2879 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3054 2880 { 3055 2881 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; ··· 3084 2942 3085 2943 /* 3086 2944 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3087 - * vcpu. Call it here with phys address pointing 16M below 4G. 2945 + * vcpu. Warn the user that an update is overdue. 
3088 2946 */ 3089 - if (!vcpu->kvm->arch.tss_addr) { 2947 + if (!vcpu->kvm->arch.tss_addr) 3090 2948 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3091 2949 "called before entering vcpu\n"); 3092 - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 3093 - vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); 3094 - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 3095 - } 3096 2950 3097 2951 vmx_segment_cache_clear(vmx); 3098 2952 3099 - vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 2953 + vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); 3100 2954 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3101 2955 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3102 2956 ··· 3352 3214 */ 3353 3215 if (!nested_vmx_allowed(vcpu)) 3354 3216 return 1; 3355 - } else if (to_vmx(vcpu)->nested.vmxon) 3217 + } 3218 + if (to_vmx(vcpu)->nested.vmxon && 3219 + ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) 3356 3220 return 1; 3357 3221 3358 3222 vcpu->arch.cr4 = cr4; ··· 3690 3550 return true; 3691 3551 3692 3552 /* real mode guest state checks */ 3693 - if (!is_protmode(vcpu)) { 3553 + if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3694 3554 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3695 3555 return false; 3696 3556 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) ··· 3739 3599 int r, idx, ret = 0; 3740 3600 3741 3601 idx = srcu_read_lock(&kvm->srcu); 3742 - fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 3602 + fn = kvm->arch.tss_addr >> PAGE_SHIFT; 3743 3603 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 3744 3604 if (r < 0) 3745 3605 goto out; ··· 3832 3692 kvm_userspace_mem.flags = 0; 3833 3693 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; 3834 3694 kvm_userspace_mem.memory_size = PAGE_SIZE; 3835 - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); 3695 + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 3836 3696 if (r) 3837 3697 goto out; 3838 3698 ··· 3862 3722 kvm_userspace_mem.guest_phys_addr = 3863 3723 
kvm->arch.ept_identity_map_addr; 3864 3724 kvm_userspace_mem.memory_size = PAGE_SIZE; 3865 - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); 3725 + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 3866 3726 if (r) 3867 3727 goto out; 3868 3728 ··· 4009 3869 msr, MSR_TYPE_W); 4010 3870 } 4011 3871 3872 + static int vmx_vm_has_apicv(struct kvm *kvm) 3873 + { 3874 + return enable_apicv && irqchip_in_kernel(kvm); 3875 + } 3876 + 3877 + /* 3878 + * Send interrupt to vcpu via posted interrupt way. 3879 + * 1. If target vcpu is running(non-root mode), send posted interrupt 3880 + * notification to vcpu and hardware will sync PIR to vIRR atomically. 3881 + * 2. If target vcpu isn't running(root mode), kick it to pick up the 3882 + * interrupt from PIR in next vmentry. 3883 + */ 3884 + static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 3885 + { 3886 + struct vcpu_vmx *vmx = to_vmx(vcpu); 3887 + int r; 3888 + 3889 + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 3890 + return; 3891 + 3892 + r = pi_test_and_set_on(&vmx->pi_desc); 3893 + kvm_make_request(KVM_REQ_EVENT, vcpu); 3894 + #ifdef CONFIG_SMP 3895 + if (!r && (vcpu->mode == IN_GUEST_MODE)) 3896 + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), 3897 + POSTED_INTR_VECTOR); 3898 + else 3899 + #endif 3900 + kvm_vcpu_kick(vcpu); 3901 + } 3902 + 3903 + static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 3904 + { 3905 + struct vcpu_vmx *vmx = to_vmx(vcpu); 3906 + 3907 + if (!pi_test_and_clear_on(&vmx->pi_desc)) 3908 + return; 3909 + 3910 + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); 3911 + } 3912 + 3913 + static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) 3914 + { 3915 + return; 3916 + } 3917 + 4012 3918 /* 4013 3919 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4014 3920 * will not change in the lifetime of the guest. 4015 3921 * Note that host-state that does change is set elsewhere. 
E.g., host-state 4016 3922 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4017 3923 */ 4018 - static void vmx_set_constant_host_state(void) 3924 + static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4019 3925 { 4020 3926 u32 low32, high32; 4021 3927 unsigned long tmpl; ··· 4089 3903 4090 3904 native_store_idt(&dt); 4091 3905 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 3906 + vmx->host_idt_base = dt.address; 4092 3907 4093 3908 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ 4094 3909 ··· 4115 3928 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 4116 3929 } 4117 3930 3931 + static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 3932 + { 3933 + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 3934 + 3935 + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 3936 + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 3937 + return pin_based_exec_ctrl; 3938 + } 3939 + 4118 3940 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4119 3941 { 4120 3942 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; ··· 4139 3943 CPU_BASED_CR3_LOAD_EXITING | 4140 3944 CPU_BASED_INVLPG_EXITING; 4141 3945 return exec_control; 4142 - } 4143 - 4144 - static int vmx_vm_has_apicv(struct kvm *kvm) 4145 - { 4146 - return enable_apicv_reg_vid && irqchip_in_kernel(kvm); 4147 3946 } 4148 3947 4149 3948 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) ··· 4162 3971 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4163 3972 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4164 3973 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 3974 + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 3975 + (handle_vmptrld). 
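The posted-interrupt delivery path added above (vmx_deliver_posted_interrupt and vmx_sync_pir_to_irr) reduces to two operations on the PI descriptor: set the vector's bit in the Posted Interrupt Request (PIR) bitmap, set the outstanding-notification (ON) flag, and later fold PIR into the virtual IRR. A minimal single-threaded user-space sketch of that flow follows; the struct layout and helper names are illustrative stand-ins for the kernel's atomic test_and_set helpers, not the real definitions:

```c
#include <assert.h>
#include <stdint.h>

/* Toy posted-interrupt descriptor: a 256-bit PIR (one bit per vector)
 * plus the outstanding-notification (ON) flag. */
struct pi_desc {
    uint64_t pir[4];
    unsigned on;
};

/* Return the previous bit value, like the kernel's test_and_set_bit(). */
static int pi_test_and_set_pir(struct pi_desc *pi, int vector)
{
    uint64_t mask = 1ull << (vector & 63);
    int old = (pi->pir[vector >> 6] & mask) != 0;
    pi->pir[vector >> 6] |= mask;
    return old;
}

static int pi_test_and_set_on(struct pi_desc *pi)
{
    int old = pi->on;
    pi->on = 1;
    return old;
}

/* Model of the sync step: if ON is set, clear it and move all pending
 * PIR bits into the virtual IRR, as the CPU does on a posted interrupt. */
static void sync_pir_to_irr(struct pi_desc *pi, uint64_t irr[4])
{
    int i;
    if (!pi->on)
        return;
    pi->on = 0;
    for (i = 0; i < 4; i++) {
        irr[i] |= pi->pir[i];
        pi->pir[i] = 0;
    }
}
```

The real descriptor is shared with the CPU and updated atomically from both sides; this sketch only shows the set-PIR, set-ON, sync-and-clear ordering that the patch relies on to avoid sending a notification IPI when one is already outstanding.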
3976 + We can NOT enable shadow_vmcs here because we don't yet have 3977 + a current VMCS12 3978 + */ 3979 + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4165 3980 return exec_control; 4166 3981 } 4167 3982 ··· 4196 3999 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 4197 4000 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); 4198 4001 4002 + if (enable_shadow_vmcs) { 4003 + vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 4004 + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 4005 + } 4199 4006 if (cpu_has_vmx_msr_bitmap()) 4200 4007 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); 4201 4008 4202 4009 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 4203 4010 4204 4011 /* Control */ 4205 - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 4206 - vmcs_config.pin_based_exec_ctrl); 4012 + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); 4207 4013 4208 4014 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4209 4015 ··· 4215 4015 vmx_secondary_exec_control(vmx)); 4216 4016 } 4217 4017 4218 - if (enable_apicv_reg_vid) { 4018 + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { 4219 4019 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4220 4020 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4221 4021 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4222 4022 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4223 4023 4224 4024 vmcs_write16(GUEST_INTR_STATUS, 0); 4025 + 4026 + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4027 + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4225 4028 } 4226 4029 4227 4030 if (ple_gap) { ··· 4238 4035 4239 4036 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4240 4037 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4241 - vmx_set_constant_host_state(); 4038 + vmx_set_constant_host_state(vmx); 4242 4039 #ifdef CONFIG_X86_64 4243 4040 rdmsrl(MSR_FS_BASE, a); 4244 4041 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ ··· 4292 4089 return 0; 4293 4090 } 4294 4091 4295 - static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4092 + static void
vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4296 4093 { 4297 4094 struct vcpu_vmx *vmx = to_vmx(vcpu); 4298 4095 u64 msr; 4299 - int ret; 4300 4096 4301 4097 vmx->rmode.vm86_active = 0; 4302 4098 ··· 4311 4109 vmx_segment_cache_clear(vmx); 4312 4110 4313 4111 seg_setup(VCPU_SREG_CS); 4314 - if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4315 - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4316 - else { 4317 - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 4318 - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 4319 - } 4112 + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4113 + vmcs_write32(GUEST_CS_BASE, 0xffff0000); 4320 4114 4321 4115 seg_setup(VCPU_SREG_DS); 4322 4116 seg_setup(VCPU_SREG_ES); ··· 4335 4137 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4336 4138 4337 4139 vmcs_writel(GUEST_RFLAGS, 0x02); 4338 - if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4339 - kvm_rip_write(vcpu, 0xfff0); 4340 - else 4341 - kvm_rip_write(vcpu, 0); 4140 + kvm_rip_write(vcpu, 0xfff0); 4342 4141 4343 4142 vmcs_writel(GUEST_GDTR_BASE, 0); 4344 4143 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); ··· 4366 4171 vmcs_write64(APIC_ACCESS_ADDR, 4367 4172 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); 4368 4173 4174 + if (vmx_vm_has_apicv(vcpu->kvm)) 4175 + memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); 4176 + 4369 4177 if (vmx->vpid != 0) 4370 4178 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4371 4179 4372 4180 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 4373 - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4374 4181 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ 4375 - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4376 4182 vmx_set_cr4(&vmx->vcpu, 0); 4377 4183 vmx_set_efer(&vmx->vcpu, 0); 4378 4184 vmx_fpu_activate(&vmx->vcpu); 4379 4185 update_exception_bitmap(&vmx->vcpu); 4380 4186 4381 4187 vpid_sync_context(vmx); 4382 - 4383 - ret = 0; 4384 - 4385 - return ret; 4386 4188 } 4387 4189 4388 4190 /* ··· 4392 4200 PIN_BASED_EXT_INTR_MASK; 4393 4201 } 4394 4202 
4395 - static void enable_irq_window(struct kvm_vcpu *vcpu) 4203 + static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 4204 + { 4205 + return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4206 + PIN_BASED_NMI_EXITING; 4207 + } 4208 + 4209 + static int enable_irq_window(struct kvm_vcpu *vcpu) 4396 4210 { 4397 4211 u32 cpu_based_vm_exec_control; 4398 - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { 4212 + 4213 + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 4399 4214 /* 4400 4215 * We get here if vmx_interrupt_allowed() said we can't 4401 - * inject to L1 now because L2 must run. Ask L2 to exit 4402 - * right after entry, so we can inject to L1 more promptly. 4216 + * inject to L1 now because L2 must run. The caller will have 4217 + * to make L2 exit right after entry, so we can inject to L1 4218 + * more promptly. 4403 4219 */ 4404 - kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); 4405 - return; 4406 - } 4220 + return -EBUSY; 4407 4221 4408 4222 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4409 4223 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 4410 4224 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4225 + return 0; 4411 4226 } 4412 4227 4413 - static void enable_nmi_window(struct kvm_vcpu *vcpu) 4228 + static int enable_nmi_window(struct kvm_vcpu *vcpu) 4414 4229 { 4415 4230 u32 cpu_based_vm_exec_control; 4416 4231 4417 - if (!cpu_has_virtual_nmis()) { 4418 - enable_irq_window(vcpu); 4419 - return; 4420 - } 4232 + if (!cpu_has_virtual_nmis()) 4233 + return enable_irq_window(vcpu); 4421 4234 4422 - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4423 - enable_irq_window(vcpu); 4424 - return; 4425 - } 4235 + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) 4236 + return enable_irq_window(vcpu); 4237 + 4426 4238 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4427 4239 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 
4428 4240 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4241 + return 0; 4429 4242 } 4430 4243 4431 4244 static void vmx_inject_irq(struct kvm_vcpu *vcpu) ··· 4491 4294 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4492 4295 } 4493 4296 4494 - static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4495 - { 4496 - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 4497 - return 0; 4498 - 4499 - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4500 - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI 4501 - | GUEST_INTR_STATE_NMI)); 4502 - } 4503 - 4504 4297 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4505 4298 { 4506 4299 if (!cpu_has_virtual_nmis()) ··· 4520 4333 } 4521 4334 } 4522 4335 4336 + static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4337 + { 4338 + if (is_guest_mode(vcpu)) { 4339 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4340 + 4341 + if (to_vmx(vcpu)->nested.nested_run_pending) 4342 + return 0; 4343 + if (nested_exit_on_nmi(vcpu)) { 4344 + nested_vmx_vmexit(vcpu); 4345 + vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI; 4346 + vmcs12->vm_exit_intr_info = NMI_VECTOR | 4347 + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK; 4348 + /* 4349 + * The NMI-triggered VM exit counts as injection: 4350 + * clear this one and block further NMIs. 
4351 + */ 4352 + vcpu->arch.nmi_pending = 0; 4353 + vmx_set_nmi_mask(vcpu, true); 4354 + return 0; 4355 + } 4356 + } 4357 + 4358 + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 4359 + return 0; 4360 + 4361 + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4362 + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI 4363 + | GUEST_INTR_STATE_NMI)); 4364 + } 4365 + 4523 4366 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4524 4367 { 4525 - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { 4368 + if (is_guest_mode(vcpu)) { 4526 4369 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4527 - if (to_vmx(vcpu)->nested.nested_run_pending || 4528 - (vmcs12->idt_vectoring_info_field & 4529 - VECTORING_INFO_VALID_MASK)) 4370 + 4371 + if (to_vmx(vcpu)->nested.nested_run_pending) 4530 4372 return 0; 4531 - nested_vmx_vmexit(vcpu); 4532 - vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; 4533 - vmcs12->vm_exit_intr_info = 0; 4534 - /* fall through to normal code, but now in L1, not L2 */ 4373 + if (nested_exit_on_intr(vcpu)) { 4374 + nested_vmx_vmexit(vcpu); 4375 + vmcs12->vm_exit_reason = 4376 + EXIT_REASON_EXTERNAL_INTERRUPT; 4377 + vmcs12->vm_exit_intr_info = 0; 4378 + /* 4379 + * fall through to normal code, but now in L1, not L2 4380 + */ 4381 + } 4535 4382 } 4536 4383 4537 4384 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && ··· 4583 4362 .flags = 0, 4584 4363 }; 4585 4364 4586 - ret = kvm_set_memory_region(kvm, &tss_mem, false); 4365 + ret = kvm_set_memory_region(kvm, &tss_mem); 4587 4366 if (ret) 4588 4367 return ret; 4589 4368 kvm->arch.tss_addr = addr; ··· 4824 4603 /* called to set cr0 as appropriate for a mov-to-cr0 exit. 
*/ 4825 4604 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4826 4605 { 4827 - if (to_vmx(vcpu)->nested.vmxon && 4828 - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) 4829 - return 1; 4830 - 4831 4606 if (is_guest_mode(vcpu)) { 4607 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4608 + unsigned long orig_val = val; 4609 + 4832 4610 /* 4833 4611 * We get here when L2 changed cr0 in a way that did not change 4834 4612 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 4835 - * but did change L0 shadowed bits. This can currently happen 4836 - * with the TS bit: L0 may want to leave TS on (for lazy fpu 4837 - * loading) while pretending to allow the guest to change it. 4613 + * but did change L0 shadowed bits. So we first calculate the 4614 + * effective cr0 value that L1 would like to write into the 4615 + * hardware. It consists of the L2-owned bits from the new 4616 + * value combined with the L1-owned bits from L1's guest_cr0. 4838 4617 */ 4839 - if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | 4840 - (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) 4618 + val = (val & ~vmcs12->cr0_guest_host_mask) | 4619 + (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 4620 + 4621 + /* TODO: will have to take unrestricted guest mode into 4622 + * account */ 4623 + if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) 4841 4624 return 1; 4842 - vmcs_writel(CR0_READ_SHADOW, val); 4625 + 4626 + if (kvm_set_cr0(vcpu, val)) 4627 + return 1; 4628 + vmcs_writel(CR0_READ_SHADOW, orig_val); 4843 4629 return 0; 4844 - } else 4630 + } else { 4631 + if (to_vmx(vcpu)->nested.vmxon && 4632 + ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) 4633 + return 1; 4845 4634 return kvm_set_cr0(vcpu, val); 4635 + } 4846 4636 } 4847 4637 4848 4638 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 4849 4639 { 4850 4640 if (is_guest_mode(vcpu)) { 4851 - if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | 4852 - 
(vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) 4641 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4642 + unsigned long orig_val = val; 4643 + 4644 + /* analogously to handle_set_cr0 */ 4645 + val = (val & ~vmcs12->cr4_guest_host_mask) | 4646 + (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 4647 + if (kvm_set_cr4(vcpu, val)) 4853 4648 return 1; 4854 - vmcs_writel(CR4_READ_SHADOW, val); 4649 + vmcs_writel(CR4_READ_SHADOW, orig_val); 4855 4650 return 0; 4856 4651 } else 4857 4652 return kvm_set_cr4(vcpu, val); ··· 5420 5183 if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 5421 5184 return 1; 5422 5185 5423 - err = emulate_instruction(vcpu, 0); 5186 + err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5424 5187 5425 5188 if (err == EMULATE_DO_MMIO) { 5426 5189 ret = 0; ··· 5496 5259 } 5497 5260 5498 5261 /* Create a new VMCS */ 5499 - item = (struct vmcs02_list *) 5500 - kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); 5262 + item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); 5501 5263 if (!item) 5502 5264 return NULL; 5503 5265 item->vmcs02.vmcs = alloc_vmcs(); ··· 5545 5309 free_loaded_vmcs(&vmx->vmcs01); 5546 5310 } 5547 5311 5312 + static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 5313 + u32 vm_instruction_error); 5314 + 5548 5315 /* 5549 5316 * Emulate the VMXON instruction. 
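The reworked handle_set_cr0()/handle_set_cr4() above first compute the effective control-register value L1 would want in hardware: bits set in the vmcs12 guest/host mask are owned by L1 and taken from L1's guest_cr0/guest_cr4, while the remaining bits come from the value L2 just tried to write. The bit arithmetic can be sketched in isolation (the helper name is hypothetical, not a kernel function):

```c
#include <assert.h>

/* Merge the value L2 wrote (l2_val) with L1's shadowed bits: bits in
 * guest_host_mask are L1-owned and come from guest_crX, the rest are
 * L2-owned and pass through unchanged. */
static unsigned long effective_cr(unsigned long l2_val,
                                  unsigned long guest_crx,
                                  unsigned long guest_host_mask)
{
    return (l2_val & ~guest_host_mask) | (guest_crx & guest_host_mask);
}
```

For example, if L1 owns CR0.TS (bit 3 here for illustration) and keeps it clear, an attempt by L2 to set it leaves the effective value unchanged, which is exactly why the exit did not go to L1 in the first place.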
5550 5317 * Currently, we just remember that VMX is active, and do not save or even ··· 5560 5321 { 5561 5322 struct kvm_segment cs; 5562 5323 struct vcpu_vmx *vmx = to_vmx(vcpu); 5324 + struct vmcs *shadow_vmcs; 5563 5325 5564 5326 /* The Intel VMX Instruction Reference lists a bunch of bits that 5565 5327 * are prerequisite to running VMXON, most notably cr4.VMXE must be ··· 5583 5343 if (vmx_get_cpl(vcpu)) { 5584 5344 kvm_inject_gp(vcpu, 0); 5585 5345 return 1; 5346 + } 5347 + if (vmx->nested.vmxon) { 5348 + nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5349 + skip_emulated_instruction(vcpu); 5350 + return 1; 5351 + } 5352 + if (enable_shadow_vmcs) { 5353 + shadow_vmcs = alloc_vmcs(); 5354 + if (!shadow_vmcs) 5355 + return -ENOMEM; 5356 + /* mark vmcs as shadow */ 5357 + shadow_vmcs->revision_id |= (1u << 31); 5358 + /* init shadow vmcs */ 5359 + vmcs_clear(shadow_vmcs); 5360 + vmx->nested.current_shadow_vmcs = shadow_vmcs; 5586 5361 } 5587 5362 5588 5363 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); ··· 5639 5384 return 1; 5640 5385 } 5641 5386 5387 + static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) 5388 + { 5389 + u32 exec_control; 5390 + if (enable_shadow_vmcs) { 5391 + if (vmx->nested.current_vmcs12 != NULL) { 5392 + /* copy to memory all shadowed fields in case 5393 + they were modified */ 5394 + copy_shadow_to_vmcs12(vmx); 5395 + vmx->nested.sync_shadow_vmcs = false; 5396 + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 5397 + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 5398 + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 5399 + vmcs_write64(VMCS_LINK_POINTER, -1ull); 5400 + } 5401 + } 5402 + kunmap(vmx->nested.current_vmcs12_page); 5403 + nested_release_page(vmx->nested.current_vmcs12_page); 5404 + } 5405 + 5642 5406 /* 5643 5407 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 5644 5408 * just stops using VMX. 
··· 5668 5394 return; 5669 5395 vmx->nested.vmxon = false; 5670 5396 if (vmx->nested.current_vmptr != -1ull) { 5671 - kunmap(vmx->nested.current_vmcs12_page); 5672 - nested_release_page(vmx->nested.current_vmcs12_page); 5397 + nested_release_vmcs12(vmx); 5673 5398 vmx->nested.current_vmptr = -1ull; 5674 5399 vmx->nested.current_vmcs12 = NULL; 5675 5400 } 5401 + if (enable_shadow_vmcs) 5402 + free_vmcs(vmx->nested.current_shadow_vmcs); 5676 5403 /* Unpin physical memory we referred to in current vmcs02 */ 5677 5404 if (vmx->nested.apic_access_page) { 5678 5405 nested_release_page(vmx->nested.apic_access_page); ··· 5782 5507 X86_EFLAGS_SF | X86_EFLAGS_OF)) 5783 5508 | X86_EFLAGS_ZF); 5784 5509 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 5510 + /* 5511 + * We don't need to force a shadow sync because 5512 + * VM_INSTRUCTION_ERROR is not shadowed 5513 + */ 5785 5514 } 5786 5515 5787 5516 /* Emulate the VMCLEAR instruction */ ··· 5818 5539 } 5819 5540 5820 5541 if (vmptr == vmx->nested.current_vmptr) { 5821 - kunmap(vmx->nested.current_vmcs12_page); 5822 - nested_release_page(vmx->nested.current_vmcs12_page); 5542 + nested_release_vmcs12(vmx); 5823 5543 vmx->nested.current_vmptr = -1ull; 5824 5544 vmx->nested.current_vmcs12 = NULL; 5825 5545 } ··· 5917 5639 } 5918 5640 } 5919 5641 5642 + 5643 + static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, 5644 + unsigned long field, u64 field_value){ 5645 + short offset = vmcs_field_to_offset(field); 5646 + char *p = ((char *) get_vmcs12(vcpu)) + offset; 5647 + if (offset < 0) 5648 + return false; 5649 + 5650 + switch (vmcs_field_type(field)) { 5651 + case VMCS_FIELD_TYPE_U16: 5652 + *(u16 *)p = field_value; 5653 + return true; 5654 + case VMCS_FIELD_TYPE_U32: 5655 + *(u32 *)p = field_value; 5656 + return true; 5657 + case VMCS_FIELD_TYPE_U64: 5658 + *(u64 *)p = field_value; 5659 + return true; 5660 + case VMCS_FIELD_TYPE_NATURAL_WIDTH: 5661 + *(natural_width *)p = field_value; 5662 + return true; 5663 + 
default: 5664 + return false; /* can never happen. */ 5665 + } 5666 + 5667 + } 5668 + 5669 + static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 5670 + { 5671 + int i; 5672 + unsigned long field; 5673 + u64 field_value; 5674 + struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 5675 + unsigned long *fields = (unsigned long *)shadow_read_write_fields; 5676 + int num_fields = max_shadow_read_write_fields; 5677 + 5678 + vmcs_load(shadow_vmcs); 5679 + 5680 + for (i = 0; i < num_fields; i++) { 5681 + field = fields[i]; 5682 + switch (vmcs_field_type(field)) { 5683 + case VMCS_FIELD_TYPE_U16: 5684 + field_value = vmcs_read16(field); 5685 + break; 5686 + case VMCS_FIELD_TYPE_U32: 5687 + field_value = vmcs_read32(field); 5688 + break; 5689 + case VMCS_FIELD_TYPE_U64: 5690 + field_value = vmcs_read64(field); 5691 + break; 5692 + case VMCS_FIELD_TYPE_NATURAL_WIDTH: 5693 + field_value = vmcs_readl(field); 5694 + break; 5695 + } 5696 + vmcs12_write_any(&vmx->vcpu, field, field_value); 5697 + } 5698 + 5699 + vmcs_clear(shadow_vmcs); 5700 + vmcs_load(vmx->loaded_vmcs->vmcs); 5701 + } 5702 + 5703 + static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 5704 + { 5705 + unsigned long *fields[] = { 5706 + (unsigned long *)shadow_read_write_fields, 5707 + (unsigned long *)shadow_read_only_fields 5708 + }; 5709 + int num_lists = ARRAY_SIZE(fields); 5710 + int max_fields[] = { 5711 + max_shadow_read_write_fields, 5712 + max_shadow_read_only_fields 5713 + }; 5714 + int i, q; 5715 + unsigned long field; 5716 + u64 field_value = 0; 5717 + struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 5718 + 5719 + vmcs_load(shadow_vmcs); 5720 + 5721 + for (q = 0; q < num_lists; q++) { 5722 + for (i = 0; i < max_fields[q]; i++) { 5723 + field = fields[q][i]; 5724 + vmcs12_read_any(&vmx->vcpu, field, &field_value); 5725 + 5726 + switch (vmcs_field_type(field)) { 5727 + case VMCS_FIELD_TYPE_U16: 5728 + vmcs_write16(field, (u16)field_value); 5729 + break; 5730 + case 
VMCS_FIELD_TYPE_U32: 5731 + vmcs_write32(field, (u32)field_value); 5732 + break; 5733 + case VMCS_FIELD_TYPE_U64: 5734 + vmcs_write64(field, (u64)field_value); 5735 + break; 5736 + case VMCS_FIELD_TYPE_NATURAL_WIDTH: 5737 + vmcs_writel(field, (long)field_value); 5738 + break; 5739 + } 5740 + } 5741 + } 5742 + 5743 + vmcs_clear(shadow_vmcs); 5744 + vmcs_load(vmx->loaded_vmcs->vmcs); 5745 + } 5746 + 5920 5747 /* 5921 5748 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was 5922 5749 * used before) all generate the same failure when it is missing. ··· 6086 5703 gva_t gva; 6087 5704 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6088 5705 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6089 - char *p; 6090 - short offset; 6091 5706 /* The value to write might be 32 or 64 bits, depending on L1's long 6092 5707 * mode, and eventually we need to write that into a field of several 6093 5708 * possible lengths. The code below first zero-extends the value to 64 ··· 6122 5741 return 1; 6123 5742 } 6124 5743 6125 - offset = vmcs_field_to_offset(field); 6126 - if (offset < 0) { 6127 - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 6128 - skip_emulated_instruction(vcpu); 6129 - return 1; 6130 - } 6131 - p = ((char *) get_vmcs12(vcpu)) + offset; 6132 - 6133 - switch (vmcs_field_type(field)) { 6134 - case VMCS_FIELD_TYPE_U16: 6135 - *(u16 *)p = field_value; 6136 - break; 6137 - case VMCS_FIELD_TYPE_U32: 6138 - *(u32 *)p = field_value; 6139 - break; 6140 - case VMCS_FIELD_TYPE_U64: 6141 - *(u64 *)p = field_value; 6142 - break; 6143 - case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6144 - *(natural_width *)p = field_value; 6145 - break; 6146 - default: 5744 + if (!vmcs12_write_any(vcpu, field, field_value)) { 6147 5745 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 6148 5746 skip_emulated_instruction(vcpu); 6149 5747 return 1; ··· 6140 5780 gva_t gva; 6141 5781 gpa_t vmptr; 6142 5782 struct x86_exception e; 
5783 + u32 exec_control; 6143 5784 6144 5785 if (!nested_vmx_check_permission(vcpu)) 6145 5786 return 1; ··· 6179 5818 skip_emulated_instruction(vcpu); 6180 5819 return 1; 6181 5820 } 6182 - if (vmx->nested.current_vmptr != -1ull) { 6183 - kunmap(vmx->nested.current_vmcs12_page); 6184 - nested_release_page(vmx->nested.current_vmcs12_page); 6185 - } 5821 + if (vmx->nested.current_vmptr != -1ull) 5822 + nested_release_vmcs12(vmx); 6186 5823 6187 5824 vmx->nested.current_vmptr = vmptr; 6188 5825 vmx->nested.current_vmcs12 = new_vmcs12; 6189 5826 vmx->nested.current_vmcs12_page = page; 5827 + if (enable_shadow_vmcs) { 5828 + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 5829 + exec_control |= SECONDARY_EXEC_SHADOW_VMCS; 5830 + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 5831 + vmcs_write64(VMCS_LINK_POINTER, 5832 + __pa(vmx->nested.current_shadow_vmcs)); 5833 + vmx->nested.sync_shadow_vmcs = true; 5834 + } 6190 5835 } 6191 5836 6192 5837 nested_vmx_succeed(vcpu); ··· 6275 5908 static const int kvm_vmx_max_exit_handlers = 6276 5909 ARRAY_SIZE(kvm_vmx_exit_handlers); 6277 5910 5911 + static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5912 + struct vmcs12 *vmcs12) 5913 + { 5914 + unsigned long exit_qualification; 5915 + gpa_t bitmap, last_bitmap; 5916 + unsigned int port; 5917 + int size; 5918 + u8 b; 5919 + 5920 + if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING)) 5921 + return 1; 5922 + 5923 + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5924 + return 0; 5925 + 5926 + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5927 + 5928 + port = exit_qualification >> 16; 5929 + size = (exit_qualification & 7) + 1; 5930 + 5931 + last_bitmap = (gpa_t)-1; 5932 + b = -1; 5933 + 5934 + while (size > 0) { 5935 + if (port < 0x8000) 5936 + bitmap = vmcs12->io_bitmap_a; 5937 + else if (port < 0x10000) 5938 + bitmap = vmcs12->io_bitmap_b; 5939 + else 5940 + return 1; 5941 + bitmap += (port & 0x7fff) / 8; 5942 + 5943 + if (last_bitmap != 
bitmap) 5944 + if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) 5945 + return 1; 5946 + if (b & (1 << (port & 7))) 5947 + return 1; 5948 + 5949 + port++; 5950 + size--; 5951 + last_bitmap = bitmap; 5952 + } 5953 + 5954 + return 0; 5955 + } 5956 + 6278 5957 /* 6279 5958 * Return 1 if we should exit from L2 to L1 to handle an MSR access access, 6280 5959 * rather than handle it ourselves in L0. I.e., check whether L1 expressed ··· 6352 5939 /* Then read the msr_index'th bit from this bitmap: */ 6353 5940 if (msr_index < 1024*8) { 6354 5941 unsigned char b; 6355 - kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); 5942 + if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) 5943 + return 1; 6356 5944 return 1 & (b >> (msr_index & 7)); 6357 5945 } else 6358 5946 return 1; /* let L1 handle the wrong parameter */ ··· 6447 6033 */ 6448 6034 static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) 6449 6035 { 6450 - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); 6451 6036 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6452 6037 struct vcpu_vmx *vmx = to_vmx(vcpu); 6453 6038 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6039 + u32 exit_reason = vmx->exit_reason; 6454 6040 6455 6041 if (vmx->nested.nested_run_pending) 6456 6042 return 0; ··· 6474 6060 case EXIT_REASON_TRIPLE_FAULT: 6475 6061 return 1; 6476 6062 case EXIT_REASON_PENDING_INTERRUPT: 6063 + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 6477 6064 case EXIT_REASON_NMI_WINDOW: 6478 - /* 6479 - * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit 6480 - * (aka Interrupt Window Exiting) only when L1 turned it on, 6481 - * so if we got a PENDING_INTERRUPT exit, this must be for L1. 6482 - * Same for NMI Window Exiting. 
6483 - */ 6484 - return 1; 6065 + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 6485 6066 case EXIT_REASON_TASK_SWITCH: 6486 6067 return 1; 6487 6068 case EXIT_REASON_CPUID: ··· 6506 6097 case EXIT_REASON_DR_ACCESS: 6507 6098 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6508 6099 case EXIT_REASON_IO_INSTRUCTION: 6509 - /* TODO: support IO bitmaps */ 6510 - return 1; 6100 + return nested_vmx_exit_handled_io(vcpu, vmcs12); 6511 6101 case EXIT_REASON_MSR_READ: 6512 6102 case EXIT_REASON_MSR_WRITE: 6513 6103 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); ··· 6530 6122 case EXIT_REASON_EPT_VIOLATION: 6531 6123 case EXIT_REASON_EPT_MISCONFIG: 6532 6124 return 0; 6125 + case EXIT_REASON_PREEMPTION_TIMER: 6126 + return vmcs12->pin_based_vm_exec_control & 6127 + PIN_BASED_VMX_PREEMPTION_TIMER; 6533 6128 case EXIT_REASON_WBINVD: 6534 6129 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6535 6130 case EXIT_REASON_XSETBV: ··· 6727 6316 6728 6317 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6729 6318 { 6319 + if (!vmx_vm_has_apicv(vcpu->kvm)) 6320 + return; 6321 + 6730 6322 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6731 6323 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6732 6324 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); ··· 6758 6344 asm("int $2"); 6759 6345 kvm_after_handle_nmi(&vmx->vcpu); 6760 6346 } 6347 + } 6348 + 6349 + static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) 6350 + { 6351 + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6352 + 6353 + /* 6354 + * If external interrupt exists, IF bit is set in rflags/eflags on the 6355 + * interrupt stack frame, and interrupt will be enabled on a return 6356 + * from interrupt handler. 
6357 + */ 6358 + if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) 6359 + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { 6360 + unsigned int vector; 6361 + unsigned long entry; 6362 + gate_desc *desc; 6363 + struct vcpu_vmx *vmx = to_vmx(vcpu); 6364 + #ifdef CONFIG_X86_64 6365 + unsigned long tmp; 6366 + #endif 6367 + 6368 + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 6369 + desc = (gate_desc *)vmx->host_idt_base + vector; 6370 + entry = gate_offset(*desc); 6371 + asm volatile( 6372 + #ifdef CONFIG_X86_64 6373 + "mov %%" _ASM_SP ", %[sp]\n\t" 6374 + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" 6375 + "push $%c[ss]\n\t" 6376 + "push %[sp]\n\t" 6377 + #endif 6378 + "pushf\n\t" 6379 + "orl $0x200, (%%" _ASM_SP ")\n\t" 6380 + __ASM_SIZE(push) " $%c[cs]\n\t" 6381 + "call *%[entry]\n\t" 6382 + : 6383 + #ifdef CONFIG_X86_64 6384 + [sp]"=&r"(tmp) 6385 + #endif 6386 + : 6387 + [entry]"r"(entry), 6388 + [ss]"i"(__KERNEL_DS), 6389 + [cs]"i"(__KERNEL_CS) 6390 + ); 6391 + } else 6392 + local_irq_enable(); 6761 6393 } 6762 6394 6763 6395 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ··· 6848 6388 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 6849 6389 } 6850 6390 6851 - static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, 6391 + static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 6852 6392 u32 idt_vectoring_info, 6853 6393 int instr_len_field, 6854 6394 int error_code_field) ··· 6859 6399 6860 6400 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6861 6401 6862 - vmx->vcpu.arch.nmi_injected = false; 6863 - kvm_clear_exception_queue(&vmx->vcpu); 6864 - kvm_clear_interrupt_queue(&vmx->vcpu); 6402 + vcpu->arch.nmi_injected = false; 6403 + kvm_clear_exception_queue(vcpu); 6404 + kvm_clear_interrupt_queue(vcpu); 6865 6405 6866 6406 if (!idtv_info_valid) 6867 6407 return; 6868 6408 6869 - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 6409 + kvm_make_request(KVM_REQ_EVENT, vcpu); 6870 6410 6871 6411 
vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 6872 6412 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 6873 6413 6874 6414 switch (type) { 6875 6415 case INTR_TYPE_NMI_INTR: 6876 - vmx->vcpu.arch.nmi_injected = true; 6416 + vcpu->arch.nmi_injected = true; 6877 6417 /* 6878 6418 * SDM 3: 27.7.1.2 (September 2008) 6879 6419 * Clear bit "block by NMI" before VM entry if a NMI 6880 6420 * delivery faulted. 6881 6421 */ 6882 - vmx_set_nmi_mask(&vmx->vcpu, false); 6422 + vmx_set_nmi_mask(vcpu, false); 6883 6423 break; 6884 6424 case INTR_TYPE_SOFT_EXCEPTION: 6885 - vmx->vcpu.arch.event_exit_inst_len = 6886 - vmcs_read32(instr_len_field); 6425 + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6887 6426 /* fall through */ 6888 6427 case INTR_TYPE_HARD_EXCEPTION: 6889 6428 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 6890 6429 u32 err = vmcs_read32(error_code_field); 6891 - kvm_queue_exception_e(&vmx->vcpu, vector, err); 6430 + kvm_queue_exception_e(vcpu, vector, err); 6892 6431 } else 6893 - kvm_queue_exception(&vmx->vcpu, vector); 6432 + kvm_queue_exception(vcpu, vector); 6894 6433 break; 6895 6434 case INTR_TYPE_SOFT_INTR: 6896 - vmx->vcpu.arch.event_exit_inst_len = 6897 - vmcs_read32(instr_len_field); 6435 + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6898 6436 /* fall through */ 6899 6437 case INTR_TYPE_EXT_INTR: 6900 - kvm_queue_interrupt(&vmx->vcpu, vector, 6901 - type == INTR_TYPE_SOFT_INTR); 6438 + kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 6902 6439 break; 6903 6440 default: 6904 6441 break; ··· 6904 6447 6905 6448 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 6906 6449 { 6907 - if (is_guest_mode(&vmx->vcpu)) 6908 - return; 6909 - __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, 6450 + __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 6910 6451 VM_EXIT_INSTRUCTION_LEN, 6911 6452 IDT_VECTORING_ERROR_CODE); 6912 6453 } 6913 6454 6914 6455 static 
void vmx_cancel_injection(struct kvm_vcpu *vcpu) 6915 6456 { 6916 - if (is_guest_mode(vcpu)) 6917 - return; 6918 - __vmx_complete_interrupts(to_vmx(vcpu), 6457 + __vmx_complete_interrupts(vcpu, 6919 6458 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6920 6459 VM_ENTRY_INSTRUCTION_LEN, 6921 6460 VM_ENTRY_EXCEPTION_ERROR_CODE); ··· 6942 6489 struct vcpu_vmx *vmx = to_vmx(vcpu); 6943 6490 unsigned long debugctlmsr; 6944 6491 6945 - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { 6946 - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6947 - if (vmcs12->idt_vectoring_info_field & 6948 - VECTORING_INFO_VALID_MASK) { 6949 - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 6950 - vmcs12->idt_vectoring_info_field); 6951 - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 6952 - vmcs12->vm_exit_instruction_len); 6953 - if (vmcs12->idt_vectoring_info_field & 6954 - VECTORING_INFO_DELIVER_CODE_MASK) 6955 - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 6956 - vmcs12->idt_vectoring_error_code); 6957 - } 6958 - } 6959 - 6960 6492 /* Record the guest's net vcpu time for enforced NMI injections. 
*/ 6961 6493 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 6962 6494 vmx->entry_time = ktime_get(); ··· 6950 6512 start emulation until we arrive back to a valid state */ 6951 6513 if (vmx->emulation_required) 6952 6514 return; 6515 + 6516 + if (vmx->nested.sync_shadow_vmcs) { 6517 + copy_vmcs12_to_shadow(vmx); 6518 + vmx->nested.sync_shadow_vmcs = false; 6519 + } 6953 6520 6954 6521 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6955 6522 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); ··· 7105 6662 7106 6663 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7107 6664 7108 - if (is_guest_mode(vcpu)) { 7109 - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7110 - vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; 7111 - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 7112 - vmcs12->idt_vectoring_error_code = 7113 - vmcs_read32(IDT_VECTORING_ERROR_CODE); 7114 - vmcs12->vm_exit_instruction_len = 7115 - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 7116 - } 7117 - } 7118 - 7119 6665 vmx->loaded_vmcs->launched = 1; 7120 6666 7121 6667 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); ··· 7166 6734 put_cpu(); 7167 6735 if (err) 7168 6736 goto free_vmcs; 7169 - if (vm_need_virtualize_apic_accesses(kvm)) 6737 + if (vm_need_virtualize_apic_accesses(kvm)) { 7170 6738 err = alloc_apic_access_page(kvm); 7171 6739 if (err) 7172 6740 goto free_vmcs; 6741 + } 7173 6742 7174 6743 if (enable_ept) { 7175 6744 if (!kvm->arch.ept_identity_map_addr) ··· 7364 6931 vmcs12->vm_entry_instruction_len); 7365 6932 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 7366 6933 vmcs12->guest_interruptibility_info); 7367 - vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); 7368 6934 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7369 - vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); 6935 + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 7370 6936 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7371 6937 
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7372 6938 vmcs12->guest_pending_dbg_exceptions); ··· 7377 6945 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 7378 6946 (vmcs_config.pin_based_exec_ctrl | 7379 6947 vmcs12->pin_based_vm_exec_control)); 6948 + 6949 + if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) 6950 + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 6951 + vmcs12->vmx_preemption_timer_value); 7380 6952 7381 6953 /* 7382 6954 * Whether page-faults are trapped is determined by a combination of ··· 7452 7016 * Other fields are different per CPU, and will be set later when 7453 7017 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 7454 7018 */ 7455 - vmx_set_constant_host_state(); 7019 + vmx_set_constant_host_state(vmx); 7456 7020 7457 7021 /* 7458 7022 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before ··· 7518 7082 7519 7083 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7520 7084 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7521 - if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7085 + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7522 7086 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 7523 7087 else 7524 7088 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); ··· 7557 7121 struct vcpu_vmx *vmx = to_vmx(vcpu); 7558 7122 int cpu; 7559 7123 struct loaded_vmcs *vmcs02; 7124 + bool ia32e; 7560 7125 7561 7126 if (!nested_vmx_check_permission(vcpu) || 7562 7127 !nested_vmx_check_vmcs12(vcpu)) ··· 7565 7128 7566 7129 skip_emulated_instruction(vcpu); 7567 7130 vmcs12 = get_vmcs12(vcpu); 7131 + 7132 + if (enable_shadow_vmcs) 7133 + copy_shadow_to_vmcs12(vmx); 7568 7134 7569 7135 /* 7570 7136 * The nested entry process starts with enforcing various prerequisites ··· 7583 7143 nested_vmx_failValid(vcpu, 7584 7144 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 7585 7145 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 7146 + return 1; 7147 + } 7148 + 7149 + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) { 7150 + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 7586 7151 return 1; 7587 7152 } 7588 7153 ··· 7649 7204 } 7650 7205 7651 7206 /* 7207 + * If the load IA32_EFER VM-entry control is 1, the following checks 7208 + * are performed on the field for the IA32_EFER MSR: 7209 + * - Bits reserved in the IA32_EFER MSR must be 0. 7210 + * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 7211 + * the IA-32e mode guest VM-exit control. It must also be identical 7212 + * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 7213 + * CR0.PG) is 1. 7214 + */ 7215 + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { 7216 + ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 7217 + if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 7218 + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 7219 + ((vmcs12->guest_cr0 & X86_CR0_PG) && 7220 + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { 7221 + nested_vmx_entry_failure(vcpu, vmcs12, 7222 + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 7223 + return 1; 7224 + } 7225 + } 7226 + 7227 + /* 7228 + * If the load IA32_EFER VM-exit control is 1, bits reserved in the 7229 + * IA32_EFER MSR must be 0 in the field for that register. In addition, 7230 + * the values of the LMA and LME bits in the field must each be that of 7231 + * the host address-space size VM-exit control. 
7232 + */ 7233 + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 7234 + ia32e = (vmcs12->vm_exit_controls & 7235 + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 7236 + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 7237 + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 7238 + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { 7239 + nested_vmx_entry_failure(vcpu, vmcs12, 7240 + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 7241 + return 1; 7242 + } 7243 + } 7244 + 7245 + /* 7652 7246 * We're finally done with prerequisite checking, and can start with 7653 7247 * the nested entry. 7654 7248 */ ··· 7706 7222 vmx_vcpu_load(vcpu, cpu); 7707 7223 vcpu->cpu = cpu; 7708 7224 put_cpu(); 7225 + 7226 + vmx_segment_cache_clear(vmx); 7709 7227 7710 7228 vmcs12->launch_state = 1; 7711 7229 ··· 7759 7273 vcpu->arch.cr4_guest_owned_bits)); 7760 7274 } 7761 7275 7276 + static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 7277 + struct vmcs12 *vmcs12) 7278 + { 7279 + u32 idt_vectoring; 7280 + unsigned int nr; 7281 + 7282 + if (vcpu->arch.exception.pending) { 7283 + nr = vcpu->arch.exception.nr; 7284 + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 7285 + 7286 + if (kvm_exception_is_soft(nr)) { 7287 + vmcs12->vm_exit_instruction_len = 7288 + vcpu->arch.event_exit_inst_len; 7289 + idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 7290 + } else 7291 + idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 7292 + 7293 + if (vcpu->arch.exception.has_error_code) { 7294 + idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 7295 + vmcs12->idt_vectoring_error_code = 7296 + vcpu->arch.exception.error_code; 7297 + } 7298 + 7299 + vmcs12->idt_vectoring_info_field = idt_vectoring; 7300 + } else if (vcpu->arch.nmi_pending) { 7301 + vmcs12->idt_vectoring_info_field = 7302 + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 7303 + } else if (vcpu->arch.interrupt.pending) { 7304 + nr = vcpu->arch.interrupt.nr; 7305 + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 7306 + 7307 + if 
(vcpu->arch.interrupt.soft) { 7308 + idt_vectoring |= INTR_TYPE_SOFT_INTR; 7309 + vmcs12->vm_entry_instruction_len = 7310 + vcpu->arch.event_exit_inst_len; 7311 + } else 7312 + idt_vectoring |= INTR_TYPE_EXT_INTR; 7313 + 7314 + vmcs12->idt_vectoring_info_field = idt_vectoring; 7315 + } 7316 + } 7317 + 7762 7318 /* 7763 7319 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 7764 7320 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), ··· 7812 7284 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 7813 7285 * which already writes to vmcs12 directly. 7814 7286 */ 7815 - void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 7287 + static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 7816 7288 { 7817 7289 /* update guest state fields: */ 7818 7290 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); ··· 7860 7332 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 7861 7333 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 7862 7334 7863 - vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); 7864 7335 vmcs12->guest_interruptibility_info = 7865 7336 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 7866 7337 vmcs12->guest_pending_dbg_exceptions = 7867 7338 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 7868 7339 7340 + vmcs12->vm_entry_controls = 7341 + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 7342 + (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); 7343 + 7869 7344 /* TODO: These cannot have changed unless we have MSR bitmaps and 7870 7345 * the relevant bit asks not to trap the change */ 7871 7346 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 7872 - if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) 7347 + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 7873 7348 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); 7874 7349 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 7875 7350 
vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); ··· 7880 7349 7881 7350 /* update exit information fields: */ 7882 7351 7883 - vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); 7352 + vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason; 7884 7353 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7885 7354 7886 7355 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7887 - vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 7888 - vmcs12->idt_vectoring_info_field = 7889 - vmcs_read32(IDT_VECTORING_INFO_FIELD); 7890 - vmcs12->idt_vectoring_error_code = 7891 - vmcs_read32(IDT_VECTORING_ERROR_CODE); 7356 + if ((vmcs12->vm_exit_intr_info & 7357 + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == 7358 + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) 7359 + vmcs12->vm_exit_intr_error_code = 7360 + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 7361 + vmcs12->idt_vectoring_info_field = 0; 7892 7362 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 7893 7363 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7894 7364 7895 - /* clear vm-entry fields which are to be cleared on exit */ 7896 - if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) 7365 + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 7366 + /* vm_entry_intr_info_field is cleared on exit. Emulate this 7367 + * instead of reading the real value. */ 7897 7368 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 7369 + 7370 + /* 7371 + * Transfer the event that L0 or L1 may wanted to inject into 7372 + * L2 to IDT_VECTORING_INFO_FIELD. 7373 + */ 7374 + vmcs12_save_pending_event(vcpu, vmcs12); 7375 + } 7376 + 7377 + /* 7378 + * Drop what we picked up for L2 via vmx_complete_interrupts. It is 7379 + * preserved above and would only end up incorrectly in L1. 
7380 + */ 7381 + vcpu->arch.nmi_injected = false; 7382 + kvm_clear_exception_queue(vcpu); 7383 + kvm_clear_interrupt_queue(vcpu); 7898 7384 } 7899 7385 7900 7386 /* ··· 7923 7375 * Failures During or After Loading Guest State"). 7924 7376 * This function should be called when the active VMCS is L1's (vmcs01). 7925 7377 */ 7926 - void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 7378 + static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 7379 + struct vmcs12 *vmcs12) 7927 7380 { 7928 7381 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 7929 7382 vcpu->arch.efer = vmcs12->host_ia32_efer; 7930 - if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 7383 + else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 7931 7384 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 7932 7385 else 7933 7386 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); ··· 7936 7387 7937 7388 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 7938 7389 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 7390 + vmx_set_rflags(vcpu, X86_EFLAGS_BIT1); 7939 7391 /* 7940 7392 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 7941 7393 * actually changed, because it depends on the current state of ··· 7995 7445 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 7996 7446 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 7997 7447 vmcs12->host_ia32_perf_global_ctrl); 7448 + 7449 + kvm_set_dr(vcpu, 7, 0x400); 7450 + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 7998 7451 } 7999 7452 8000 7453 /* ··· 8011 7458 int cpu; 8012 7459 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8013 7460 7461 + /* trying to cancel vmlaunch/vmresume is a bug */ 7462 + WARN_ON_ONCE(vmx->nested.nested_run_pending); 7463 + 8014 7464 leave_guest_mode(vcpu); 8015 7465 prepare_vmcs12(vcpu, vmcs12); 8016 7466 ··· 8023 7467 vmx_vcpu_load(vcpu, cpu); 8024 7468 vcpu->cpu = cpu; 8025 7469 put_cpu(); 7470 + 7471 + vmx_segment_cache_clear(vmx); 8026 7472 8027 7473 /* if no 
vmcs02 cache requested, remove the one we used */ 8028 7474 if (VMCS02_POOL_SIZE == 0) ··· 8054 7496 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); 8055 7497 } else 8056 7498 nested_vmx_succeed(vcpu); 7499 + if (enable_shadow_vmcs) 7500 + vmx->nested.sync_shadow_vmcs = true; 8057 7501 } 8058 7502 8059 7503 /* ··· 8073 7513 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 8074 7514 vmcs12->exit_qualification = qualification; 8075 7515 nested_vmx_succeed(vcpu); 7516 + if (enable_shadow_vmcs) 7517 + to_vmx(vcpu)->nested.sync_shadow_vmcs = true; 8076 7518 } 8077 7519 8078 7520 static int vmx_check_intercept(struct kvm_vcpu *vcpu, ··· 8152 7590 .load_eoi_exitmap = vmx_load_eoi_exitmap, 8153 7591 .hwapic_irr_update = vmx_hwapic_irr_update, 8154 7592 .hwapic_isr_update = vmx_hwapic_isr_update, 7593 + .sync_pir_to_irr = vmx_sync_pir_to_irr, 7594 + .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 8155 7595 8156 7596 .set_tss_addr = vmx_set_tss_addr, 8157 7597 .get_tdp_level = get_ept_level, ··· 8182 7618 .set_tdp_cr3 = vmx_set_cr3, 8183 7619 8184 7620 .check_intercept = vmx_check_intercept, 7621 + .handle_external_intr = vmx_handle_external_intr, 8185 7622 }; 8186 7623 8187 7624 static int __init vmx_init(void) ··· 8221 7656 (unsigned long *)__get_free_page(GFP_KERNEL); 8222 7657 if (!vmx_msr_bitmap_longmode_x2apic) 8223 7658 goto out4; 7659 + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 7660 + if (!vmx_vmread_bitmap) 7661 + goto out5; 7662 + 7663 + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 7664 + if (!vmx_vmwrite_bitmap) 7665 + goto out6; 7666 + 7667 + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 7668 + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 7669 + /* shadowed read/write fields */ 7670 + for (i = 0; i < max_shadow_read_write_fields; i++) { 7671 + clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); 7672 + clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); 7673 + } 
7674 + /* shadowed read only fields */ 7675 + for (i = 0; i < max_shadow_read_only_fields; i++) 7676 + clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap); 8224 7677 8225 7678 /* 8226 7679 * Allow direct access to the PC debug port (it is often used for I/O ··· 8257 7674 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 8258 7675 __alignof__(struct vcpu_vmx), THIS_MODULE); 8259 7676 if (r) 8260 - goto out3; 7677 + goto out7; 8261 7678 8262 7679 #ifdef CONFIG_KEXEC 8263 7680 rcu_assign_pointer(crash_vmclear_loaded_vmcss, ··· 8275 7692 memcpy(vmx_msr_bitmap_longmode_x2apic, 8276 7693 vmx_msr_bitmap_longmode, PAGE_SIZE); 8277 7694 8278 - if (enable_apicv_reg_vid) { 7695 + if (enable_apicv) { 8279 7696 for (msr = 0x800; msr <= 0x8ff; msr++) 8280 7697 vmx_disable_intercept_msr_read_x2apic(msr); 8281 7698 ··· 8305 7722 8306 7723 return 0; 8307 7724 7725 + out7: 7726 + free_page((unsigned long)vmx_vmwrite_bitmap); 7727 + out6: 7728 + free_page((unsigned long)vmx_vmread_bitmap); 7729 + out5: 7730 + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 8308 7731 out4: 8309 7732 free_page((unsigned long)vmx_msr_bitmap_longmode); 8310 7733 out3: ··· 8332 7743 free_page((unsigned long)vmx_msr_bitmap_longmode); 8333 7744 free_page((unsigned long)vmx_io_bitmap_b); 8334 7745 free_page((unsigned long)vmx_io_bitmap_a); 7746 + free_page((unsigned long)vmx_vmwrite_bitmap); 7747 + free_page((unsigned long)vmx_vmread_bitmap); 8335 7748 8336 7749 #ifdef CONFIG_KEXEC 8337 7750 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
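The `nested_vmx_exit_handled_io()` hunk in the vmx.c changes above decides whether an L2 I/O instruction must exit to L1 by walking L1's VMX I/O bitmaps: unconditional I/O exiting short-circuits to "exit", otherwise bitmap A covers ports 0x0000-0x7fff, bitmap B covers 0x8000-0xffff, and a multi-byte access exits if any covered port's bit is set. The lookup can be sketched in isolation as plain user-space C (this is not the kernel code: the real function reads the bitmap bytes out of guest memory with `kvm_read_guest()` and also exits on a failed read, which is elided here):

```c
/* Stand-alone sketch of the VMX I/O-bitmap test performed by the
 * nested_vmx_exit_handled_io() hunk above. Bitmap A covers ports
 * 0x0000-0x7fff, bitmap B covers 0x8000-0xffff; an access of 'size'
 * bytes exits if *any* byte it touches has its port bit set. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define IO_BITMAP_BYTES 4096

struct io_bitmaps {
    uint8_t a[IO_BITMAP_BYTES];   /* ports 0x0000 - 0x7fff */
    uint8_t b[IO_BITMAP_BYTES];   /* ports 0x8000 - 0xffff */
};

/* Return 1 if an I/O access of 'size' bytes starting at 'port' should
 * cause a VM exit, mirroring the per-byte walk in the patch. */
int io_access_intercepted(const struct io_bitmaps *bm,
                          unsigned int port, int size)
{
    while (size > 0) {
        const uint8_t *bitmap;

        if (port < 0x8000)
            bitmap = bm->a;
        else if (port < 0x10000)
            bitmap = bm->b;
        else
            return 1;             /* access ran past 0xffff: always exit */

        if (bitmap[(port & 0x7fff) / 8] & (1 << (port & 7)))
            return 1;

        port++;
        size--;
    }
    return 0;
}
```

The kernel version additionally caches the last guest-physical bitmap byte it fetched (`last_bitmap`/`b`) so that a multi-byte access touching the same byte does not reread guest memory.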
+158 -85
arch/x86/kvm/x86.c
··· 162 162 163 163 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 164 164 165 - static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); 166 - 167 165 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 168 166 { 169 167 int i; ··· 260 262 kvm_lapic_set_base(vcpu, data); 261 263 } 262 264 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 265 + 266 + asmlinkage void kvm_spurious_fault(void) 267 + { 268 + /* Fault while not rebooting. We want the trace. */ 269 + BUG(); 270 + } 271 + EXPORT_SYMBOL_GPL(kvm_spurious_fault); 263 272 264 273 #define EXCPT_BENIGN 0 265 274 #define EXCPT_CONTRIBUTORY 1 ··· 845 840 MSR_IA32_MCG_CTL, 846 841 }; 847 842 848 - static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 843 + bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 849 844 { 850 - u64 old_efer = vcpu->arch.efer; 851 - 852 845 if (efer & efer_reserved_bits) 853 - return 1; 854 - 855 - if (is_paging(vcpu) 856 - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 857 - return 1; 846 + return false; 858 847 859 848 if (efer & EFER_FFXSR) { 860 849 struct kvm_cpuid_entry2 *feat; 861 850 862 851 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 863 852 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) 864 - return 1; 853 + return false; 865 854 } 866 855 867 856 if (efer & EFER_SVME) { ··· 863 864 864 865 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 865 866 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) 866 - return 1; 867 + return false; 867 868 } 869 + 870 + return true; 871 + } 872 + EXPORT_SYMBOL_GPL(kvm_valid_efer); 873 + 874 + static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 875 + { 876 + u64 old_efer = vcpu->arch.efer; 877 + 878 + if (!kvm_valid_efer(vcpu, efer)) 879 + return 1; 880 + 881 + if (is_paging(vcpu) 882 + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 883 + return 1; 868 884 869 885 efer &= ~EFER_LMA; 870 886 efer |= vcpu->arch.efer & EFER_LMA; ··· 1093 1079 u32 thresh_lo, thresh_hi; 1094 1080 int use_scaling = 0; 1095 
1081 1082 + /* tsc_khz can be zero if TSC calibration fails */ 1083 + if (this_tsc_khz == 0) 1084 + return; 1085 + 1096 1086 /* Compute a scale to convert nanoseconds in TSC cycles */ 1097 1087 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 1098 1088 &vcpu->arch.virtual_tsc_shift, ··· 1174 1156 ns = get_kernel_ns(); 1175 1157 elapsed = ns - kvm->arch.last_tsc_nsec; 1176 1158 1177 - /* n.b - signed multiplication and division required */ 1178 - usdiff = data - kvm->arch.last_tsc_write; 1159 + if (vcpu->arch.virtual_tsc_khz) { 1160 + /* n.b - signed multiplication and division required */ 1161 + usdiff = data - kvm->arch.last_tsc_write; 1179 1162 #ifdef CONFIG_X86_64 1180 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; 1163 + usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; 1181 1164 #else 1182 - /* do_div() only does unsigned */ 1183 - asm("idivl %2; xor %%edx, %%edx" 1184 - : "=A"(usdiff) 1185 - : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); 1165 + /* do_div() only does unsigned */ 1166 + asm("idivl %2; xor %%edx, %%edx" 1167 + : "=A"(usdiff) 1168 + : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); 1186 1169 #endif 1187 - do_div(elapsed, 1000); 1188 - usdiff -= elapsed; 1189 - if (usdiff < 0) 1190 - usdiff = -usdiff; 1170 + do_div(elapsed, 1000); 1171 + usdiff -= elapsed; 1172 + if (usdiff < 0) 1173 + usdiff = -usdiff; 1174 + } else 1175 + usdiff = USEC_PER_SEC; /* disable TSC match window below */ 1191 1176 1192 1177 /* 1193 1178 * Special case: TSC write with a small delta (1 second) of virtual ··· 2055 2034 case MSR_P6_EVNTSEL0: 2056 2035 case MSR_P6_EVNTSEL1: 2057 2036 if (kvm_pmu_msr(vcpu, msr)) 2058 - return kvm_pmu_set_msr(vcpu, msr, data); 2037 + return kvm_pmu_set_msr(vcpu, msr_info); 2059 2038 2060 2039 if (pr || data != 0) 2061 2040 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " ··· 2101 2080 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 2102 2081 return xen_hvm_config(vcpu, data); 2103 2082 if 
(kvm_pmu_msr(vcpu, msr)) 2104 - return kvm_pmu_set_msr(vcpu, msr, data); 2083 + return kvm_pmu_set_msr(vcpu, msr_info); 2105 2084 if (!ignore_msrs) { 2106 2085 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 2107 2086 msr, data); ··· 2500 2479 case KVM_CAP_USER_NMI: 2501 2480 case KVM_CAP_REINJECT_CONTROL: 2502 2481 case KVM_CAP_IRQ_INJECT_STATUS: 2503 - case KVM_CAP_ASSIGN_DEV_IRQ: 2504 2482 case KVM_CAP_IRQFD: 2505 2483 case KVM_CAP_IOEVENTFD: 2506 2484 case KVM_CAP_PIT2: ··· 2517 2497 case KVM_CAP_XSAVE: 2518 2498 case KVM_CAP_ASYNC_PF: 2519 2499 case KVM_CAP_GET_TSC_KHZ: 2520 - case KVM_CAP_PCI_2_3: 2521 2500 case KVM_CAP_KVMCLOCK_CTRL: 2522 2501 case KVM_CAP_READONLY_MEM: 2523 - case KVM_CAP_IRQFD_RESAMPLE: 2502 + #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2503 + case KVM_CAP_ASSIGN_DEV_IRQ: 2504 + case KVM_CAP_PCI_2_3: 2505 + #endif 2524 2506 r = 1; 2525 2507 break; 2526 2508 case KVM_CAP_COALESCED_MMIO: ··· 2543 2521 case KVM_CAP_PV_MMU: /* obsolete */ 2544 2522 r = 0; 2545 2523 break; 2524 + #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2546 2525 case KVM_CAP_IOMMU: 2547 2526 r = iommu_present(&pci_bus_type); 2548 2527 break; 2528 + #endif 2549 2529 case KVM_CAP_MCE: 2550 2530 r = KVM_MAX_MCE_BANKS; 2551 2531 break; ··· 2703 2679 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2704 2680 struct kvm_lapic_state *s) 2705 2681 { 2682 + kvm_x86_ops->sync_pir_to_irr(vcpu); 2706 2683 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2707 2684 2708 2685 return 0; ··· 2721 2696 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2722 2697 struct kvm_interrupt *irq) 2723 2698 { 2724 - if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) 2699 + if (irq->irq >= KVM_NR_INTERRUPTS) 2725 2700 return -EINVAL; 2726 2701 if (irqchip_in_kernel(vcpu->kvm)) 2727 2702 return -ENXIO; ··· 2844 2819 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2845 2820 events->nmi.pad = 0; 2846 2821 2847 - events->sipi_vector = vcpu->arch.sipi_vector; 2822 + events->sipi_vector = 
0; /* never valid when reporting to user space */ 2848 2823 2849 2824 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2850 - | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2851 2825 | KVM_VCPUEVENT_VALID_SHADOW); 2852 2826 memset(&events->reserved, 0, sizeof(events->reserved)); 2853 2827 } ··· 2877 2853 vcpu->arch.nmi_pending = events->nmi.pending; 2878 2854 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 2879 2855 2880 - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2881 - vcpu->arch.sipi_vector = events->sipi_vector; 2856 + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && 2857 + kvm_vcpu_has_lapic(vcpu)) 2858 + vcpu->arch.apic->sipi_vector = events->sipi_vector; 2882 2859 2883 2860 kvm_make_request(KVM_REQ_EVENT, vcpu); 2884 2861 ··· 3503 3478 return r; 3504 3479 } 3505 3480 3506 - int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) 3481 + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, 3482 + bool line_status) 3507 3483 { 3508 3484 if (!irqchip_in_kernel(kvm)) 3509 3485 return -ENXIO; 3510 3486 3511 3487 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 3512 - irq_event->irq, irq_event->level); 3488 + irq_event->irq, irq_event->level, 3489 + line_status); 3513 3490 return 0; 3514 3491 } 3515 3492 ··· 4779 4752 } 4780 4753 4781 4754 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, 4782 - bool write_fault_to_shadow_pgtable) 4755 + bool write_fault_to_shadow_pgtable, 4756 + int emulation_type) 4783 4757 { 4784 4758 gpa_t gpa = cr2; 4785 4759 pfn_t pfn; 4760 + 4761 + if (emulation_type & EMULTYPE_NO_REEXECUTE) 4762 + return false; 4786 4763 4787 4764 if (!vcpu->arch.mmu.direct_map) { 4788 4765 /* ··· 4930 4899 if (r != EMULATION_OK) { 4931 4900 if (emulation_type & EMULTYPE_TRAP_UD) 4932 4901 return EMULATE_FAIL; 4933 - if (reexecute_instruction(vcpu, cr2, 4934 - write_fault_to_spt)) 4902 + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, 4903 + emulation_type)) 4935 
4904 return EMULATE_DONE; 4936 4905 if (emulation_type & EMULTYPE_SKIP) 4937 4906 return EMULATE_FAIL; ··· 4961 4930 return EMULATE_DONE; 4962 4931 4963 4932 if (r == EMULATION_FAILED) { 4964 - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt)) 4933 + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, 4934 + emulation_type)) 4965 4935 return EMULATE_DONE; 4966 4936 4967 4937 return handle_emulation_failure(vcpu); ··· 5673 5641 #endif 5674 5642 } 5675 5643 5676 - static void update_eoi_exitmap(struct kvm_vcpu *vcpu) 5644 + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 5677 5645 { 5678 5646 u64 eoi_exit_bitmap[4]; 5647 + u32 tmr[8]; 5648 + 5649 + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) 5650 + return; 5679 5651 5680 5652 memset(eoi_exit_bitmap, 0, 32); 5653 + memset(tmr, 0, 32); 5681 5654 5682 - kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); 5655 + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); 5683 5656 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); 5657 + kvm_apic_update_tmr(vcpu, tmr); 5684 5658 } 5685 5659 5686 5660 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ··· 5694 5656 int r; 5695 5657 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5696 5658 vcpu->run->request_interrupt_window; 5697 - bool req_immediate_exit = 0; 5659 + bool req_immediate_exit = false; 5698 5660 5699 5661 if (vcpu->requests) { 5700 5662 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) ··· 5736 5698 record_steal_time(vcpu); 5737 5699 if (kvm_check_request(KVM_REQ_NMI, vcpu)) 5738 5700 process_nmi(vcpu); 5739 - req_immediate_exit = 5740 - kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); 5741 5701 if (kvm_check_request(KVM_REQ_PMU, vcpu)) 5742 5702 kvm_handle_pmu_event(vcpu); 5743 5703 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5744 5704 kvm_deliver_pmi(vcpu); 5745 - if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) 5746 - update_eoi_exitmap(vcpu); 5705 + if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) 5706 + vcpu_scan_ioapic(vcpu); 5747 5707 
} 5748 5708 5749 5709 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5710 + kvm_apic_accept_events(vcpu); 5711 + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 5712 + r = 1; 5713 + goto out; 5714 + } 5715 + 5750 5716 inject_pending_event(vcpu); 5751 5717 5752 5718 /* enable NMI/IRQ window open exits if needed */ 5753 5719 if (vcpu->arch.nmi_pending) 5754 - kvm_x86_ops->enable_nmi_window(vcpu); 5720 + req_immediate_exit = 5721 + kvm_x86_ops->enable_nmi_window(vcpu) != 0; 5755 5722 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) 5756 - kvm_x86_ops->enable_irq_window(vcpu); 5723 + req_immediate_exit = 5724 + kvm_x86_ops->enable_irq_window(vcpu) != 0; 5757 5725 5758 5726 if (kvm_lapic_enabled(vcpu)) { 5759 5727 /* ··· 5838 5794 5839 5795 vcpu->mode = OUTSIDE_GUEST_MODE; 5840 5796 smp_wmb(); 5841 - local_irq_enable(); 5797 + 5798 + /* Interrupt is enabled by handle_external_intr() */ 5799 + kvm_x86_ops->handle_external_intr(vcpu); 5842 5800 5843 5801 ++vcpu->stat.exits; 5844 5802 ··· 5889 5843 int r; 5890 5844 struct kvm *kvm = vcpu->kvm; 5891 5845 5892 - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 5893 - pr_debug("vcpu %d received sipi with vector # %x\n", 5894 - vcpu->vcpu_id, vcpu->arch.sipi_vector); 5895 - kvm_lapic_reset(vcpu); 5896 - r = kvm_vcpu_reset(vcpu); 5897 - if (r) 5898 - return r; 5899 - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5900 - } 5901 - 5902 5846 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5903 5847 r = vapic_enter(vcpu); 5904 5848 if (r) { ··· 5905 5869 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5906 5870 kvm_vcpu_block(vcpu); 5907 5871 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 5908 - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) 5909 - { 5872 + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { 5873 + kvm_apic_accept_events(vcpu); 5910 5874 switch(vcpu->arch.mp_state) { 5911 5875 case KVM_MP_STATE_HALTED: 5912 5876 vcpu->arch.mp_state = ··· 5914 5878 case KVM_MP_STATE_RUNNABLE: 5915 
5879 vcpu->arch.apf.halted = false; 5916 5880 break; 5917 - case KVM_MP_STATE_SIPI_RECEIVED: 5881 + case KVM_MP_STATE_INIT_RECEIVED: 5882 + break; 5918 5883 default: 5919 5884 r = -EINTR; 5920 5885 break; ··· 6050 6013 6051 6014 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 6052 6015 kvm_vcpu_block(vcpu); 6016 + kvm_apic_accept_events(vcpu); 6053 6017 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 6054 6018 r = -EAGAIN; 6055 6019 goto out; ··· 6207 6169 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 6208 6170 struct kvm_mp_state *mp_state) 6209 6171 { 6172 + kvm_apic_accept_events(vcpu); 6210 6173 mp_state->mp_state = vcpu->arch.mp_state; 6211 6174 return 0; 6212 6175 } ··· 6215 6176 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 6216 6177 struct kvm_mp_state *mp_state) 6217 6178 { 6218 - vcpu->arch.mp_state = mp_state->mp_state; 6179 + if (!kvm_vcpu_has_lapic(vcpu) && 6180 + mp_state->mp_state != KVM_MP_STATE_RUNNABLE) 6181 + return -EINVAL; 6182 + 6183 + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { 6184 + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 6185 + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); 6186 + } else 6187 + vcpu->arch.mp_state = mp_state->mp_state; 6219 6188 kvm_make_request(KVM_REQ_EVENT, vcpu); 6220 6189 return 0; 6221 6190 } ··· 6522 6475 r = vcpu_load(vcpu); 6523 6476 if (r) 6524 6477 return r; 6525 - r = kvm_vcpu_reset(vcpu); 6526 - if (r == 0) 6527 - r = kvm_mmu_setup(vcpu); 6478 + kvm_vcpu_reset(vcpu); 6479 + r = kvm_mmu_setup(vcpu); 6528 6480 vcpu_put(vcpu); 6529 6481 6530 6482 return r; ··· 6560 6514 kvm_x86_ops->vcpu_free(vcpu); 6561 6515 } 6562 6516 6563 - static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) 6517 + void kvm_vcpu_reset(struct kvm_vcpu *vcpu) 6564 6518 { 6565 6519 atomic_set(&vcpu->arch.nmi_queued, 0); 6566 6520 vcpu->arch.nmi_pending = 0; ··· 6587 6541 vcpu->arch.regs_avail = ~0; 6588 6542 vcpu->arch.regs_dirty = ~0; 6589 6543 6590 - return 
kvm_x86_ops->vcpu_reset(vcpu); 6544 + kvm_x86_ops->vcpu_reset(vcpu); 6545 + } 6546 + 6547 + void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) 6548 + { 6549 + struct kvm_segment cs; 6550 + 6551 + kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 6552 + cs.selector = vector << 8; 6553 + cs.base = vector << 12; 6554 + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); 6555 + kvm_rip_write(vcpu, 0); 6591 6556 } 6592 6557 6593 6558 int kvm_arch_hardware_enable(void *garbage) ··· 6763 6706 } 6764 6707 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 6765 6708 6766 - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6709 + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { 6710 + r = -ENOMEM; 6767 6711 goto fail_free_mce_banks; 6712 + } 6768 6713 6769 6714 r = fx_init(vcpu); 6770 6715 if (r) ··· 6870 6811 6871 6812 void kvm_arch_destroy_vm(struct kvm *kvm) 6872 6813 { 6814 + if (current->mm == kvm->mm) { 6815 + /* 6816 + * Free memory regions allocated on behalf of userspace, 6817 + * unless the memory map has changed due to process exit 6818 + * or fd copying. 
6819 + */ 6820 + struct kvm_userspace_memory_region mem; 6821 + memset(&mem, 0, sizeof(mem)); 6822 + mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 6823 + kvm_set_memory_region(kvm, &mem); 6824 + 6825 + mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 6826 + kvm_set_memory_region(kvm, &mem); 6827 + 6828 + mem.slot = TSS_PRIVATE_MEMSLOT; 6829 + kvm_set_memory_region(kvm, &mem); 6830 + } 6873 6831 kvm_iommu_unmap_guest(kvm); 6874 6832 kfree(kvm->arch.vpic); 6875 6833 kfree(kvm->arch.vioapic); ··· 6979 6903 6980 6904 int kvm_arch_prepare_memory_region(struct kvm *kvm, 6981 6905 struct kvm_memory_slot *memslot, 6982 - struct kvm_memory_slot old, 6983 6906 struct kvm_userspace_memory_region *mem, 6984 - bool user_alloc) 6907 + enum kvm_mr_change change) 6985 6908 { 6986 - int npages = memslot->npages; 6987 - 6988 6909 /* 6989 6910 * Only private memory slots need to be mapped here since 6990 6911 * KVM_SET_MEMORY_REGION ioctl is no longer supported. 6991 6912 */ 6992 - if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) { 6913 + if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { 6993 6914 unsigned long userspace_addr; 6994 6915 6995 6916 /* 6996 6917 * MAP_SHARED to prevent internal slot pages from being moved 6997 6918 * by fork()/COW. 
6998 6919 */ 6999 - userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE, 6920 + userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, 7000 6921 PROT_READ | PROT_WRITE, 7001 6922 MAP_SHARED | MAP_ANONYMOUS, 0); 7002 6923 ··· 7008 6935 7009 6936 void kvm_arch_commit_memory_region(struct kvm *kvm, 7010 6937 struct kvm_userspace_memory_region *mem, 7011 - struct kvm_memory_slot old, 7012 - bool user_alloc) 6938 + const struct kvm_memory_slot *old, 6939 + enum kvm_mr_change change) 7013 6940 { 7014 6941 7015 - int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6942 + int nr_mmu_pages = 0; 7016 6943 7017 - if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) { 6944 + if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { 7018 6945 int ret; 7019 6946 7020 - ret = vm_munmap(old.userspace_addr, 7021 - old.npages * PAGE_SIZE); 6947 + ret = vm_munmap(old->userspace_addr, 6948 + old->npages * PAGE_SIZE); 7022 6949 if (ret < 0) 7023 6950 printk(KERN_WARNING 7024 6951 "kvm_vm_ioctl_set_memory_region: " ··· 7035 6962 * Existing largepage mappings are destroyed here and new ones will 7036 6963 * not be created until the end of the logging. 7037 6964 */ 7038 - if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 6965 + if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7039 6966 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7040 6967 /* 7041 6968 * If memory slot is created, or moved, we need to clear all 7042 6969 * mmio sptes. 
7043 6970 */ 7044 - if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { 7045 - kvm_mmu_zap_all(kvm); 6971 + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 6972 + kvm_mmu_zap_mmio_sptes(kvm); 7046 6973 kvm_reload_remote_mmus(kvm); 7047 6974 } 7048 6975 } ··· 7064 6991 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 7065 6992 !vcpu->arch.apf.halted) 7066 6993 || !list_empty_careful(&vcpu->async_pf.done) 7067 - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6994 + || kvm_apic_has_events(vcpu) 7068 6995 || atomic_read(&vcpu->arch.nmi_queued) || 7069 6996 (kvm_arch_interrupt_allowed(vcpu) && 7070 6997 kvm_cpu_has_interrupt(vcpu));
+6 -5
drivers/s390/kvm/kvm_virtio.c
··· 443 443 } 444 444 /* 445 445 * Init function for virtio 446 - * devices are in a single page above top of "normal" mem 446 + * devices are in a single page above top of "normal" + standby mem 447 447 */ 448 448 static int __init kvm_devices_init(void) 449 449 { 450 450 int rc; 451 + unsigned long total_memory_size = sclp_get_rzm() * sclp_get_rnmax(); 451 452 452 453 if (!MACHINE_IS_KVM) 453 454 return -ENODEV; 454 455 455 - if (test_devices_support(real_memory_size) < 0) 456 + if (test_devices_support(total_memory_size) < 0) 456 457 return -ENODEV; 457 458 458 - rc = vmem_add_mapping(real_memory_size, PAGE_SIZE); 459 + rc = vmem_add_mapping(total_memory_size, PAGE_SIZE); 459 460 if (rc) 460 461 return rc; 461 462 462 - kvm_devices = (void *) real_memory_size; 463 + kvm_devices = (void *) total_memory_size; 463 464 464 465 kvm_root = root_device_register("kvm_s390"); 465 466 if (IS_ERR(kvm_root)) { 466 467 rc = PTR_ERR(kvm_root); 467 468 printk(KERN_ERR "Could not register kvm_s390 root device"); 468 - vmem_remove_mapping(real_memory_size, PAGE_SIZE); 469 + vmem_remove_mapping(total_memory_size, PAGE_SIZE); 469 470 return rc; 470 471 } 471 472
+12 -8
drivers/s390/kvm/virtio_ccw.c
··· 31 31 #include <asm/irq.h> 32 32 #include <asm/cio.h> 33 33 #include <asm/ccwdev.h> 34 + #include <asm/virtio-ccw.h> 34 35 35 36 /* 36 37 * virtio related functions ··· 78 77 void *queue; 79 78 struct vq_info_block *info_block; 80 79 struct list_head node; 80 + long cookie; 81 81 }; 82 - 83 - #define KVM_VIRTIO_CCW_RING_ALIGN 4096 84 - 85 - #define KVM_S390_VIRTIO_CCW_NOTIFY 3 86 82 87 83 #define CCW_CMD_SET_VQ 0x13 88 84 #define CCW_CMD_VDEV_RESET 0x33 ··· 133 135 do { 134 136 spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags); 135 137 ret = ccw_device_start(vcdev->cdev, ccw, intparm, 0, 0); 136 - if (!ret) 138 + if (!ret) { 139 + if (!vcdev->curr_io) 140 + vcdev->err = 0; 137 141 vcdev->curr_io |= flag; 142 + } 138 143 spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags); 139 144 cpu_relax(); 140 145 } while (ret == -EBUSY); ··· 146 145 } 147 146 148 147 static inline long do_kvm_notify(struct subchannel_id schid, 149 - unsigned long queue_index) 148 + unsigned long queue_index, 149 + long cookie) 150 150 { 151 151 register unsigned long __nr asm("1") = KVM_S390_VIRTIO_CCW_NOTIFY; 152 152 register struct subchannel_id __schid asm("2") = schid; 153 153 register unsigned long __index asm("3") = queue_index; 154 154 register long __rc asm("2"); 155 + register long __cookie asm("4") = cookie; 155 156 156 157 asm volatile ("diag 2,4,0x500\n" 157 - : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index) 158 + : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index), 159 + "d"(__cookie) 158 160 : "memory", "cc"); 159 161 return __rc; 160 162 } ··· 170 166 171 167 vcdev = to_vc_device(info->vq->vdev); 172 168 ccw_device_get_schid(vcdev->cdev, &schid); 173 - do_kvm_notify(schid, vq->index); 169 + info->cookie = do_kvm_notify(schid, vq->index, info->cookie); 174 170 } 175 171 176 172 static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev,
+112 -54
include/linux/kvm_host.h
··· 117 117 #define KVM_REQ_APF_HALT 12 118 118 #define KVM_REQ_STEAL_UPDATE 13 119 119 #define KVM_REQ_NMI 14 120 - #define KVM_REQ_IMMEDIATE_EXIT 15 121 - #define KVM_REQ_PMU 16 122 - #define KVM_REQ_PMI 17 123 - #define KVM_REQ_WATCHDOG 18 124 - #define KVM_REQ_MASTERCLOCK_UPDATE 19 125 - #define KVM_REQ_MCLOCK_INPROGRESS 20 126 - #define KVM_REQ_EPR_EXIT 21 127 - #define KVM_REQ_EOIBITMAP 22 120 + #define KVM_REQ_PMU 15 121 + #define KVM_REQ_PMI 16 122 + #define KVM_REQ_WATCHDOG 17 123 + #define KVM_REQ_MASTERCLOCK_UPDATE 18 124 + #define KVM_REQ_MCLOCK_INPROGRESS 19 125 + #define KVM_REQ_EPR_EXIT 20 126 + #define KVM_REQ_SCAN_IOAPIC 21 128 127 129 128 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 130 129 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 ··· 131 132 struct kvm; 132 133 struct kvm_vcpu; 133 134 extern struct kmem_cache *kvm_vcpu_cache; 135 + 136 + extern raw_spinlock_t kvm_lock; 137 + extern struct list_head vm_list; 134 138 135 139 struct kvm_io_range { 136 140 gpa_t addr; ··· 151 149 enum kvm_bus { 152 150 KVM_MMIO_BUS, 153 151 KVM_PIO_BUS, 152 + KVM_VIRTIO_CCW_NOTIFY_BUS, 154 153 KVM_NR_BUSES 155 154 }; 156 155 ··· 255 252 bool dy_eligible; 256 253 } spin_loop; 257 254 #endif 255 + bool preempted; 258 256 struct kvm_vcpu_arch arch; 259 257 }; 260 258 ··· 289 285 u32 gsi; 290 286 u32 type; 291 287 int (*set)(struct kvm_kernel_irq_routing_entry *e, 292 - struct kvm *kvm, int irq_source_id, int level); 288 + struct kvm *kvm, int irq_source_id, int level, 289 + bool line_status); 293 290 union { 294 291 struct { 295 292 unsigned irqchip; ··· 301 296 struct hlist_node link; 302 297 }; 303 298 304 - #ifdef __KVM_HAVE_IOAPIC 299 + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 305 300 306 301 struct kvm_irq_routing_table { 307 - int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS]; 302 + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; 308 303 struct kvm_kernel_irq_routing_entry *rt_entries; 309 304 u32 nr_rt_entries; 310 305 /* ··· 390 385 long mmu_notifier_count; 391 386 #endif 
392 387 long tlbs_dirty; 388 + struct list_head devices; 393 389 }; 394 390 395 391 #define kvm_err(fmt, ...) \ ··· 430 424 int __must_check vcpu_load(struct kvm_vcpu *vcpu); 431 425 void vcpu_put(struct kvm_vcpu *vcpu); 432 426 427 + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 428 + int kvm_irqfd_init(void); 429 + void kvm_irqfd_exit(void); 430 + #else 431 + static inline int kvm_irqfd_init(void) 432 + { 433 + return 0; 434 + } 435 + 436 + static inline void kvm_irqfd_exit(void) 437 + { 438 + } 439 + #endif 433 440 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 434 441 struct module *module); 435 442 void kvm_exit(void); ··· 471 452 return slot; 472 453 } 473 454 455 + /* 456 + * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: 457 + * - create a new memory slot 458 + * - delete an existing memory slot 459 + * - modify an existing memory slot 460 + * -- move it in the guest physical memory space 461 + * -- just change its flags 462 + * 463 + * Since flags can be changed by some of these operations, the following 464 + * differentiation is the best we can do for __kvm_set_memory_region(): 465 + */ 466 + enum kvm_mr_change { 467 + KVM_MR_CREATE, 468 + KVM_MR_DELETE, 469 + KVM_MR_MOVE, 470 + KVM_MR_FLAGS_ONLY, 471 + }; 472 + 474 473 int kvm_set_memory_region(struct kvm *kvm, 475 - struct kvm_userspace_memory_region *mem, 476 - bool user_alloc); 474 + struct kvm_userspace_memory_region *mem); 477 475 int __kvm_set_memory_region(struct kvm *kvm, 478 - struct kvm_userspace_memory_region *mem, 479 - bool user_alloc); 476 + struct kvm_userspace_memory_region *mem); 480 477 void kvm_arch_free_memslot(struct kvm_memory_slot *free, 481 478 struct kvm_memory_slot *dont); 482 479 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); 483 480 int kvm_arch_prepare_memory_region(struct kvm *kvm, 484 481 struct kvm_memory_slot *memslot, 485 - struct kvm_memory_slot old, 486 482 struct kvm_userspace_memory_region *mem, 487 - 
bool user_alloc); 483 + enum kvm_mr_change change); 488 484 void kvm_arch_commit_memory_region(struct kvm *kvm, 489 485 struct kvm_userspace_memory_region *mem, 490 - struct kvm_memory_slot old, 491 - bool user_alloc); 486 + const struct kvm_memory_slot *old, 487 + enum kvm_mr_change change); 492 488 bool kvm_largepages_enabled(void); 493 489 void kvm_disable_largepages(void); 494 490 /* flush all memory translations */ ··· 573 539 void kvm_flush_remote_tlbs(struct kvm *kvm); 574 540 void kvm_reload_remote_mmus(struct kvm *kvm); 575 541 void kvm_make_mclock_inprogress_request(struct kvm *kvm); 576 - void kvm_make_update_eoibitmap_request(struct kvm *kvm); 542 + void kvm_make_scan_ioapic_request(struct kvm *kvm); 577 543 578 544 long kvm_arch_dev_ioctl(struct file *filp, 579 545 unsigned int ioctl, unsigned long arg); ··· 589 555 struct kvm_dirty_log *log); 590 556 591 557 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 592 - struct 593 - kvm_userspace_memory_region *mem, 594 - bool user_alloc); 595 - int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); 558 + struct kvm_userspace_memory_region *mem); 559 + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, 560 + bool line_status); 596 561 long kvm_arch_vm_ioctl(struct file *filp, 597 562 unsigned int ioctl, unsigned long arg); 598 563 ··· 665 632 666 633 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); 667 634 void kvm_arch_destroy_vm(struct kvm *kvm); 668 - void kvm_free_all_assigned_devices(struct kvm *kvm); 669 635 void kvm_arch_sync_events(struct kvm *kvm); 670 636 671 637 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); ··· 716 684 void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 717 685 bool mask); 718 686 719 - #ifdef __KVM_HAVE_IOAPIC 720 - void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, 721 - union kvm_ioapic_redirect_entry *entry, 722 - unsigned long *deliver_bitmask); 723 - #endif 724 - int 
kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); 687 + int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, 688 + bool line_status); 725 689 int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); 726 690 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, 727 - int irq_source_id, int level); 691 + int irq_source_id, int level, bool line_status); 728 692 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); 729 693 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 730 694 void kvm_register_irq_ack_notifier(struct kvm *kvm, ··· 733 705 /* For vcpu->arch.iommu_flags */ 734 706 #define KVM_IOMMU_CACHE_COHERENCY 0x1 735 707 736 - #ifdef CONFIG_IOMMU_API 708 + #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 737 709 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); 738 710 void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); 739 711 int kvm_iommu_map_guest(struct kvm *kvm); ··· 742 714 struct kvm_assigned_dev_kernel *assigned_dev); 743 715 int kvm_deassign_device(struct kvm *kvm, 744 716 struct kvm_assigned_dev_kernel *assigned_dev); 745 - #else /* CONFIG_IOMMU_API */ 717 + #else 746 718 static inline int kvm_iommu_map_pages(struct kvm *kvm, 747 719 struct kvm_memory_slot *slot) 748 720 { ··· 754 726 { 755 727 } 756 728 757 - static inline int kvm_iommu_map_guest(struct kvm *kvm) 758 - { 759 - return -ENODEV; 760 - } 761 - 762 729 static inline int kvm_iommu_unmap_guest(struct kvm *kvm) 763 730 { 764 731 return 0; 765 732 } 766 - 767 - static inline int kvm_assign_device(struct kvm *kvm, 768 - struct kvm_assigned_dev_kernel *assigned_dev) 769 - { 770 - return 0; 771 - } 772 - 773 - static inline int kvm_deassign_device(struct kvm *kvm, 774 - struct kvm_assigned_dev_kernel *assigned_dev) 775 - { 776 - return 0; 777 - } 778 - #endif /* CONFIG_IOMMU_API */ 733 + #endif 779 734 780 735 static inline void 
__guest_enter(void) 781 736 { ··· 932 921 } 933 922 #endif 934 923 935 - #ifdef KVM_CAP_IRQ_ROUTING 924 + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 936 925 937 926 #define KVM_MAX_IRQ_ROUTES 1024 938 927 ··· 941 930 const struct kvm_irq_routing_entry *entries, 942 931 unsigned nr, 943 932 unsigned flags); 933 + int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, 934 + struct kvm_kernel_irq_routing_entry *e, 935 + const struct kvm_irq_routing_entry *ue); 944 936 void kvm_free_irq_routing(struct kvm *kvm); 945 937 946 938 int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); ··· 1012 998 1013 999 #endif 1014 1000 1015 - #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT 1001 + #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 1016 1002 1017 1003 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 1018 1004 unsigned long arg); 1005 + 1006 + void kvm_free_all_assigned_devices(struct kvm *kvm); 1019 1007 1020 1008 #else 1021 1009 ··· 1026 1010 { 1027 1011 return -ENOTTY; 1028 1012 } 1013 + 1014 + static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} 1029 1015 1030 1016 #endif 1031 1017 ··· 1045 1027 return false; 1046 1028 } 1047 1029 } 1030 + 1031 + extern bool kvm_rebooting; 1032 + 1033 + struct kvm_device_ops; 1034 + 1035 + struct kvm_device { 1036 + struct kvm_device_ops *ops; 1037 + struct kvm *kvm; 1038 + void *private; 1039 + struct list_head vm_node; 1040 + }; 1041 + 1042 + /* create, destroy, and name are mandatory */ 1043 + struct kvm_device_ops { 1044 + const char *name; 1045 + int (*create)(struct kvm_device *dev, u32 type); 1046 + 1047 + /* 1048 + * Destroy is responsible for freeing dev. 1049 + * 1050 + * Destroy may be called before or after destructors are called 1051 + * on emulated I/O regions, depending on whether a reference is 1052 + * held by a vcpu or other kvm component that gets destroyed 1053 + * after the emulated I/O. 
1054 + */ 1055 + void (*destroy)(struct kvm_device *dev); 1056 + 1057 + int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); 1058 + int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); 1059 + int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); 1060 + long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, 1061 + unsigned long arg); 1062 + }; 1063 + 1064 + void kvm_device_get(struct kvm_device *dev); 1065 + void kvm_device_put(struct kvm_device *dev); 1066 + struct kvm_device *kvm_device_from_filp(struct file *filp); 1067 + 1068 + extern struct kvm_device_ops kvm_mpic_ops; 1069 + extern struct kvm_device_ops kvm_xics_ops; 1048 1070 1049 1071 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 1050 1072
+10 -2
include/trace/events/kvm.h
··· 37 37 __entry->errno < 0 ? -__entry->errno : __entry->reason) 38 38 ); 39 39 40 - #if defined(__KVM_HAVE_IRQ_LINE) 40 + #if defined(CONFIG_HAVE_KVM_IRQCHIP) 41 41 TRACE_EVENT(kvm_set_irq, 42 42 TP_PROTO(unsigned int gsi, int level, int irq_source_id), 43 43 TP_ARGS(gsi, level, irq_source_id), ··· 122 122 {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ 123 123 {KVM_IRQCHIP_IOAPIC, "IOAPIC"} 124 124 125 + #endif /* defined(__KVM_HAVE_IOAPIC) */ 126 + 127 + #if defined(CONFIG_HAVE_KVM_IRQCHIP) 128 + 125 129 TRACE_EVENT(kvm_ack_irq, 126 130 TP_PROTO(unsigned int irqchip, unsigned int pin), 127 131 TP_ARGS(irqchip, pin), ··· 140 136 __entry->pin = pin; 141 137 ), 142 138 139 + #ifdef kvm_irqchips 143 140 TP_printk("irqchip %s pin %u", 144 141 __print_symbolic(__entry->irqchip, kvm_irqchips), 145 142 __entry->pin) 143 + #else 144 + TP_printk("irqchip %d pin %u", __entry->irqchip, __entry->pin) 145 + #endif 146 146 ); 147 147 148 + #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ 148 149 149 150 150 - #endif /* defined(__KVM_HAVE_IOAPIC) */ 151 151 152 152 #define KVM_TRACE_MMIO_READ_UNSATISFIED 0 153 153 #define KVM_TRACE_MMIO_READ 1
+39 -6
include/uapi/linux/kvm.h
··· 449 449 kvm_ioeventfd_flag_nr_datamatch, 450 450 kvm_ioeventfd_flag_nr_pio, 451 451 kvm_ioeventfd_flag_nr_deassign, 452 + kvm_ioeventfd_flag_nr_virtio_ccw_notify, 452 453 kvm_ioeventfd_flag_nr_max, 453 454 }; 454 455 455 456 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) 456 457 #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) 457 458 #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) 459 + #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ 460 + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) 458 461 459 462 #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) 460 463 ··· 561 558 #define KVM_CAP_MP_STATE 14 562 559 #define KVM_CAP_COALESCED_MMIO 15 563 560 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ 564 - #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT 565 561 #define KVM_CAP_DEVICE_ASSIGNMENT 17 566 - #endif 567 562 #define KVM_CAP_IOMMU 18 568 563 #ifdef __KVM_HAVE_MSI 569 564 #define KVM_CAP_DEVICE_MSI 20 ··· 577 576 #ifdef __KVM_HAVE_PIT 578 577 #define KVM_CAP_REINJECT_CONTROL 24 579 578 #endif 580 - #ifdef __KVM_HAVE_IOAPIC 581 579 #define KVM_CAP_IRQ_ROUTING 25 582 - #endif 583 580 #define KVM_CAP_IRQ_INJECT_STATUS 26 584 - #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT 585 581 #define KVM_CAP_DEVICE_DEASSIGNMENT 27 586 - #endif 587 582 #ifdef __KVM_HAVE_MSIX 588 583 #define KVM_CAP_DEVICE_MSIX 28 589 584 #endif ··· 662 665 #define KVM_CAP_PPC_EPR 86 663 666 #define KVM_CAP_ARM_PSCI 87 664 667 #define KVM_CAP_ARM_SET_DEVICE_ADDR 88 668 + #define KVM_CAP_DEVICE_CTRL 89 669 + #define KVM_CAP_IRQ_MPIC 90 670 + #define KVM_CAP_PPC_RTAS 91 671 + #define KVM_CAP_IRQ_XICS 92 665 672 666 673 #ifdef KVM_CAP_IRQ_ROUTING 667 674 ··· 819 818 }; 820 819 821 820 /* 821 + * Device control API, available with KVM_CAP_DEVICE_CTRL 822 + */ 823 + #define KVM_CREATE_DEVICE_TEST 1 824 + 825 + struct kvm_create_device { 826 + __u32 type; /* in: KVM_DEV_TYPE_xxx */ 827 + __u32 fd; 
/* out: device handle */ 828 + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ 829 + }; 830 + 831 + struct kvm_device_attr { 832 + __u32 flags; /* no flags currently defined */ 833 + __u32 group; /* device-defined */ 834 + __u64 attr; /* group-defined */ 835 + __u64 addr; /* userspace address of attr data */ 836 + }; 837 + 838 + #define KVM_DEV_TYPE_FSL_MPIC_20 1 839 + #define KVM_DEV_TYPE_FSL_MPIC_42 2 840 + #define KVM_DEV_TYPE_XICS 3 841 + 842 + /* 822 843 * ioctls for VM fds 823 844 */ 824 845 #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) ··· 927 904 #define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) 928 905 /* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */ 929 906 #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) 907 + /* Available with KVM_CAP_PPC_RTAS */ 908 + #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) 909 + 910 + /* ioctl for vm fd */ 911 + #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) 912 + 913 + /* ioctls for fds returned by KVM_CREATE_DEVICE */ 914 + #define KVM_SET_DEVICE_ATTR _IOW(KVMIO, 0xe1, struct kvm_device_attr) 915 + #define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr) 916 + #define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr) 930 917 931 918 /* 932 919 * ioctls for vcpu fds
+3
virt/kvm/Kconfig
··· 6 6 config HAVE_KVM_IRQCHIP 7 7 bool 8 8 9 + config HAVE_KVM_IRQ_ROUTING 10 + bool 11 + 9 12 config HAVE_KVM_EVENTFD 10 13 bool 11 14 select EVENTFD
+7 -36
virt/kvm/assigned-dev.c
··· 80 80 spin_lock(&assigned_dev->intx_mask_lock); 81 81 if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) 82 82 kvm_set_irq(assigned_dev->kvm, 83 - assigned_dev->irq_source_id, vector, 1); 83 + assigned_dev->irq_source_id, vector, 1, 84 + false); 84 85 spin_unlock(&assigned_dev->intx_mask_lock); 85 86 } else 86 87 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 87 - vector, 1); 88 + vector, 1, false); 88 89 } 89 90 90 91 static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) ··· 166 165 container_of(kian, struct kvm_assigned_dev_kernel, 167 166 ack_notifier); 168 167 169 - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 168 + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); 170 169 171 170 spin_lock(&dev->intx_mask_lock); 172 171 ··· 189 188 190 189 if (reassert) 191 190 kvm_set_irq(dev->kvm, dev->irq_source_id, 192 - dev->guest_irq, 1); 191 + dev->guest_irq, 1, false); 193 192 } 194 193 195 194 spin_unlock(&dev->intx_mask_lock); ··· 203 202 &assigned_dev->ack_notifier); 204 203 205 204 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 206 - assigned_dev->guest_irq, 0); 205 + assigned_dev->guest_irq, 0, false); 207 206 208 207 if (assigned_dev->irq_source_id != -1) 209 208 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); ··· 902 901 if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { 903 902 if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { 904 903 kvm_set_irq(match->kvm, match->irq_source_id, 905 - match->guest_irq, 0); 904 + match->guest_irq, 0, false); 906 905 /* 907 906 * Masking at hardware-level is performed on demand, 908 907 * i.e. when an IRQ actually arrives at the host. 
··· 983 982 goto out; 984 983 break; 985 984 } 986 - #ifdef KVM_CAP_IRQ_ROUTING 987 - case KVM_SET_GSI_ROUTING: { 988 - struct kvm_irq_routing routing; 989 - struct kvm_irq_routing __user *urouting; 990 - struct kvm_irq_routing_entry *entries; 991 - 992 - r = -EFAULT; 993 - if (copy_from_user(&routing, argp, sizeof(routing))) 994 - goto out; 995 - r = -EINVAL; 996 - if (routing.nr >= KVM_MAX_IRQ_ROUTES) 997 - goto out; 998 - if (routing.flags) 999 - goto out; 1000 - r = -ENOMEM; 1001 - entries = vmalloc(routing.nr * sizeof(*entries)); 1002 - if (!entries) 1003 - goto out; 1004 - r = -EFAULT; 1005 - urouting = argp; 1006 - if (copy_from_user(entries, urouting->entries, 1007 - routing.nr * sizeof(*entries))) 1008 - goto out_free_irq_routing; 1009 - r = kvm_set_irq_routing(kvm, entries, routing.nr, 1010 - routing.flags); 1011 - out_free_irq_routing: 1012 - vfree(entries); 1013 - break; 1014 - } 1015 - #endif /* KVM_CAP_IRQ_ROUTING */ 1016 985 #ifdef __KVM_HAVE_MSIX 1017 986 case KVM_ASSIGN_SET_MSIX_NR: { 1018 987 struct kvm_assigned_msix_nr entry_nr;
+33 -20
virt/kvm/eventfd.c
··· 35 35 36 36 #include "iodev.h" 37 37 38 - #ifdef __KVM_HAVE_IOAPIC 38 + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 39 39 /* 40 40 * -------------------------------------------------------------------- 41 41 * irqfd: Allows an fd to be used to inject an interrupt to the guest ··· 100 100 struct kvm *kvm = irqfd->kvm; 101 101 102 102 if (!irqfd->resampler) { 103 - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); 104 - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); 103 + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1, 104 + false); 105 + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0, 106 + false); 105 107 } else 106 108 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 107 - irqfd->gsi, 1); 109 + irqfd->gsi, 1, false); 108 110 } 109 111 110 112 /* ··· 123 121 resampler = container_of(kian, struct _irqfd_resampler, notifier); 124 122 125 123 kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 126 - resampler->notifier.gsi, 0); 124 + resampler->notifier.gsi, 0, false); 127 125 128 126 rcu_read_lock(); 129 127 ··· 148 146 list_del(&resampler->link); 149 147 kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); 150 148 kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 151 - resampler->notifier.gsi, 0); 149 + resampler->notifier.gsi, 0, false); 152 150 kfree(resampler); 153 151 } 154 152 ··· 227 225 irq = rcu_dereference(irqfd->irq_entry); 228 226 /* An event has been signaled, inject an interrupt */ 229 227 if (irq) 230 - kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); 228 + kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, 229 + false); 231 230 else 232 231 schedule_work(&irqfd->inject); 233 232 rcu_read_unlock(); ··· 433 430 void 434 431 kvm_eventfd_init(struct kvm *kvm) 435 432 { 436 - #ifdef __KVM_HAVE_IOAPIC 433 + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 437 434 spin_lock_init(&kvm->irqfds.lock); 438 435 INIT_LIST_HEAD(&kvm->irqfds.items); 439 436 INIT_LIST_HEAD(&kvm->irqfds.resampler_list); 
··· 442 439 INIT_LIST_HEAD(&kvm->ioeventfds); 443 440 } 444 441 445 - #ifdef __KVM_HAVE_IOAPIC 442 + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 446 443 /* 447 444 * shutdown any irqfd's that match fd+gsi 448 445 */ ··· 546 543 * aggregated from all vm* instances. We need our own isolated single-thread 547 544 * queue to prevent deadlock against flushing the normal work-queue. 548 545 */ 549 - static int __init irqfd_module_init(void) 546 + int kvm_irqfd_init(void) 550 547 { 551 548 irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); 552 549 if (!irqfd_cleanup_wq) ··· 555 552 return 0; 556 553 } 557 554 558 - static void __exit irqfd_module_exit(void) 555 + void kvm_irqfd_exit(void) 559 556 { 560 557 destroy_workqueue(irqfd_cleanup_wq); 561 558 } 562 - 563 - module_init(irqfd_module_init); 564 - module_exit(irqfd_module_exit); 565 559 #endif 566 560 567 561 /* ··· 577 577 struct eventfd_ctx *eventfd; 578 578 u64 datamatch; 579 579 struct kvm_io_device dev; 580 + u8 bus_idx; 580 581 bool wildcard; 581 582 }; 582 583 ··· 670 669 struct _ioeventfd *_p; 671 670 672 671 list_for_each_entry(_p, &kvm->ioeventfds, list) 673 - if (_p->addr == p->addr && _p->length == p->length && 672 + if (_p->bus_idx == p->bus_idx && 673 + _p->addr == p->addr && _p->length == p->length && 674 674 (_p->wildcard || p->wildcard || 675 675 _p->datamatch == p->datamatch)) 676 676 return true; ··· 679 677 return false; 680 678 } 681 679 680 + static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags) 681 + { 682 + if (flags & KVM_IOEVENTFD_FLAG_PIO) 683 + return KVM_PIO_BUS; 684 + if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY) 685 + return KVM_VIRTIO_CCW_NOTIFY_BUS; 686 + return KVM_MMIO_BUS; 687 + } 688 + 682 689 static int 683 690 kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 684 691 { 685 - int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; 686 - enum kvm_bus bus_idx = pio ? 
KVM_PIO_BUS : KVM_MMIO_BUS; 692 + enum kvm_bus bus_idx; 687 693 struct _ioeventfd *p; 688 694 struct eventfd_ctx *eventfd; 689 695 int ret; 690 696 697 + bus_idx = ioeventfd_bus_from_flags(args->flags); 691 698 /* must be natural-word sized */ 692 699 switch (args->len) { 693 700 case 1: ··· 728 717 729 718 INIT_LIST_HEAD(&p->list); 730 719 p->addr = args->addr; 720 + p->bus_idx = bus_idx; 731 721 p->length = args->len; 732 722 p->eventfd = eventfd; 733 723 ··· 772 760 static int 773 761 kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 774 762 { 775 - int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; 776 - enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; 763 + enum kvm_bus bus_idx; 777 764 struct _ioeventfd *p, *tmp; 778 765 struct eventfd_ctx *eventfd; 779 766 int ret = -ENOENT; 780 767 768 + bus_idx = ioeventfd_bus_from_flags(args->flags); 781 769 eventfd = eventfd_ctx_fdget(args->fd); 782 770 if (IS_ERR(eventfd)) 783 771 return PTR_ERR(eventfd); ··· 787 775 list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { 788 776 bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); 789 777 790 - if (p->eventfd != eventfd || 778 + if (p->bus_idx != bus_idx || 779 + p->eventfd != eventfd || 791 780 p->addr != args->addr || 792 781 p->length != args->len || 793 782 p->wildcard != wildcard)
+132 -31
virt/kvm/ioapic.c
··· 50 50 #else 51 51 #define ioapic_debug(fmt, arg...) 52 52 #endif 53 - static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq); 53 + static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq, 54 + bool line_status); 54 55 55 56 static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, 56 57 unsigned long addr, ··· 91 90 return result; 92 91 } 93 92 94 - static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) 93 + static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) 94 + { 95 + ioapic->rtc_status.pending_eoi = 0; 96 + bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); 97 + } 98 + 99 + static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) 100 + { 101 + bool new_val, old_val; 102 + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 103 + union kvm_ioapic_redirect_entry *e; 104 + 105 + e = &ioapic->redirtbl[RTC_GSI]; 106 + if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, 107 + e->fields.dest_mode)) 108 + return; 109 + 110 + new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); 111 + old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); 112 + 113 + if (new_val == old_val) 114 + return; 115 + 116 + if (new_val) { 117 + __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); 118 + ioapic->rtc_status.pending_eoi++; 119 + } else { 120 + __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); 121 + ioapic->rtc_status.pending_eoi--; 122 + } 123 + 124 + WARN_ON(ioapic->rtc_status.pending_eoi < 0); 125 + } 126 + 127 + void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) 128 + { 129 + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 130 + 131 + spin_lock(&ioapic->lock); 132 + __rtc_irq_eoi_tracking_restore_one(vcpu); 133 + spin_unlock(&ioapic->lock); 134 + } 135 + 136 + static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) 137 + { 138 + struct kvm_vcpu *vcpu; 139 + int i; 140 + 141 + if (RTC_GSI >= IOAPIC_NUM_PINS) 142 + return; 143 + 144 + 
rtc_irq_eoi_tracking_reset(ioapic); 145 + kvm_for_each_vcpu(i, vcpu, ioapic->kvm) 146 + __rtc_irq_eoi_tracking_restore_one(vcpu); 147 + } 148 + 149 + static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) 150 + { 151 + if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) 152 + --ioapic->rtc_status.pending_eoi; 153 + 154 + WARN_ON(ioapic->rtc_status.pending_eoi < 0); 155 + } 156 + 157 + static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) 158 + { 159 + if (ioapic->rtc_status.pending_eoi > 0) 160 + return true; /* coalesced */ 161 + 162 + return false; 163 + } 164 + 165 + static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx, 166 + bool line_status) 95 167 { 96 168 union kvm_ioapic_redirect_entry *pent; 97 169 int injected = -1; ··· 172 98 pent = &ioapic->redirtbl[idx]; 173 99 174 100 if (!pent->fields.mask) { 175 - injected = ioapic_deliver(ioapic, idx); 101 + injected = ioapic_deliver(ioapic, idx, line_status); 176 102 if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) 177 103 pent->fields.remote_irr = 1; 178 104 } ··· 193 119 smp_wmb(); 194 120 } 195 121 196 - void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 197 - u64 *eoi_exit_bitmap) 122 + void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, 123 + u32 *tmr) 198 124 { 199 125 struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 200 126 union kvm_ioapic_redirect_entry *e; 201 - struct kvm_lapic_irq irqe; 202 127 int index; 203 128 204 129 spin_lock(&ioapic->lock); 205 - /* traverse ioapic entry to set eoi exit bitmap*/ 206 130 for (index = 0; index < IOAPIC_NUM_PINS; index++) { 207 131 e = &ioapic->redirtbl[index]; 208 132 if (!e->fields.mask && 209 133 (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || 210 134 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, 211 - index))) { 212 - irqe.dest_id = e->fields.dest_id; 213 - irqe.vector = e->fields.vector; 214 - irqe.dest_mode = e->fields.dest_mode; 215 - 
irqe.delivery_mode = e->fields.delivery_mode << 8; 216 - kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap); 135 + index) || index == RTC_GSI)) { 136 + if (kvm_apic_match_dest(vcpu, NULL, 0, 137 + e->fields.dest_id, e->fields.dest_mode)) { 138 + __set_bit(e->fields.vector, 139 + (unsigned long *)eoi_exit_bitmap); 140 + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) 141 + __set_bit(e->fields.vector, 142 + (unsigned long *)tmr); 143 + } 217 144 } 218 145 } 219 146 spin_unlock(&ioapic->lock); 220 147 } 221 - EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap); 222 148 223 - void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm) 149 + #ifdef CONFIG_X86 150 + void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) 224 151 { 225 152 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 226 153 227 - if (!kvm_apic_vid_enabled(kvm) || !ioapic) 154 + if (!ioapic) 228 155 return; 229 - kvm_make_update_eoibitmap_request(kvm); 156 + kvm_make_scan_ioapic_request(kvm); 230 157 } 158 + #else 159 + void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) 160 + { 161 + return; 162 + } 163 + #endif 231 164 232 165 static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) 233 166 { ··· 276 195 kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); 277 196 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG 278 197 && ioapic->irr & (1 << index)) 279 - ioapic_service(ioapic, index); 280 - kvm_ioapic_make_eoibitmap_request(ioapic->kvm); 198 + ioapic_service(ioapic, index, false); 199 + kvm_vcpu_request_scan_ioapic(ioapic->kvm); 281 200 break; 282 201 } 283 202 } 284 203 285 - static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) 204 + static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status) 286 205 { 287 206 union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; 288 207 struct kvm_lapic_irq irqe; 208 + int ret; 289 209 290 210 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " 291 211 "vector=%x trig_mode=%x\n", ··· 302 220 
irqe.level = 1; 303 221 irqe.shorthand = 0; 304 222 305 - return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); 223 + if (irq == RTC_GSI && line_status) { 224 + BUG_ON(ioapic->rtc_status.pending_eoi != 0); 225 + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, 226 + ioapic->rtc_status.dest_map); 227 + ioapic->rtc_status.pending_eoi = ret; 228 + } else 229 + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); 230 + 231 + return ret; 306 232 } 307 233 308 234 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 309 - int level) 235 + int level, bool line_status) 310 236 { 311 237 u32 old_irr; 312 238 u32 mask = 1 << irq; ··· 334 244 ret = 1; 335 245 } else { 336 246 int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); 247 + 248 + if (irq == RTC_GSI && line_status && 249 + rtc_irq_check_coalesced(ioapic)) { 250 + ret = 0; /* coalesced */ 251 + goto out; 252 + } 337 253 ioapic->irr |= mask; 338 254 if ((edge && old_irr != ioapic->irr) || 339 255 (!edge && !entry.fields.remote_irr)) 340 - ret = ioapic_service(ioapic, irq); 256 + ret = ioapic_service(ioapic, irq, line_status); 341 257 else 342 258 ret = 0; /* report coalesced interrupt */ 343 259 } 260 + out: 344 261 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); 345 262 spin_unlock(&ioapic->lock); 346 263 ··· 364 267 spin_unlock(&ioapic->lock); 365 268 } 366 269 367 - static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, 368 - int trigger_mode) 270 + static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, 271 + struct kvm_ioapic *ioapic, int vector, int trigger_mode) 369 272 { 370 273 int i; 371 274 ··· 375 278 if (ent->fields.vector != vector) 376 279 continue; 377 280 281 + if (i == RTC_GSI) 282 + rtc_irq_eoi(ioapic, vcpu); 378 283 /* 379 284 * We are dropping lock while calling ack notifiers because ack 380 285 * notifier callbacks for assigned devices call into IOAPIC ··· 395 296 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); 
396 297 ent->fields.remote_irr = 0; 397 298 if (!ent->fields.mask && (ioapic->irr & (1 << i))) 398 - ioapic_service(ioapic, i); 299 + ioapic_service(ioapic, i, false); 399 300 } 400 301 } 401 302 ··· 406 307 return test_bit(vector, ioapic->handled_vectors); 407 308 } 408 309 409 - void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) 310 + void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) 410 311 { 411 - struct kvm_ioapic *ioapic = kvm->arch.vioapic; 312 + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; 412 313 413 314 spin_lock(&ioapic->lock); 414 - __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); 315 + __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode); 415 316 spin_unlock(&ioapic->lock); 416 317 } 417 318 ··· 509 410 break; 510 411 #ifdef CONFIG_IA64 511 412 case IOAPIC_REG_EOI: 512 - __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG); 413 + __kvm_ioapic_update_eoi(NULL, ioapic, data, IOAPIC_LEVEL_TRIG); 513 414 break; 514 415 #endif 515 416 ··· 530 431 ioapic->ioregsel = 0; 531 432 ioapic->irr = 0; 532 433 ioapic->id = 0; 434 + rtc_irq_eoi_tracking_reset(ioapic); 533 435 update_handled_vectors(ioapic); 534 436 } 535 437 ··· 596 496 spin_lock(&ioapic->lock); 597 497 memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); 598 498 update_handled_vectors(ioapic); 599 - kvm_ioapic_make_eoibitmap_request(kvm); 499 + kvm_vcpu_request_scan_ioapic(kvm); 500 + kvm_rtc_eoi_tracking_restore_all(ioapic); 600 501 spin_unlock(&ioapic->lock); 601 502 return 0; 602 503 }
+20 -7
virt/kvm/ioapic.h
··· 34 34 #define IOAPIC_INIT 0x5 35 35 #define IOAPIC_EXTINT 0x7 36 36 37 + #ifdef CONFIG_X86 38 + #define RTC_GSI 8 39 + #else 40 + #define RTC_GSI -1U 41 + #endif 42 + 43 + struct rtc_status { 44 + int pending_eoi; 45 + DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); 46 + }; 47 + 37 48 struct kvm_ioapic { 38 49 u64 base_address; 39 50 u32 ioregsel; ··· 58 47 void (*ack_notifier)(void *opaque, int irq); 59 48 spinlock_t lock; 60 49 DECLARE_BITMAP(handled_vectors, 256); 50 + struct rtc_status rtc_status; 61 51 }; 62 52 63 53 #ifdef DEBUG ··· 79 67 return kvm->arch.vioapic; 80 68 } 81 69 70 + void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); 82 71 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 83 72 int short_hand, int dest, int dest_mode); 84 73 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 85 - void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); 74 + void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, 75 + int trigger_mode); 86 76 bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); 87 77 int kvm_ioapic_init(struct kvm *kvm); 88 78 void kvm_ioapic_destroy(struct kvm *kvm); 89 79 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 90 - int level); 80 + int level, bool line_status); 91 81 void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); 92 82 void kvm_ioapic_reset(struct kvm_ioapic *ioapic); 93 83 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 94 - struct kvm_lapic_irq *irq); 84 + struct kvm_lapic_irq *irq, unsigned long *dest_map); 95 85 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 96 86 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 97 - void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm); 98 - void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 99 - u64 *eoi_exit_bitmap); 100 - 87 + void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); 
88 + void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, 89 + u32 *tmr); 101 90 102 91 #endif
+16 -199
virt/kvm/irq_comm.c
··· 35 35 #include "ioapic.h" 36 36 37 37 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, 38 - struct kvm *kvm, int irq_source_id, int level) 38 + struct kvm *kvm, int irq_source_id, int level, 39 + bool line_status) 39 40 { 40 41 #ifdef CONFIG_X86 41 42 struct kvm_pic *pic = pic_irqchip(kvm); ··· 47 46 } 48 47 49 48 static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, 50 - struct kvm *kvm, int irq_source_id, int level) 49 + struct kvm *kvm, int irq_source_id, int level, 50 + bool line_status) 51 51 { 52 52 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 53 - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level); 53 + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, 54 + line_status); 54 55 } 55 56 56 57 inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) ··· 66 63 } 67 64 68 65 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 69 - struct kvm_lapic_irq *irq) 66 + struct kvm_lapic_irq *irq, unsigned long *dest_map) 70 67 { 71 68 int i, r = -1; 72 69 struct kvm_vcpu *vcpu, *lowest = NULL; ··· 77 74 irq->delivery_mode = APIC_DM_FIXED; 78 75 } 79 76 80 - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r)) 77 + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) 81 78 return r; 82 79 83 80 kvm_for_each_vcpu(i, vcpu, kvm) { ··· 91 88 if (!kvm_is_dm_lowest_prio(irq)) { 92 89 if (r < 0) 93 90 r = 0; 94 - r += kvm_apic_set_irq(vcpu, irq); 91 + r += kvm_apic_set_irq(vcpu, irq, dest_map); 95 92 } else if (kvm_lapic_enabled(vcpu)) { 96 93 if (!lowest) 97 94 lowest = vcpu; ··· 101 98 } 102 99 103 100 if (lowest) 104 - r = kvm_apic_set_irq(lowest, irq); 101 + r = kvm_apic_set_irq(lowest, irq, dest_map); 105 102 106 103 return r; 107 104 } ··· 124 121 } 125 122 126 123 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 127 - struct kvm *kvm, int irq_source_id, int level) 124 + struct kvm *kvm, int irq_source_id, int level, bool line_status) 128 125 { 
129 126 struct kvm_lapic_irq irq; 130 127 ··· 133 130 134 131 kvm_set_msi_irq(e, &irq); 135 132 136 - return kvm_irq_delivery_to_apic(kvm, NULL, &irq); 133 + return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); 137 134 } 138 135 139 136 ··· 145 142 146 143 kvm_set_msi_irq(e, &irq); 147 144 148 - if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r)) 145 + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) 149 146 return r; 150 147 else 151 148 return -EWOULDBLOCK; 152 - } 153 - 154 - int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) 155 - { 156 - struct kvm_kernel_irq_routing_entry route; 157 - 158 - if (!irqchip_in_kernel(kvm) || msi->flags != 0) 159 - return -EINVAL; 160 - 161 - route.msi.address_lo = msi->address_lo; 162 - route.msi.address_hi = msi->address_hi; 163 - route.msi.data = msi->data; 164 - 165 - return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); 166 - } 167 - 168 - /* 169 - * Return value: 170 - * < 0 Interrupt was ignored (masked or not delivered for other reasons) 171 - * = 0 Interrupt was coalesced (previous irq is still pending) 172 - * > 0 Number of CPUs interrupt was delivered to 173 - */ 174 - int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) 175 - { 176 - struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; 177 - int ret = -1, i = 0; 178 - struct kvm_irq_routing_table *irq_rt; 179 - 180 - trace_kvm_set_irq(irq, level, irq_source_id); 181 - 182 - /* Not possible to detect if the guest uses the PIC or the 183 - * IOAPIC. So set the bit in both. The guest will ignore 184 - * writes to the unused one. 
185 - */ 186 - rcu_read_lock(); 187 - irq_rt = rcu_dereference(kvm->irq_routing); 188 - if (irq < irq_rt->nr_rt_entries) 189 - hlist_for_each_entry(e, &irq_rt->map[irq], link) 190 - irq_set[i++] = *e; 191 - rcu_read_unlock(); 192 - 193 - while(i--) { 194 - int r; 195 - r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level); 196 - if (r < 0) 197 - continue; 198 - 199 - ret = r + ((ret < 0) ? 0 : ret); 200 - } 201 - 202 - return ret; 203 149 } 204 150 205 151 /* ··· 186 234 } 187 235 rcu_read_unlock(); 188 236 return ret; 189 - } 190 - 191 - bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) 192 - { 193 - struct kvm_irq_ack_notifier *kian; 194 - int gsi; 195 - 196 - rcu_read_lock(); 197 - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 198 - if (gsi != -1) 199 - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 200 - link) 201 - if (kian->gsi == gsi) { 202 - rcu_read_unlock(); 203 - return true; 204 - } 205 - 206 - rcu_read_unlock(); 207 - 208 - return false; 209 - } 210 - EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); 211 - 212 - void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 213 - { 214 - struct kvm_irq_ack_notifier *kian; 215 - int gsi; 216 - 217 - trace_kvm_ack_irq(irqchip, pin); 218 - 219 - rcu_read_lock(); 220 - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 221 - if (gsi != -1) 222 - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 223 - link) 224 - if (kian->gsi == gsi) 225 - kian->irq_acked(kian); 226 - rcu_read_unlock(); 227 - } 228 - 229 - void kvm_register_irq_ack_notifier(struct kvm *kvm, 230 - struct kvm_irq_ack_notifier *kian) 231 - { 232 - mutex_lock(&kvm->irq_lock); 233 - hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); 234 - mutex_unlock(&kvm->irq_lock); 235 - kvm_ioapic_make_eoibitmap_request(kvm); 236 - } 237 - 238 - void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 239 - struct kvm_irq_ack_notifier *kian) 240 - { 241 - 
mutex_lock(&kvm->irq_lock); 242 - hlist_del_init_rcu(&kian->link); 243 - mutex_unlock(&kvm->irq_lock); 244 - synchronize_rcu(); 245 - kvm_ioapic_make_eoibitmap_request(kvm); 246 237 } 247 238 248 239 int kvm_request_irq_source_id(struct kvm *kvm) ··· 271 376 rcu_read_unlock(); 272 377 } 273 378 274 - void kvm_free_irq_routing(struct kvm *kvm) 275 - { 276 - /* Called only during vm destruction. Nobody can use the pointer 277 - at this stage */ 278 - kfree(kvm->irq_routing); 279 - } 280 - 281 - static int setup_routing_entry(struct kvm_irq_routing_table *rt, 282 - struct kvm_kernel_irq_routing_entry *e, 283 - const struct kvm_irq_routing_entry *ue) 379 + int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, 380 + struct kvm_kernel_irq_routing_entry *e, 381 + const struct kvm_irq_routing_entry *ue) 284 382 { 285 383 int r = -EINVAL; 286 384 int delta; 287 385 unsigned max_pin; 288 - struct kvm_kernel_irq_routing_entry *ei; 289 386 290 - /* 291 - * Do not allow GSI to be mapped to the same irqchip more than once. 292 - * Allow only one to one mapping between GSI and MSI. 
293 - */ 294 - hlist_for_each_entry(ei, &rt->map[ue->gsi], link) 295 - if (ei->type == KVM_IRQ_ROUTING_MSI || 296 - ue->type == KVM_IRQ_ROUTING_MSI || 297 - ue->u.irqchip.irqchip == ei->irqchip.irqchip) 298 - return r; 299 - 300 - e->gsi = ue->gsi; 301 - e->type = ue->type; 302 387 switch (ue->type) { 303 388 case KVM_IRQ_ROUTING_IRQCHIP: 304 389 delta = 0; ··· 315 440 goto out; 316 441 } 317 442 318 - hlist_add_head(&e->link, &rt->map[e->gsi]); 319 443 r = 0; 320 444 out: 321 - return r; 322 - } 323 - 324 - 325 - int kvm_set_irq_routing(struct kvm *kvm, 326 - const struct kvm_irq_routing_entry *ue, 327 - unsigned nr, 328 - unsigned flags) 329 - { 330 - struct kvm_irq_routing_table *new, *old; 331 - u32 i, j, nr_rt_entries = 0; 332 - int r; 333 - 334 - for (i = 0; i < nr; ++i) { 335 - if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) 336 - return -EINVAL; 337 - nr_rt_entries = max(nr_rt_entries, ue[i].gsi); 338 - } 339 - 340 - nr_rt_entries += 1; 341 - 342 - new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) 343 - + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), 344 - GFP_KERNEL); 345 - 346 - if (!new) 347 - return -ENOMEM; 348 - 349 - new->rt_entries = (void *)&new->map[nr_rt_entries]; 350 - 351 - new->nr_rt_entries = nr_rt_entries; 352 - for (i = 0; i < 3; i++) 353 - for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++) 354 - new->chip[i][j] = -1; 355 - 356 - for (i = 0; i < nr; ++i) { 357 - r = -EINVAL; 358 - if (ue->flags) 359 - goto out; 360 - r = setup_routing_entry(new, &new->rt_entries[i], ue); 361 - if (r) 362 - goto out; 363 - ++ue; 364 - } 365 - 366 - mutex_lock(&kvm->irq_lock); 367 - old = kvm->irq_routing; 368 - kvm_irq_routing_update(kvm, new); 369 - mutex_unlock(&kvm->irq_lock); 370 - 371 - synchronize_rcu(); 372 - 373 - new = old; 374 - r = 0; 375 - 376 - out: 377 - kfree(new); 378 445 return r; 379 446 } 380 447
+237
virt/kvm/irqchip.c
··· 1 + /* 2 + * irqchip.c: Common API for in kernel interrupt controllers 3 + * Copyright (c) 2007, Intel Corporation. 4 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 5 + * Copyright (c) 2013, Alexander Graf <agraf@suse.de> 6 + * 7 + * This program is free software; you can redistribute it and/or modify it 8 + * under the terms and conditions of the GNU General Public License, 9 + * version 2, as published by the Free Software Foundation. 10 + * 11 + * This program is distributed in the hope it will be useful, but WITHOUT 12 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 + * more details. 15 + * 16 + * You should have received a copy of the GNU General Public License along with 17 + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple 18 + * Place - Suite 330, Boston, MA 02111-1307 USA. 19 + * 20 + * This file is derived from virt/kvm/irq_comm.c. 
21 + * 22 + * Authors: 23 + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> 24 + * Alexander Graf <agraf@suse.de> 25 + */ 26 + 27 + #include <linux/kvm_host.h> 28 + #include <linux/slab.h> 29 + #include <linux/export.h> 30 + #include <trace/events/kvm.h> 31 + #include "irq.h" 32 + 33 + bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) 34 + { 35 + struct kvm_irq_ack_notifier *kian; 36 + int gsi; 37 + 38 + rcu_read_lock(); 39 + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 40 + if (gsi != -1) 41 + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 42 + link) 43 + if (kian->gsi == gsi) { 44 + rcu_read_unlock(); 45 + return true; 46 + } 47 + 48 + rcu_read_unlock(); 49 + 50 + return false; 51 + } 52 + EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); 53 + 54 + void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 55 + { 56 + struct kvm_irq_ack_notifier *kian; 57 + int gsi; 58 + 59 + trace_kvm_ack_irq(irqchip, pin); 60 + 61 + rcu_read_lock(); 62 + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 63 + if (gsi != -1) 64 + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 65 + link) 66 + if (kian->gsi == gsi) 67 + kian->irq_acked(kian); 68 + rcu_read_unlock(); 69 + } 70 + 71 + void kvm_register_irq_ack_notifier(struct kvm *kvm, 72 + struct kvm_irq_ack_notifier *kian) 73 + { 74 + mutex_lock(&kvm->irq_lock); 75 + hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); 76 + mutex_unlock(&kvm->irq_lock); 77 + #ifdef __KVM_HAVE_IOAPIC 78 + kvm_vcpu_request_scan_ioapic(kvm); 79 + #endif 80 + } 81 + 82 + void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 83 + struct kvm_irq_ack_notifier *kian) 84 + { 85 + mutex_lock(&kvm->irq_lock); 86 + hlist_del_init_rcu(&kian->link); 87 + mutex_unlock(&kvm->irq_lock); 88 + synchronize_rcu(); 89 + #ifdef __KVM_HAVE_IOAPIC 90 + kvm_vcpu_request_scan_ioapic(kvm); 91 + #endif 92 + } 93 + 94 + int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) 95 + 
{ 96 + struct kvm_kernel_irq_routing_entry route; 97 + 98 + if (!irqchip_in_kernel(kvm) || msi->flags != 0) 99 + return -EINVAL; 100 + 101 + route.msi.address_lo = msi->address_lo; 102 + route.msi.address_hi = msi->address_hi; 103 + route.msi.data = msi->data; 104 + 105 + return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); 106 + } 107 + 108 + /* 109 + * Return value: 110 + * < 0 Interrupt was ignored (masked or not delivered for other reasons) 111 + * = 0 Interrupt was coalesced (previous irq is still pending) 112 + * > 0 Number of CPUs interrupt was delivered to 113 + */ 114 + int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, 115 + bool line_status) 116 + { 117 + struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; 118 + int ret = -1, i = 0; 119 + struct kvm_irq_routing_table *irq_rt; 120 + 121 + trace_kvm_set_irq(irq, level, irq_source_id); 122 + 123 + /* Not possible to detect if the guest uses the PIC or the 124 + * IOAPIC. So set the bit in both. The guest will ignore 125 + * writes to the unused one. 126 + */ 127 + rcu_read_lock(); 128 + irq_rt = rcu_dereference(kvm->irq_routing); 129 + if (irq < irq_rt->nr_rt_entries) 130 + hlist_for_each_entry(e, &irq_rt->map[irq], link) 131 + irq_set[i++] = *e; 132 + rcu_read_unlock(); 133 + 134 + while(i--) { 135 + int r; 136 + r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, 137 + line_status); 138 + if (r < 0) 139 + continue; 140 + 141 + ret = r + ((ret < 0) ? 0 : ret); 142 + } 143 + 144 + return ret; 145 + } 146 + 147 + void kvm_free_irq_routing(struct kvm *kvm) 148 + { 149 + /* Called only during vm destruction. 
Nobody can use the pointer 150 + at this stage */ 151 + kfree(kvm->irq_routing); 152 + } 153 + 154 + static int setup_routing_entry(struct kvm_irq_routing_table *rt, 155 + struct kvm_kernel_irq_routing_entry *e, 156 + const struct kvm_irq_routing_entry *ue) 157 + { 158 + int r = -EINVAL; 159 + struct kvm_kernel_irq_routing_entry *ei; 160 + 161 + /* 162 + * Do not allow GSI to be mapped to the same irqchip more than once. 163 + * Allow only one to one mapping between GSI and MSI. 164 + */ 165 + hlist_for_each_entry(ei, &rt->map[ue->gsi], link) 166 + if (ei->type == KVM_IRQ_ROUTING_MSI || 167 + ue->type == KVM_IRQ_ROUTING_MSI || 168 + ue->u.irqchip.irqchip == ei->irqchip.irqchip) 169 + return r; 170 + 171 + e->gsi = ue->gsi; 172 + e->type = ue->type; 173 + r = kvm_set_routing_entry(rt, e, ue); 174 + if (r) 175 + goto out; 176 + 177 + hlist_add_head(&e->link, &rt->map[e->gsi]); 178 + r = 0; 179 + out: 180 + return r; 181 + } 182 + 183 + int kvm_set_irq_routing(struct kvm *kvm, 184 + const struct kvm_irq_routing_entry *ue, 185 + unsigned nr, 186 + unsigned flags) 187 + { 188 + struct kvm_irq_routing_table *new, *old; 189 + u32 i, j, nr_rt_entries = 0; 190 + int r; 191 + 192 + for (i = 0; i < nr; ++i) { 193 + if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) 194 + return -EINVAL; 195 + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); 196 + } 197 + 198 + nr_rt_entries += 1; 199 + 200 + new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) 201 + + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), 202 + GFP_KERNEL); 203 + 204 + if (!new) 205 + return -ENOMEM; 206 + 207 + new->rt_entries = (void *)&new->map[nr_rt_entries]; 208 + 209 + new->nr_rt_entries = nr_rt_entries; 210 + for (i = 0; i < KVM_NR_IRQCHIPS; i++) 211 + for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) 212 + new->chip[i][j] = -1; 213 + 214 + for (i = 0; i < nr; ++i) { 215 + r = -EINVAL; 216 + if (ue->flags) 217 + goto out; 218 + r = setup_routing_entry(new, &new->rt_entries[i], ue); 219 + if (r) 220 + goto 
out; 221 + ++ue; 222 + } 223 + 224 + mutex_lock(&kvm->irq_lock); 225 + old = kvm->irq_routing; 226 + kvm_irq_routing_update(kvm, new); 227 + mutex_unlock(&kvm->irq_lock); 228 + 229 + synchronize_rcu(); 230 + 231 + new = old; 232 + r = 0; 233 + 234 + out: 235 + kfree(new); 236 + return r; 237 + }
+211 -47
virt/kvm/kvm_main.c
···
217 217 	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218 218 }
219 219
220   - void kvm_make_update_eoibitmap_request(struct kvm *kvm)
220   + void kvm_make_scan_ioapic_request(struct kvm *kvm)
221 221 {
222   - 	make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP);
222   + 	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
223 223 }
224 224
225 225 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
···
244 244
245 245 	kvm_vcpu_set_in_spin_loop(vcpu, false);
246 246 	kvm_vcpu_set_dy_eligible(vcpu, false);
247   + 	vcpu->preempted = false;
247 248
248 249 	r = kvm_arch_vcpu_init(vcpu);
249 250 	if (r < 0)
···
504 503 	mutex_init(&kvm->irq_lock);
505 504 	mutex_init(&kvm->slots_lock);
506 505 	atomic_set(&kvm->users_count, 1);
506   + 	INIT_LIST_HEAD(&kvm->devices);
507 507
508 508 	r = kvm_init_mmu_notifier(kvm);
509 509 	if (r)
···
582 580 	kfree(kvm->memslots);
583 581 }
584 582
583   + static void kvm_destroy_devices(struct kvm *kvm)
584   + {
585   + 	struct list_head *node, *tmp;
586   +
587   + 	list_for_each_safe(node, tmp, &kvm->devices) {
588   + 		struct kvm_device *dev =
589   + 			list_entry(node, struct kvm_device, vm_node);
590   +
591   + 		list_del(node);
592   + 		dev->ops->destroy(dev);
593   + 	}
594   + }
595   +
585 596 static void kvm_destroy_vm(struct kvm *kvm)
586 597 {
587 598 	int i;
···
614 599 	kvm_arch_flush_shadow_all(kvm);
615 600 #endif
616 601 	kvm_arch_destroy_vm(kvm);
602   + 	kvm_destroy_devices(kvm);
617 603 	kvm_free_physmem(kvm);
618 604 	cleanup_srcu_struct(&kvm->srcu);
619 605 	kvm_arch_free_vm(kvm);
···
735 719 }
736 720
737 721 /*
738   -  * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
739   -  * - create a new memory slot
740   -  * - delete an existing memory slot
741   -  * - modify an existing memory slot
742   -  *   -- move it in the guest physical memory space
743   -  *   -- just change its flags
744   -  *
745   -  * Since flags can be changed by some of these operations, the following
746   -  * differentiation is the best we can do for __kvm_set_memory_region():
747   -  */
748   - enum kvm_mr_change {
749   - 	KVM_MR_CREATE,
750   - 	KVM_MR_DELETE,
751   - 	KVM_MR_MOVE,
752   - 	KVM_MR_FLAGS_ONLY,
753   - };
754   -
755   - /*
756 722  * Allocate some memory and give it an address in the guest physical address
757 723  * space.
758 724  *
···
743 745  * Must be called holding mmap_sem for write.
744 746  */
745 747 int __kvm_set_memory_region(struct kvm *kvm,
746   - 			    struct kvm_userspace_memory_region *mem,
747   - 			    bool user_alloc)
748   + 			    struct kvm_userspace_memory_region *mem)
748 749 {
749 750 	int r;
750 751 	gfn_t base_gfn;
···
764 767 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
765 768 		goto out;
766 769 	/* We can read the guest memory with __xxx_user() later on. */
767   - 	if (user_alloc &&
770   + 	if ((mem->slot < KVM_USER_MEM_SLOTS) &&
768 771 	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
769 772 	     !access_ok(VERIFY_WRITE,
770 773 			(void __user *)(unsigned long)mem->userspace_addr,
···
872 875 		slots = old_memslots;
873 876 	}
874 877
875   - 	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
878   + 	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
876 879 	if (r)
877 880 		goto out_slots;
···
912 915
913 916 	old_memslots = install_new_memslots(kvm, slots, &new);
914 917
915   - 	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
918   + 	kvm_arch_commit_memory_region(kvm, mem, &old, change);
916 919
917 920 	kvm_free_physmem_slot(&old, &new);
918 921 	kfree(old_memslots);
···
929 932 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
930 933
931 934 int kvm_set_memory_region(struct kvm *kvm,
932   - 			  struct kvm_userspace_memory_region *mem,
933   - 			  bool user_alloc)
935   + 			  struct kvm_userspace_memory_region *mem)
934 936 {
935 937 	int r;
936 938
937 939 	mutex_lock(&kvm->slots_lock);
938   - 	r = __kvm_set_memory_region(kvm, mem, user_alloc);
940   + 	r = __kvm_set_memory_region(kvm, mem);
939 941 	mutex_unlock(&kvm->slots_lock);
940 942 	return r;
941 943 }
942 944 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
943 945
944 946 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
945   - 				   struct
946   - 				   kvm_userspace_memory_region *mem,
947   - 				   bool user_alloc)
947   + 				   struct kvm_userspace_memory_region *mem)
948 948 {
949 949 	if (mem->slot >= KVM_USER_MEM_SLOTS)
950 950 		return -EINVAL;
951   - 	return kvm_set_memory_region(kvm, mem, user_alloc);
951   + 	return kvm_set_memory_region(kvm, mem);
952 952 }
953 953
954 954 int kvm_get_dirty_log(struct kvm *kvm,
···
1093 1099 	return __copy_from_user_inatomic(data, hva, len);
1094 1100 }
1095 1101
1096    - int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1102    + static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1097 1103 	unsigned long start, int write, struct page **page)
1098 1104 {
1099 1105 	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
···
1713 1719 	smp_send_reschedule(cpu);
1714 1720 	put_cpu();
1715 1721 }
1722    + EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
1716 1723 #endif /* !CONFIG_S390 */
1717 1724
1718 1725 void kvm_resched(struct kvm_vcpu *vcpu)
···
1811 1816 				continue;
1812 1817 			} else if (pass && i > last_boosted_vcpu)
1813 1818 				break;
1819    + 			if (!ACCESS_ONCE(vcpu->preempted))
1820    + 				continue;
1814 1821 			if (vcpu == me)
1815 1822 				continue;
1816 1823 			if (waitqueue_active(&vcpu->wq))
···
2201 2204 }
2202 2205 #endif
2203 2206
2207    + static int kvm_device_ioctl_attr(struct kvm_device *dev,
2208    + 				 int (*accessor)(struct kvm_device *dev,
2209    + 						 struct kvm_device_attr *attr),
2210    + 				 unsigned long arg)
2211    + {
2212    + 	struct kvm_device_attr attr;
2213    +
2214    + 	if (!accessor)
2215    + 		return -EPERM;
2216    +
2217    + 	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
2218    + 		return -EFAULT;
2219    +
2220    + 	return accessor(dev, &attr);
2221    + }
2222    +
2223    + static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
2224    + 			     unsigned long arg)
2225    + {
2226    + 	struct kvm_device *dev = filp->private_data;
2227    +
2228    + 	switch (ioctl) {
2229    + 	case KVM_SET_DEVICE_ATTR:
2230    + 		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
2231    + 	case KVM_GET_DEVICE_ATTR:
2232    + 		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
2233    + 	case KVM_HAS_DEVICE_ATTR:
2234    + 		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
2235    + 	default:
2236    + 		if (dev->ops->ioctl)
2237    + 			return dev->ops->ioctl(dev, ioctl, arg);
2238    +
2239    + 		return -ENOTTY;
2240    + 	}
2241    + }
2242    +
2243    + static int kvm_device_release(struct inode *inode, struct file *filp)
2244    + {
2245    + 	struct kvm_device *dev = filp->private_data;
2246    + 	struct kvm *kvm = dev->kvm;
2247    +
2248    + 	kvm_put_kvm(kvm);
2249    + 	return 0;
2250    + }
2251    +
2252    + static const struct file_operations kvm_device_fops = {
2253    + 	.unlocked_ioctl = kvm_device_ioctl,
2254    + #ifdef CONFIG_COMPAT
2255    + 	.compat_ioctl = kvm_device_ioctl,
2256    + #endif
2257    + 	.release = kvm_device_release,
2258    + };
2259    +
2260    + struct kvm_device *kvm_device_from_filp(struct file *filp)
2261    + {
2262    + 	if (filp->f_op != &kvm_device_fops)
2263    + 		return NULL;
2264    +
2265    + 	return filp->private_data;
2266    + }
2267    +
2268    + static int kvm_ioctl_create_device(struct kvm *kvm,
2269    + 				   struct kvm_create_device *cd)
2270    + {
2271    + 	struct kvm_device_ops *ops = NULL;
2272    + 	struct kvm_device *dev;
2273    + 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
2274    + 	int ret;
2275    +
2276    + 	switch (cd->type) {
2277    + #ifdef CONFIG_KVM_MPIC
2278    + 	case KVM_DEV_TYPE_FSL_MPIC_20:
2279    + 	case KVM_DEV_TYPE_FSL_MPIC_42:
2280    + 		ops = &kvm_mpic_ops;
2281    + 		break;
2282    + #endif
2283    + #ifdef CONFIG_KVM_XICS
2284    + 	case KVM_DEV_TYPE_XICS:
2285    + 		ops = &kvm_xics_ops;
2286    + 		break;
2287    + #endif
2288    + 	default:
2289    + 		return -ENODEV;
2290    + 	}
2291    +
2292    + 	if (test)
2293    + 		return 0;
2294    +
2295    + 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2296    + 	if (!dev)
2297    + 		return -ENOMEM;
2298    +
2299    + 	dev->ops = ops;
2300    + 	dev->kvm = kvm;
2301    +
2302    + 	ret = ops->create(dev, cd->type);
2303    + 	if (ret < 0) {
2304    + 		kfree(dev);
2305    + 		return ret;
2306    + 	}
2307    +
2308    + 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
2309    + 	if (ret < 0) {
2310    + 		ops->destroy(dev);
2311    + 		return ret;
2312    + 	}
2313    +
2314    + 	list_add(&dev->vm_node, &kvm->devices);
2315    + 	kvm_get_kvm(kvm);
2316    + 	cd->fd = ret;
2317    + 	return 0;
2318    + }
2319    +
2204 2320 static long kvm_vm_ioctl(struct file *filp,
2205 2321 			 unsigned int ioctl, unsigned long arg)
···
2335 2225 				   sizeof kvm_userspace_mem))
2336 2226 			goto out;
2337 2227
2338    - 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true);
2228    + 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
2339 2229 		break;
2340 2230 	}
2341 2231 	case KVM_GET_DIRTY_LOG: {
···
2414 2304 		if (copy_from_user(&irq_event, argp, sizeof irq_event))
2415 2305 			goto out;
2416 2306
2417    - 		r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
2307    + 		r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
2308    + 					  ioctl == KVM_IRQ_LINE_STATUS);
2418 2309 		if (r)
2419 2310 			goto out;
···
2429 2318 		break;
2430 2319 	}
2431 2320 #endif
2321    + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
2322    + 	case KVM_SET_GSI_ROUTING: {
2323    + 		struct kvm_irq_routing routing;
2324    + 		struct kvm_irq_routing __user *urouting;
2325    + 		struct kvm_irq_routing_entry *entries;
2326    +
2327    + 		r = -EFAULT;
2328    + 		if (copy_from_user(&routing, argp, sizeof(routing)))
2329    + 			goto out;
2330    + 		r = -EINVAL;
2331    + 		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
2332    + 			goto out;
2333    + 		if (routing.flags)
2334    + 			goto out;
2335    + 		r = -ENOMEM;
2336    + 		entries = vmalloc(routing.nr * sizeof(*entries));
2337    + 		if (!entries)
2338    + 			goto out;
2339    + 		r = -EFAULT;
2340    + 		urouting = argp;
2341    + 		if (copy_from_user(entries, urouting->entries,
2342    + 				   routing.nr * sizeof(*entries)))
2343    + 			goto out_free_irq_routing;
2344    + 		r = kvm_set_irq_routing(kvm, entries, routing.nr,
2345    + 					routing.flags);
2346    + 	out_free_irq_routing:
2347    + 		vfree(entries);
2348    + 		break;
2349    + 	}
2350    + #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
2351    + 	case KVM_CREATE_DEVICE: {
2352    + 		struct kvm_create_device cd;
2353    +
2354    + 		r = -EFAULT;
2355    + 		if (copy_from_user(&cd, argp, sizeof(cd)))
2356    + 			goto out;
2357    +
2358    + 		r = kvm_ioctl_create_device(kvm, &cd);
2359    + 		if (r)
2360    + 			goto out;
2361    +
2362    + 		r = -EFAULT;
2363    + 		if (copy_to_user(argp, &cd, sizeof(cd)))
2364    + 			goto out;
2365    +
2366    + 		r = 0;
2367    + 		break;
2368    + 	}
2432 2369 	default:
2433 2370 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2434 2371 		if (r == -ENOTTY)
···
2606 2447 #ifdef CONFIG_HAVE_KVM_MSI
2607 2448 	case KVM_CAP_SIGNAL_MSI:
2608 2449 #endif
2450    + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
2451    + 	case KVM_CAP_IRQFD_RESAMPLE:
2452    + #endif
2609 2453 		return 1;
2610    - #ifdef KVM_CAP_IRQ_ROUTING
2454    + #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
2611 2455 	case KVM_CAP_IRQ_ROUTING:
2612 2456 		return KVM_MAX_IRQ_ROUTES;
2613 2457 #endif
···
2780 2618 	return NOTIFY_OK;
2781 2619 }
2782 2620
2783    -
2784    - asmlinkage void kvm_spurious_fault(void)
2785    - {
2786    - 	/* Fault while not rebooting. We want the trace. */
2787    - 	BUG();
2788    - }
2789    - EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2790    -
2791 2621 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2792 2622 		      void *v)
···
2812 2658 	kfree(bus);
2813 2659 }
2814 2660
2815    - int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2661    + static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2816 2662 {
2817 2663 	const struct kvm_io_range *r1 = p1;
2818 2664 	const struct kvm_io_range *r2 = p2;
···
2824 2670 	return 0;
2825 2671 }
2826 2672
2827    - int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2673    + static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2828 2674 			  gpa_t addr, int len)
2829 2675 {
2830 2676 	bus->range[bus->dev_count++] = (struct kvm_io_range) {
···
2839 2685 	return 0;
2840 2686 }
2841 2687
2842    - int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2688    + static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2843 2689 			     gpa_t addr, int len)
2844 2690 {
2845 2691 	struct kvm_io_range *range, key;
···
3083 2929 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3084 2930 {
3085 2931 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2932    + 	if (vcpu->preempted)
2933    + 		vcpu->preempted = false;
3086 2934
3087 2935 	kvm_arch_vcpu_load(vcpu, cpu);
}
···
3094 2938 {
3095 2939 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3096 2940
2941    + 	if (current->state == TASK_RUNNING)
2942    + 		vcpu->preempted = true;
3097 2943 	kvm_arch_vcpu_put(vcpu);
3098 2944 }
···
3105 2947 	int r;
3106 2948 	int cpu;
3107 2949
2950    + 	r = kvm_irqfd_init();
2951    + 	if (r)
2952    + 		goto out_irqfd;
3108 2953 	r = kvm_arch_init(opaque);
3109 2954 	if (r)
3110 2955 		goto out_fail;
···
3188 3027 out_free_0:
3189 3028 	kvm_arch_exit();
3190 3029 out_fail:
3030    + 	kvm_irqfd_exit();
3031    + out_irqfd:
3191 3032 	return r;
3192 3033 }
3193 3034 EXPORT_SYMBOL_GPL(kvm_init);
···
3206 3043 	on_each_cpu(hardware_disable_nolock, NULL, 1);
3207 3044 	kvm_arch_hardware_unsetup();
3208 3045 	kvm_arch_exit();
3046    + 	kvm_irqfd_exit();
3209 3047 	free_cpumask_var(cpus_hardware_enabled);
3210 3048 }
3211 3049 EXPORT_SYMBOL_GPL(kvm_exit);