Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm into next

Pull KVM updates from Paolo Bonzini:
"At over 200 commits, covering almost all supported architectures, this
was a pretty active cycle for KVM. Changes include:

- a lot of s390 changes: optimizations, support for migration, GDB
support and more

- ARM changes are pretty small: support for the PSCI 0.2 hypercall
interface on both the guest and the host (the latter acked by
Catalin)

- initial POWER8 and little-endian host support

- support for running u-boot on embedded POWER targets

- pretty large changes to MIPS too, completing the userspace
interface and improving the handling of virtualized timer hardware

- for x86, a larger set of changes is scheduled for 3.17. Still, we
have a few emulator bugfixes and support for running nested
fully-virtualized Xen guests (para-virtualized Xen guests have
always worked). And some optimizations too.

The only missing architecture here is ia64. It's not a coincidence
that support for KVM on ia64 is scheduled for removal in 3.17"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (203 commits)
KVM: add missing cleanup_srcu_struct
KVM: PPC: Book3S PR: Rework SLB switching code
KVM: PPC: Book3S PR: Use SLB entry 0
KVM: PPC: Book3S HV: Fix machine check delivery to guest
KVM: PPC: Book3S HV: Work around POWER8 performance monitor bugs
KVM: PPC: Book3S HV: Make sure we don't miss dirty pages
KVM: PPC: Book3S HV: Fix dirty map for hugepages
KVM: PPC: Book3S HV: Put huge-page HPTEs in rmap chain for base address
KVM: PPC: Book3S HV: Fix check for running inside guest in global_invalidates()
KVM: PPC: Book3S: Move KVM_REG_PPC_WORT to an unused register number
KVM: PPC: Book3S: Add ONE_REG register names that were missed
KVM: PPC: Add CAP to indicate hcall fixes
KVM: PPC: MPIC: Reset IRQ source private members
KVM: PPC: Graciously fail broken LE hypercalls
PPC: ePAPR: Fix hypercall on LE guest
KVM: PPC: BOOK3S: Remove open coded make_dsisr in alignment handler
KVM: PPC: BOOK3S: Always use the saved DAR value
PPC: KVM: Make NX bit available with magic page
KVM: PPC: Disable NX for old magic page using guests
KVM: PPC: BOOK3S: HV: Add mixed page-size support for guest
...

+7185 -1753
+36 -1
Documentation/devicetree/bindings/arm/psci.txt
··· 21 21 22 22 Main node required properties: 23 23 24 - - compatible : Must be "arm,psci" 24 + - compatible : should contain at least one of: 25 + 26 + * "arm,psci" : for implementations complying to PSCI versions prior to 27 + 0.2. For these cases function IDs must be provided. 28 + 29 + * "arm,psci-0.2" : for implementations complying to PSCI 0.2. Function 30 + IDs are not required and should be ignored by an OS with PSCI 0.2 31 + support, but are permitted to be present for compatibility with 32 + existing software when "arm,psci" is later in the compatible list. 25 33 26 34 - method : The method of calling the PSCI firmware. Permitted 27 35 values are: ··· 53 45 54 46 Example: 55 47 48 + Case 1: PSCI v0.1 only. 49 + 56 50 psci { 57 51 compatible = "arm,psci"; 58 52 method = "smc"; ··· 62 52 cpu_off = <0x95c10001>; 63 53 cpu_on = <0x95c10002>; 64 54 migrate = <0x95c10003>; 55 + }; 56 + 57 + 58 + Case 2: PSCI v0.2 only 59 + 60 + psci { 61 + compatible = "arm,psci-0.2"; 62 + method = "smc"; 63 + }; 64 + 65 + Case 3: PSCI v0.2 and PSCI v0.1. 66 + 67 + A DTB may provide IDs for use by kernels without PSCI 0.2 support, 68 + enabling firmware and hypervisors to support existing and new kernels. 69 + These IDs will be ignored by kernels with PSCI 0.2 support, which will 70 + use the standard PSCI 0.2 IDs exclusively. 71 + 72 + psci { 73 + compatible = "arm,psci-0.2", "arm,psci"; 74 + method = "hvc"; 75 + 76 + cpu_on = < arbitrary value >; 77 + cpu_off = < arbitrary value >; 78 + 79 + ... 65 80 };
+29 -4
Documentation/virtual/kvm/api.txt
··· 1794 1794 PPC | KVM_REG_PPC_MMCR0 | 64 1795 1795 PPC | KVM_REG_PPC_MMCR1 | 64 1796 1796 PPC | KVM_REG_PPC_MMCRA | 64 1797 + PPC | KVM_REG_PPC_MMCR2 | 64 1798 + PPC | KVM_REG_PPC_MMCRS | 64 1799 + PPC | KVM_REG_PPC_SIAR | 64 1800 + PPC | KVM_REG_PPC_SDAR | 64 1801 + PPC | KVM_REG_PPC_SIER | 64 1797 1802 PPC | KVM_REG_PPC_PMC1 | 32 1798 1803 PPC | KVM_REG_PPC_PMC2 | 32 1799 1804 PPC | KVM_REG_PPC_PMC3 | 32 ··· 1873 1868 PPC | KVM_REG_PPC_PPR | 64 1874 1869 PPC | KVM_REG_PPC_ARCH_COMPAT 32 1875 1870 PPC | KVM_REG_PPC_DABRX | 32 1871 + PPC | KVM_REG_PPC_WORT | 64 1876 1872 PPC | KVM_REG_PPC_TM_GPR0 | 64 1877 1873 ... 1878 1874 PPC | KVM_REG_PPC_TM_GPR31 | 64 ··· 2217 2211 KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm 2218 2212 KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm 2219 2213 KVM_S390_RESTART (vcpu) - restart 2214 + KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt 2215 + KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt 2220 2216 KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt 2221 2217 parameters in parm and parm64 2222 2218 KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm ··· 2322 2314 2323 2315 4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR 2324 2316 2325 - Capability: KVM_CAP_DEVICE_CTRL 2326 - Type: device ioctl 2317 + Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device 2318 + Type: device ioctl, vm ioctl 2327 2319 Parameters: struct kvm_device_attr 2328 2320 Returns: 0 on success, -1 on error 2329 2321 Errors: ··· 2348 2340 2349 2341 4.81 KVM_HAS_DEVICE_ATTR 2350 2342 2351 - Capability: KVM_CAP_DEVICE_CTRL 2352 - Type: device ioctl 2343 + Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device 2344 + Type: device ioctl, vm ioctl 2353 2345 Parameters: struct kvm_device_attr 2354 2346 Returns: 0 on success, -1 on error 2355 2347 Errors: ··· 2384 2376 Depends on KVM_CAP_ARM_PSCI. 
2385 2377 - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode. 2386 2378 Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). 2379 + - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 for the CPU. 2380 + Depends on KVM_CAP_ARM_PSCI_0_2. 2387 2381 2388 2382 2389 2383 4.83 KVM_ARM_PREFERRED_TARGET ··· 2747 2737 It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an 2748 2738 external interrupt has just been delivered into the guest. User space 2749 2739 should put the acknowledged interrupt vector into the 'epr' field. 2740 + 2741 + /* KVM_EXIT_SYSTEM_EVENT */ 2742 + struct { 2743 + #define KVM_SYSTEM_EVENT_SHUTDOWN 1 2744 + #define KVM_SYSTEM_EVENT_RESET 2 2745 + __u32 type; 2746 + __u64 flags; 2747 + } system_event; 2748 + 2749 + If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered 2750 + a system-level event using some architecture specific mechanism (hypercall 2751 + or some special instruction). In case of ARM/ARM64, this is triggered using 2752 + HVC instruction based PSCI call from the vcpu. The 'type' field describes 2753 + the system-level event type. The 'flags' field describes architecture 2754 + specific flags for the system-level event. 2750 2755 2751 2756 /* Fix the size of the union. */ 2752 2757 char padding[256];
+26
Documentation/virtual/kvm/devices/vm.txt
··· 1 + Generic vm interface 2 + ==================================== 3 + 4 + The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, 5 + KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same 6 + struct kvm_device_attr as other devices, but targets VM-wide settings 7 + and controls. 8 + 9 + The groups and attributes per virtual machine, if any, are architecture 10 + specific. 11 + 12 + 1. GROUP: KVM_S390_VM_MEM_CTRL 13 + Architectures: s390 14 + 15 + 1.1. ATTRIBUTE: KVM_S390_VM_MEM_ENABLE_CMMA 16 + Parameters: none 17 + Returns: -EBUSY if a vcpu is already defined, otherwise 0 18 + 19 + Enables CMMA for the virtual machine 20 + 21 + 1.2. ATTRIBUTE: KVM_S390_VM_CLR_CMMA 22 + Parameters: none 23 + Returns: 0 24 + 25 + Clear the CMMA status for all guest pages, so any pages the guest marked 26 + as unused are again used and may not be reclaimed by the host.
+14
Documentation/virtual/kvm/ppc-pv.txt
··· 94 94 The following enhancements to the magic page are currently available: 95 95 96 96 KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page 97 + KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs 97 98 98 99 For enhanced features in the magic page, please check for the existence of the 99 100 feature before using them! 101 + 102 + Magic page flags 103 + ================ 104 + 105 + In addition to features that indicate whether a host is capable of a particular 106 + feature we also have a channel for a guest to tell the host whether it's capable 107 + of something. This is what we call "flags". 108 + 109 + Flags are passed to the host in the low 12 bits of the Effective Address. 110 + 111 + The following flags are currently available for a guest to expose: 112 + 113 + MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page 100 114 101 115 MSR bits 102 116 ========
+2
Documentation/virtual/kvm/s390-diag.txt
··· 78 78 79 79 If the function code specifies 0x501, breakpoint functions may be performed. 80 80 This function code is handled by userspace. 81 + 82 + This diagnose function code has no subfunctions and uses no parameters.
+1 -1
arch/arm/include/asm/kvm_host.h
··· 36 36 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 37 37 #define KVM_HAVE_ONE_REG 38 38 39 - #define KVM_VCPU_MAX_FEATURES 1 39 + #define KVM_VCPU_MAX_FEATURES 2 40 40 41 41 #include <kvm/arm_vgic.h> 42 42
+5 -1
arch/arm/include/asm/kvm_psci.h
··· 18 18 #ifndef __ARM_KVM_PSCI_H__ 19 19 #define __ARM_KVM_PSCI_H__ 20 20 21 - bool kvm_psci_call(struct kvm_vcpu *vcpu); 21 + #define KVM_ARM_PSCI_0_1 1 22 + #define KVM_ARM_PSCI_0_2 2 23 + 24 + int kvm_psci_version(struct kvm_vcpu *vcpu); 25 + int kvm_psci_call(struct kvm_vcpu *vcpu); 22 26 23 27 #endif /* __ARM_KVM_PSCI_H__ */
+5 -2
arch/arm/include/asm/psci.h
··· 29 29 int (*cpu_off)(struct psci_power_state state); 30 30 int (*cpu_on)(unsigned long cpuid, unsigned long entry_point); 31 31 int (*migrate)(unsigned long cpuid); 32 + int (*affinity_info)(unsigned long target_affinity, 33 + unsigned long lowest_affinity_level); 34 + int (*migrate_info_type)(void); 32 35 }; 33 36 34 37 extern struct psci_operations psci_ops; 35 38 extern struct smp_operations psci_smp_ops; 36 39 37 40 #ifdef CONFIG_ARM_PSCI 38 - void psci_init(void); 41 + int psci_init(void); 39 42 bool psci_smp_available(void); 40 43 #else 41 - static inline void psci_init(void) { } 44 + static inline int psci_init(void) { return 0; } 42 45 static inline bool psci_smp_available(void) { return false; } 43 46 #endif 44 47
+6 -4
arch/arm/include/uapi/asm/kvm.h
··· 20 20 #define __ARM_KVM_H__ 21 21 22 22 #include <linux/types.h> 23 + #include <linux/psci.h> 23 24 #include <asm/ptrace.h> 24 25 25 26 #define __KVM_HAVE_GUEST_DEBUG ··· 84 83 #define KVM_VGIC_V2_CPU_SIZE 0x2000 85 84 86 85 #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ 86 + #define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ 87 87 88 88 struct kvm_vcpu_init { 89 89 __u32 target; ··· 203 201 #define KVM_PSCI_FN_CPU_ON KVM_PSCI_FN(2) 204 202 #define KVM_PSCI_FN_MIGRATE KVM_PSCI_FN(3) 205 203 206 - #define KVM_PSCI_RET_SUCCESS 0 207 - #define KVM_PSCI_RET_NI ((unsigned long)-1) 208 - #define KVM_PSCI_RET_INVAL ((unsigned long)-2) 209 - #define KVM_PSCI_RET_DENIED ((unsigned long)-3) 204 + #define KVM_PSCI_RET_SUCCESS PSCI_RET_SUCCESS 205 + #define KVM_PSCI_RET_NI PSCI_RET_NOT_SUPPORTED 206 + #define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS 207 + #define KVM_PSCI_RET_DENIED PSCI_RET_DENIED 210 208 211 209 #endif /* __ARM_KVM_H__ */
+161 -39
arch/arm/kernel/psci.c
··· 17 17 18 18 #include <linux/init.h> 19 19 #include <linux/of.h> 20 + #include <linux/reboot.h> 21 + #include <linux/pm.h> 22 + #include <uapi/linux/psci.h> 20 23 21 24 #include <asm/compiler.h> 22 25 #include <asm/errno.h> 23 26 #include <asm/opcodes-sec.h> 24 27 #include <asm/opcodes-virt.h> 25 28 #include <asm/psci.h> 29 + #include <asm/system_misc.h> 26 30 27 31 struct psci_operations psci_ops; 28 32 29 33 static int (*invoke_psci_fn)(u32, u32, u32, u32); 34 + typedef int (*psci_initcall_t)(const struct device_node *); 30 35 31 36 enum psci_function { 32 37 PSCI_FN_CPU_SUSPEND, 33 38 PSCI_FN_CPU_ON, 34 39 PSCI_FN_CPU_OFF, 35 40 PSCI_FN_MIGRATE, 41 + PSCI_FN_AFFINITY_INFO, 42 + PSCI_FN_MIGRATE_INFO_TYPE, 36 43 PSCI_FN_MAX, 37 44 }; 38 45 39 46 static u32 psci_function_id[PSCI_FN_MAX]; 40 - 41 - #define PSCI_RET_SUCCESS 0 42 - #define PSCI_RET_EOPNOTSUPP -1 43 - #define PSCI_RET_EINVAL -2 44 - #define PSCI_RET_EPERM -3 45 47 46 48 static int psci_to_linux_errno(int errno) 47 49 { 48 50 switch (errno) { 49 51 case PSCI_RET_SUCCESS: 50 52 return 0; 51 - case PSCI_RET_EOPNOTSUPP: 53 + case PSCI_RET_NOT_SUPPORTED: 52 54 return -EOPNOTSUPP; 53 - case PSCI_RET_EINVAL: 55 + case PSCI_RET_INVALID_PARAMS: 54 56 return -EINVAL; 55 - case PSCI_RET_EPERM: 57 + case PSCI_RET_DENIED: 56 58 return -EPERM; 57 59 }; 58 60 59 61 return -EINVAL; 60 62 } 61 63 62 - #define PSCI_POWER_STATE_ID_MASK 0xffff 63 - #define PSCI_POWER_STATE_ID_SHIFT 0 64 - #define PSCI_POWER_STATE_TYPE_MASK 0x1 65 - #define PSCI_POWER_STATE_TYPE_SHIFT 16 66 - #define PSCI_POWER_STATE_AFFL_MASK 0x3 67 - #define PSCI_POWER_STATE_AFFL_SHIFT 24 68 - 69 64 static u32 psci_power_state_pack(struct psci_power_state state) 70 65 { 71 - return ((state.id & PSCI_POWER_STATE_ID_MASK) 72 - << PSCI_POWER_STATE_ID_SHIFT) | 73 - ((state.type & PSCI_POWER_STATE_TYPE_MASK) 74 - << PSCI_POWER_STATE_TYPE_SHIFT) | 75 - ((state.affinity_level & PSCI_POWER_STATE_AFFL_MASK) 76 - << PSCI_POWER_STATE_AFFL_SHIFT); 66 + return 
((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT) 67 + & PSCI_0_2_POWER_STATE_ID_MASK) | 68 + ((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT) 69 + & PSCI_0_2_POWER_STATE_TYPE_MASK) | 70 + ((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT) 71 + & PSCI_0_2_POWER_STATE_AFFL_MASK); 77 72 } 78 73 79 74 /* ··· 103 108 : "r" (arg0), "r" (arg1), "r" (arg2)); 104 109 105 110 return function_id; 111 + } 112 + 113 + static int psci_get_version(void) 114 + { 115 + int err; 116 + 117 + err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); 118 + return err; 106 119 } 107 120 108 121 static int psci_cpu_suspend(struct psci_power_state state, ··· 156 153 return psci_to_linux_errno(err); 157 154 } 158 155 159 - static const struct of_device_id psci_of_match[] __initconst = { 160 - { .compatible = "arm,psci", }, 161 - {}, 162 - }; 163 - 164 - void __init psci_init(void) 156 + static int psci_affinity_info(unsigned long target_affinity, 157 + unsigned long lowest_affinity_level) 165 158 { 166 - struct device_node *np; 159 + int err; 160 + u32 fn; 161 + 162 + fn = psci_function_id[PSCI_FN_AFFINITY_INFO]; 163 + err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0); 164 + return err; 165 + } 166 + 167 + static int psci_migrate_info_type(void) 168 + { 169 + int err; 170 + u32 fn; 171 + 172 + fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE]; 173 + err = invoke_psci_fn(fn, 0, 0, 0); 174 + return err; 175 + } 176 + 177 + static int get_set_conduit_method(struct device_node *np) 178 + { 167 179 const char *method; 168 - u32 id; 169 180 170 - np = of_find_matching_node(NULL, psci_of_match); 171 - if (!np) 172 - return; 173 - 174 - pr_info("probing function IDs from device-tree\n"); 181 + pr_info("probing for conduit method from DT.\n"); 175 182 176 183 if (of_property_read_string(np, "method", &method)) { 177 - pr_warning("missing \"method\" property\n"); 178 - goto out_put_node; 184 + pr_warn("missing \"method\" property\n"); 185 + return -ENXIO; 179 186 } 180 187 181 
188 if (!strcmp("hvc", method)) { ··· 193 180 } else if (!strcmp("smc", method)) { 194 181 invoke_psci_fn = __invoke_psci_fn_smc; 195 182 } else { 196 - pr_warning("invalid \"method\" property: %s\n", method); 197 - goto out_put_node; 183 + pr_warn("invalid \"method\" property: %s\n", method); 184 + return -EINVAL; 198 185 } 186 + return 0; 187 + } 188 + 189 + static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd) 190 + { 191 + invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0); 192 + } 193 + 194 + static void psci_sys_poweroff(void) 195 + { 196 + invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); 197 + } 198 + 199 + /* 200 + * PSCI Function IDs for v0.2+ are well defined so use 201 + * standard values. 202 + */ 203 + static int psci_0_2_init(struct device_node *np) 204 + { 205 + int err, ver; 206 + 207 + err = get_set_conduit_method(np); 208 + 209 + if (err) 210 + goto out_put_node; 211 + 212 + ver = psci_get_version(); 213 + 214 + if (ver == PSCI_RET_NOT_SUPPORTED) { 215 + /* PSCI v0.2 mandates implementation of PSCI_ID_VERSION. 
*/ 216 + pr_err("PSCI firmware does not comply with the v0.2 spec.\n"); 217 + err = -EOPNOTSUPP; 218 + goto out_put_node; 219 + } else { 220 + pr_info("PSCIv%d.%d detected in firmware.\n", 221 + PSCI_VERSION_MAJOR(ver), 222 + PSCI_VERSION_MINOR(ver)); 223 + 224 + if (PSCI_VERSION_MAJOR(ver) == 0 && 225 + PSCI_VERSION_MINOR(ver) < 2) { 226 + err = -EINVAL; 227 + pr_err("Conflicting PSCI version detected.\n"); 228 + goto out_put_node; 229 + } 230 + } 231 + 232 + pr_info("Using standard PSCI v0.2 function IDs\n"); 233 + psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN_CPU_SUSPEND; 234 + psci_ops.cpu_suspend = psci_cpu_suspend; 235 + 236 + psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF; 237 + psci_ops.cpu_off = psci_cpu_off; 238 + 239 + psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN_CPU_ON; 240 + psci_ops.cpu_on = psci_cpu_on; 241 + 242 + psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN_MIGRATE; 243 + psci_ops.migrate = psci_migrate; 244 + 245 + psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN_AFFINITY_INFO; 246 + psci_ops.affinity_info = psci_affinity_info; 247 + 248 + psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] = 249 + PSCI_0_2_FN_MIGRATE_INFO_TYPE; 250 + psci_ops.migrate_info_type = psci_migrate_info_type; 251 + 252 + arm_pm_restart = psci_sys_reset; 253 + 254 + pm_power_off = psci_sys_poweroff; 255 + 256 + out_put_node: 257 + of_node_put(np); 258 + return err; 259 + } 260 + 261 + /* 262 + * PSCI < v0.2 get PSCI Function IDs via DT. 
263 + */ 264 + static int psci_0_1_init(struct device_node *np) 265 + { 266 + u32 id; 267 + int err; 268 + 269 + err = get_set_conduit_method(np); 270 + 271 + if (err) 272 + goto out_put_node; 273 + 274 + pr_info("Using PSCI v0.1 Function IDs from DT\n"); 199 275 200 276 if (!of_property_read_u32(np, "cpu_suspend", &id)) { 201 277 psci_function_id[PSCI_FN_CPU_SUSPEND] = id; ··· 308 206 309 207 out_put_node: 310 208 of_node_put(np); 311 - return; 209 + return err; 210 + } 211 + 212 + static const struct of_device_id psci_of_match[] __initconst = { 213 + { .compatible = "arm,psci", .data = psci_0_1_init}, 214 + { .compatible = "arm,psci-0.2", .data = psci_0_2_init}, 215 + {}, 216 + }; 217 + 218 + int __init psci_init(void) 219 + { 220 + struct device_node *np; 221 + const struct of_device_id *matched_np; 222 + psci_initcall_t init_fn; 223 + 224 + np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np); 225 + if (!np) 226 + return -ENODEV; 227 + 228 + init_fn = (psci_initcall_t)matched_np->data; 229 + return init_fn(np); 312 230 }
+33
arch/arm/kernel/psci_smp.c
··· 16 16 #include <linux/init.h> 17 17 #include <linux/smp.h> 18 18 #include <linux/of.h> 19 + #include <linux/delay.h> 20 + #include <uapi/linux/psci.h> 19 21 20 22 #include <asm/psci.h> 21 23 #include <asm/smp_plat.h> ··· 68 66 /* We should never return */ 69 67 panic("psci: cpu %d failed to shutdown\n", cpu); 70 68 } 69 + 70 + int __ref psci_cpu_kill(unsigned int cpu) 71 + { 72 + int err, i; 73 + 74 + if (!psci_ops.affinity_info) 75 + return 1; 76 + /* 77 + * cpu_kill could race with cpu_die and we can 78 + * potentially end up declaring this cpu undead 79 + * while it is dying. So, try again a few times. 80 + */ 81 + 82 + for (i = 0; i < 10; i++) { 83 + err = psci_ops.affinity_info(cpu_logical_map(cpu), 0); 84 + if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) { 85 + pr_info("CPU%d killed.\n", cpu); 86 + return 1; 87 + } 88 + 89 + msleep(10); 90 + pr_info("Retrying again to check for CPU kill\n"); 91 + } 92 + 93 + pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n", 94 + cpu, err); 95 + /* Make platform_cpu_kill() fail. */ 96 + return 0; 97 + } 98 + 71 99 #endif 72 100 73 101 bool __init psci_smp_available(void) ··· 110 78 .smp_boot_secondary = psci_boot_secondary, 111 79 #ifdef CONFIG_HOTPLUG_CPU 112 80 .cpu_die = psci_cpu_die, 81 + .cpu_kill = psci_cpu_kill, 113 82 #endif 114 83 };
+1
arch/arm/kvm/arm.c
··· 197 197 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 198 198 case KVM_CAP_ONE_REG: 199 199 case KVM_CAP_ARM_PSCI: 200 + case KVM_CAP_ARM_PSCI_0_2: 200 201 r = 1; 201 202 break; 202 203 case KVM_CAP_COALESCED_MMIO:
+7 -3
arch/arm/kvm/handle_exit.c
··· 38 38 39 39 static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) 40 40 { 41 + int ret; 42 + 41 43 trace_kvm_hvc(*vcpu_pc(vcpu), *vcpu_reg(vcpu, 0), 42 44 kvm_vcpu_hvc_get_imm(vcpu)); 43 45 44 - if (kvm_psci_call(vcpu)) 46 + ret = kvm_psci_call(vcpu); 47 + if (ret < 0) { 48 + kvm_inject_undefined(vcpu); 45 49 return 1; 50 + } 46 51 47 - kvm_inject_undefined(vcpu); 48 - return 1; 52 + return ret; 49 53 } 50 54 51 55 static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
+216 -19
arch/arm/kvm/psci.c
··· 27 27 * as described in ARM document number ARM DEN 0022A. 28 28 */ 29 29 30 + #define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) 31 + 32 + static unsigned long psci_affinity_mask(unsigned long affinity_level) 33 + { 34 + if (affinity_level <= 3) 35 + return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); 36 + 37 + return 0; 38 + } 39 + 40 + static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) 41 + { 42 + /* 43 + * NOTE: For simplicity, we make VCPU suspend emulation to be 44 + * same-as WFI (Wait-for-interrupt) emulation. 45 + * 46 + * This means for KVM the wakeup events are interrupts and 47 + * this is consistent with intended use of StateID as described 48 + * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A). 49 + * 50 + * Further, we also treat power-down request to be same as 51 + * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2 52 + * specification (ARM DEN 0022A). This means all suspend states 53 + * for KVM will preserve the register state. 54 + */ 55 + kvm_vcpu_block(vcpu); 56 + 57 + return PSCI_RET_SUCCESS; 58 + } 59 + 30 60 static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) 31 61 { 32 62 vcpu->arch.pause = true; ··· 68 38 struct kvm_vcpu *vcpu = NULL, *tmp; 69 39 wait_queue_head_t *wq; 70 40 unsigned long cpu_id; 41 + unsigned long context_id; 71 42 unsigned long mpidr; 72 43 phys_addr_t target_pc; 73 44 int i; ··· 89 58 * Make sure the caller requested a valid CPU and that the CPU is 90 59 * turned off. 
91 60 */ 92 - if (!vcpu || !vcpu->arch.pause) 93 - return KVM_PSCI_RET_INVAL; 61 + if (!vcpu) 62 + return PSCI_RET_INVALID_PARAMS; 63 + if (!vcpu->arch.pause) { 64 + if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) 65 + return PSCI_RET_ALREADY_ON; 66 + else 67 + return PSCI_RET_INVALID_PARAMS; 68 + } 94 69 95 70 target_pc = *vcpu_reg(source_vcpu, 2); 71 + context_id = *vcpu_reg(source_vcpu, 3); 96 72 97 73 kvm_reset_vcpu(vcpu); 98 74 ··· 114 76 kvm_vcpu_set_be(vcpu); 115 77 116 78 *vcpu_pc(vcpu) = target_pc; 79 + /* 80 + * NOTE: We always update r0 (or x0) because for PSCI v0.1 81 + * the general puspose registers are undefined upon CPU_ON. 82 + */ 83 + *vcpu_reg(vcpu, 0) = context_id; 117 84 vcpu->arch.pause = false; 118 85 smp_mb(); /* Make sure the above is visible */ 119 86 120 87 wq = kvm_arch_vcpu_wq(vcpu); 121 88 wake_up_interruptible(wq); 122 89 123 - return KVM_PSCI_RET_SUCCESS; 90 + return PSCI_RET_SUCCESS; 124 91 } 125 92 126 - /** 127 - * kvm_psci_call - handle PSCI call if r0 value is in range 128 - * @vcpu: Pointer to the VCPU struct 129 - * 130 - * Handle PSCI calls from guests through traps from HVC instructions. 131 - * The calling convention is similar to SMC calls to the secure world where 132 - * the function number is placed in r0 and this function returns true if the 133 - * function number specified in r0 is withing the PSCI range, and false 134 - * otherwise. 
135 - */ 136 - bool kvm_psci_call(struct kvm_vcpu *vcpu) 93 + static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) 94 + { 95 + int i; 96 + unsigned long mpidr; 97 + unsigned long target_affinity; 98 + unsigned long target_affinity_mask; 99 + unsigned long lowest_affinity_level; 100 + struct kvm *kvm = vcpu->kvm; 101 + struct kvm_vcpu *tmp; 102 + 103 + target_affinity = *vcpu_reg(vcpu, 1); 104 + lowest_affinity_level = *vcpu_reg(vcpu, 2); 105 + 106 + /* Determine target affinity mask */ 107 + target_affinity_mask = psci_affinity_mask(lowest_affinity_level); 108 + if (!target_affinity_mask) 109 + return PSCI_RET_INVALID_PARAMS; 110 + 111 + /* Ignore other bits of target affinity */ 112 + target_affinity &= target_affinity_mask; 113 + 114 + /* 115 + * If one or more VCPU matching target affinity are running 116 + * then ON else OFF 117 + */ 118 + kvm_for_each_vcpu(i, tmp, kvm) { 119 + mpidr = kvm_vcpu_get_mpidr(tmp); 120 + if (((mpidr & target_affinity_mask) == target_affinity) && 121 + !tmp->arch.pause) { 122 + return PSCI_0_2_AFFINITY_LEVEL_ON; 123 + } 124 + } 125 + 126 + return PSCI_0_2_AFFINITY_LEVEL_OFF; 127 + } 128 + 129 + static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) 130 + { 131 + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); 132 + vcpu->run->system_event.type = type; 133 + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 134 + } 135 + 136 + static void kvm_psci_system_off(struct kvm_vcpu *vcpu) 137 + { 138 + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN); 139 + } 140 + 141 + static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) 142 + { 143 + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); 144 + } 145 + 146 + int kvm_psci_version(struct kvm_vcpu *vcpu) 147 + { 148 + if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features)) 149 + return KVM_ARM_PSCI_0_2; 150 + 151 + return KVM_ARM_PSCI_0_1; 152 + } 153 + 154 + static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) 155 + { 156 + 
int ret = 1; 157 + unsigned long psci_fn = *vcpu_reg(vcpu, 0) & ~((u32) 0); 158 + unsigned long val; 159 + 160 + switch (psci_fn) { 161 + case PSCI_0_2_FN_PSCI_VERSION: 162 + /* 163 + * Bits[31:16] = Major Version = 0 164 + * Bits[15:0] = Minor Version = 2 165 + */ 166 + val = 2; 167 + break; 168 + case PSCI_0_2_FN_CPU_SUSPEND: 169 + case PSCI_0_2_FN64_CPU_SUSPEND: 170 + val = kvm_psci_vcpu_suspend(vcpu); 171 + break; 172 + case PSCI_0_2_FN_CPU_OFF: 173 + kvm_psci_vcpu_off(vcpu); 174 + val = PSCI_RET_SUCCESS; 175 + break; 176 + case PSCI_0_2_FN_CPU_ON: 177 + case PSCI_0_2_FN64_CPU_ON: 178 + val = kvm_psci_vcpu_on(vcpu); 179 + break; 180 + case PSCI_0_2_FN_AFFINITY_INFO: 181 + case PSCI_0_2_FN64_AFFINITY_INFO: 182 + val = kvm_psci_vcpu_affinity_info(vcpu); 183 + break; 184 + case PSCI_0_2_FN_MIGRATE: 185 + case PSCI_0_2_FN64_MIGRATE: 186 + val = PSCI_RET_NOT_SUPPORTED; 187 + break; 188 + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: 189 + /* 190 + * Trusted OS is MP hence does not require migration 191 + * or 192 + * Trusted OS is not present 193 + */ 194 + val = PSCI_0_2_TOS_MP; 195 + break; 196 + case PSCI_0_2_FN_MIGRATE_INFO_UP_CPU: 197 + case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU: 198 + val = PSCI_RET_NOT_SUPPORTED; 199 + break; 200 + case PSCI_0_2_FN_SYSTEM_OFF: 201 + kvm_psci_system_off(vcpu); 202 + /* 203 + * We should'nt be going back to guest VCPU after 204 + * receiving SYSTEM_OFF request. 205 + * 206 + * If user space accidently/deliberately resumes 207 + * guest VCPU after SYSTEM_OFF request then guest 208 + * VCPU should see internal failure from PSCI return 209 + * value. To achieve this, we preload r0 (or x0) with 210 + * PSCI return value INTERNAL_FAILURE. 211 + */ 212 + val = PSCI_RET_INTERNAL_FAILURE; 213 + ret = 0; 214 + break; 215 + case PSCI_0_2_FN_SYSTEM_RESET: 216 + kvm_psci_system_reset(vcpu); 217 + /* 218 + * Same reason as SYSTEM_OFF for preloading r0 (or x0) 219 + * with PSCI return value INTERNAL_FAILURE. 
220 + */ 221 + val = PSCI_RET_INTERNAL_FAILURE; 222 + ret = 0; 223 + break; 224 + default: 225 + return -EINVAL; 226 + } 227 + 228 + *vcpu_reg(vcpu, 0) = val; 229 + return ret; 230 + } 231 + 232 + static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) 137 233 { 138 234 unsigned long psci_fn = *vcpu_reg(vcpu, 0) & ~((u32) 0); 139 235 unsigned long val; ··· 275 103 switch (psci_fn) { 276 104 case KVM_PSCI_FN_CPU_OFF: 277 105 kvm_psci_vcpu_off(vcpu); 278 - val = KVM_PSCI_RET_SUCCESS; 106 + val = PSCI_RET_SUCCESS; 279 107 break; 280 108 case KVM_PSCI_FN_CPU_ON: 281 109 val = kvm_psci_vcpu_on(vcpu); 282 110 break; 283 111 case KVM_PSCI_FN_CPU_SUSPEND: 284 112 case KVM_PSCI_FN_MIGRATE: 285 - val = KVM_PSCI_RET_NI; 113 + val = PSCI_RET_NOT_SUPPORTED; 286 114 break; 287 - 288 115 default: 289 - return false; 116 + return -EINVAL; 290 117 } 291 118 292 119 *vcpu_reg(vcpu, 0) = val; 293 - return true; 120 + return 1; 121 + } 122 + 123 + /** 124 + * kvm_psci_call - handle PSCI call if r0 value is in range 125 + * @vcpu: Pointer to the VCPU struct 126 + * 127 + * Handle PSCI calls from guests through traps from HVC instructions. 128 + * The calling convention is similar to SMC calls to the secure world 129 + * where the function number is placed in r0. 130 + * 131 + * This function returns: > 0 (success), 0 (success but exit to user 132 + * space), and < 0 (errors) 133 + * 134 + * Errors: 135 + * -EINVAL: Unrecognized PSCI function 136 + */ 137 + int kvm_psci_call(struct kvm_vcpu *vcpu) 138 + { 139 + switch (kvm_psci_version(vcpu)) { 140 + case KVM_ARM_PSCI_0_2: 141 + return kvm_psci_0_2_call(vcpu); 142 + case KVM_ARM_PSCI_0_1: 143 + return kvm_psci_0_1_call(vcpu); 144 + default: 145 + return -EINVAL; 146 + }; 294 147 }
+2
arch/arm64/include/asm/cpu_ops.h
··· 39 39 * from the cpu to be killed. 40 40 * @cpu_die: Makes a cpu leave the kernel. Must not fail. Called from the 41 41 * cpu being killed. 42 + * @cpu_kill: Ensures a cpu has left the kernel. Called from another cpu. 42 43 * @cpu_suspend: Suspends a cpu and saves the required context. May fail owing 43 44 * to wrong parameters or error conditions. Called from the 44 45 * CPU being suspended. Must be called with IRQs disabled. ··· 53 52 #ifdef CONFIG_HOTPLUG_CPU 54 53 int (*cpu_disable)(unsigned int cpu); 55 54 void (*cpu_die)(unsigned int cpu); 55 + int (*cpu_kill)(unsigned int cpu); 56 56 #endif 57 57 #ifdef CONFIG_ARM64_CPU_SUSPEND 58 58 int (*cpu_suspend)(unsigned long);
+1
arch/arm64/include/asm/cputype.h
··· 41 41 42 42 #define ARM_CPU_PART_AEM_V8 0xD0F0 43 43 #define ARM_CPU_PART_FOUNDATION 0xD000 44 + #define ARM_CPU_PART_CORTEX_A53 0xD030 44 45 #define ARM_CPU_PART_CORTEX_A57 0xD070 45 46 46 47 #define APM_CPU_PART_POTENZA 0x0000
+1 -1
arch/arm64/include/asm/kvm_host.h
··· 39 39 #include <kvm/arm_vgic.h> 40 40 #include <kvm/arm_arch_timer.h> 41 41 42 - #define KVM_VCPU_MAX_FEATURES 2 42 + #define KVM_VCPU_MAX_FEATURES 3 43 43 44 44 struct kvm_vcpu; 45 45 int kvm_target_cpu(void);
+5 -1
arch/arm64/include/asm/kvm_psci.h
··· 18 18 #ifndef __ARM64_KVM_PSCI_H__ 19 19 #define __ARM64_KVM_PSCI_H__ 20 20 21 - bool kvm_psci_call(struct kvm_vcpu *vcpu); 21 + #define KVM_ARM_PSCI_0_1 1 22 + #define KVM_ARM_PSCI_0_2 2 23 + 24 + int kvm_psci_version(struct kvm_vcpu *vcpu); 25 + int kvm_psci_call(struct kvm_vcpu *vcpu); 22 26 23 27 #endif /* __ARM64_KVM_PSCI_H__ */
+1 -1
arch/arm64/include/asm/psci.h
··· 14 14 #ifndef __ASM_PSCI_H 15 15 #define __ASM_PSCI_H 16 16 17 - void psci_init(void); 17 + int psci_init(void); 18 18 19 19 #endif /* __ASM_PSCI_H */
+8 -5
arch/arm64/include/uapi/asm/kvm.h
··· 31 31 #define KVM_NR_SPSR 5 32 32 33 33 #ifndef __ASSEMBLY__ 34 + #include <linux/psci.h> 34 35 #include <asm/types.h> 35 36 #include <asm/ptrace.h> 36 37 ··· 57 56 #define KVM_ARM_TARGET_FOUNDATION_V8 1 58 57 #define KVM_ARM_TARGET_CORTEX_A57 2 59 58 #define KVM_ARM_TARGET_XGENE_POTENZA 3 59 + #define KVM_ARM_TARGET_CORTEX_A53 4 60 60 61 - #define KVM_ARM_NUM_TARGETS 4 61 + #define KVM_ARM_NUM_TARGETS 5 62 62 63 63 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ 64 64 #define KVM_ARM_DEVICE_TYPE_SHIFT 0 ··· 79 77 80 78 #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ 81 79 #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ 80 + #define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */ 82 81 83 82 struct kvm_vcpu_init { 84 83 __u32 target; ··· 189 186 #define KVM_PSCI_FN_CPU_ON KVM_PSCI_FN(2) 190 187 #define KVM_PSCI_FN_MIGRATE KVM_PSCI_FN(3) 191 188 192 - #define KVM_PSCI_RET_SUCCESS 0 193 - #define KVM_PSCI_RET_NI ((unsigned long)-1) 194 - #define KVM_PSCI_RET_INVAL ((unsigned long)-2) 195 - #define KVM_PSCI_RET_DENIED ((unsigned long)-3) 189 + #define KVM_PSCI_RET_SUCCESS PSCI_RET_SUCCESS 190 + #define KVM_PSCI_RET_NI PSCI_RET_NOT_SUPPORTED 191 + #define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS 192 + #define KVM_PSCI_RET_DENIED PSCI_RET_DENIED 196 193 197 194 #endif 198 195
+196 -39
arch/arm64/kernel/psci.c
··· 18 18 #include <linux/init.h> 19 19 #include <linux/of.h> 20 20 #include <linux/smp.h> 21 + #include <linux/reboot.h> 22 + #include <linux/pm.h> 23 + #include <linux/delay.h> 24 + #include <uapi/linux/psci.h> 21 25 22 26 #include <asm/compiler.h> 23 27 #include <asm/cpu_ops.h> 24 28 #include <asm/errno.h> 25 29 #include <asm/psci.h> 26 30 #include <asm/smp_plat.h> 31 + #include <asm/system_misc.h> 27 32 28 33 #define PSCI_POWER_STATE_TYPE_STANDBY 0 29 34 #define PSCI_POWER_STATE_TYPE_POWER_DOWN 1 ··· 45 40 int (*cpu_off)(struct psci_power_state state); 46 41 int (*cpu_on)(unsigned long cpuid, unsigned long entry_point); 47 42 int (*migrate)(unsigned long cpuid); 43 + int (*affinity_info)(unsigned long target_affinity, 44 + unsigned long lowest_affinity_level); 45 + int (*migrate_info_type)(void); 48 46 }; 49 47 50 48 static struct psci_operations psci_ops; 51 49 52 50 static int (*invoke_psci_fn)(u64, u64, u64, u64); 51 + typedef int (*psci_initcall_t)(const struct device_node *); 53 52 54 53 enum psci_function { 55 54 PSCI_FN_CPU_SUSPEND, 56 55 PSCI_FN_CPU_ON, 57 56 PSCI_FN_CPU_OFF, 58 57 PSCI_FN_MIGRATE, 58 + PSCI_FN_AFFINITY_INFO, 59 + PSCI_FN_MIGRATE_INFO_TYPE, 59 60 PSCI_FN_MAX, 60 61 }; 61 62 62 63 static u32 psci_function_id[PSCI_FN_MAX]; 63 - 64 - #define PSCI_RET_SUCCESS 0 65 - #define PSCI_RET_EOPNOTSUPP -1 66 - #define PSCI_RET_EINVAL -2 67 - #define PSCI_RET_EPERM -3 68 64 69 65 static int psci_to_linux_errno(int errno) 70 66 { 71 67 switch (errno) { 72 68 case PSCI_RET_SUCCESS: 73 69 return 0; 74 - case PSCI_RET_EOPNOTSUPP: 70 + case PSCI_RET_NOT_SUPPORTED: 75 71 return -EOPNOTSUPP; 76 - case PSCI_RET_EINVAL: 72 + case PSCI_RET_INVALID_PARAMS: 77 73 return -EINVAL; 78 - case PSCI_RET_EPERM: 74 + case PSCI_RET_DENIED: 79 75 return -EPERM; 80 76 }; 81 77 82 78 return -EINVAL; 83 79 } 84 80 85 - #define PSCI_POWER_STATE_ID_MASK 0xffff 86 - #define PSCI_POWER_STATE_ID_SHIFT 0 87 - #define PSCI_POWER_STATE_TYPE_MASK 0x1 88 - #define 
PSCI_POWER_STATE_TYPE_SHIFT 16 89 - #define PSCI_POWER_STATE_AFFL_MASK 0x3 90 - #define PSCI_POWER_STATE_AFFL_SHIFT 24 91 - 92 81 static u32 psci_power_state_pack(struct psci_power_state state) 93 82 { 94 - return ((state.id & PSCI_POWER_STATE_ID_MASK) 95 - << PSCI_POWER_STATE_ID_SHIFT) | 96 - ((state.type & PSCI_POWER_STATE_TYPE_MASK) 97 - << PSCI_POWER_STATE_TYPE_SHIFT) | 98 - ((state.affinity_level & PSCI_POWER_STATE_AFFL_MASK) 99 - << PSCI_POWER_STATE_AFFL_SHIFT); 83 + return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT) 84 + & PSCI_0_2_POWER_STATE_ID_MASK) | 85 + ((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT) 86 + & PSCI_0_2_POWER_STATE_TYPE_MASK) | 87 + ((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT) 88 + & PSCI_0_2_POWER_STATE_AFFL_MASK); 100 89 } 101 90 102 91 /* ··· 125 126 : "r" (arg0), "r" (arg1), "r" (arg2)); 126 127 127 128 return function_id; 129 + } 130 + 131 + static int psci_get_version(void) 132 + { 133 + int err; 134 + 135 + err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); 136 + return err; 128 137 } 129 138 130 139 static int psci_cpu_suspend(struct psci_power_state state, ··· 178 171 return psci_to_linux_errno(err); 179 172 } 180 173 181 - static const struct of_device_id psci_of_match[] __initconst = { 182 - { .compatible = "arm,psci", }, 183 - {}, 184 - }; 185 - 186 - void __init psci_init(void) 174 + static int psci_affinity_info(unsigned long target_affinity, 175 + unsigned long lowest_affinity_level) 187 176 { 188 - struct device_node *np; 177 + int err; 178 + u32 fn; 179 + 180 + fn = psci_function_id[PSCI_FN_AFFINITY_INFO]; 181 + err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0); 182 + return err; 183 + } 184 + 185 + static int psci_migrate_info_type(void) 186 + { 187 + int err; 188 + u32 fn; 189 + 190 + fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE]; 191 + err = invoke_psci_fn(fn, 0, 0, 0); 192 + return err; 193 + } 194 + 195 + static int get_set_conduit_method(struct device_node *np) 196 + { 189 
197 const char *method; 190 - u32 id; 191 198 192 - np = of_find_matching_node(NULL, psci_of_match); 193 - if (!np) 194 - return; 195 - 196 - pr_info("probing function IDs from device-tree\n"); 199 + pr_info("probing for conduit method from DT.\n"); 197 200 198 201 if (of_property_read_string(np, "method", &method)) { 199 - pr_warning("missing \"method\" property\n"); 200 - goto out_put_node; 202 + pr_warn("missing \"method\" property\n"); 203 + return -ENXIO; 201 204 } 202 205 203 206 if (!strcmp("hvc", method)) { ··· 215 198 } else if (!strcmp("smc", method)) { 216 199 invoke_psci_fn = __invoke_psci_fn_smc; 217 200 } else { 218 - pr_warning("invalid \"method\" property: %s\n", method); 219 - goto out_put_node; 201 + pr_warn("invalid \"method\" property: %s\n", method); 202 + return -EINVAL; 220 203 } 204 + return 0; 205 + } 206 + 207 + static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd) 208 + { 209 + invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0); 210 + } 211 + 212 + static void psci_sys_poweroff(void) 213 + { 214 + invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); 215 + } 216 + 217 + /* 218 + * PSCI Function IDs for v0.2+ are well defined so use 219 + * standard values. 220 + */ 221 + static int psci_0_2_init(struct device_node *np) 222 + { 223 + int err, ver; 224 + 225 + err = get_set_conduit_method(np); 226 + 227 + if (err) 228 + goto out_put_node; 229 + 230 + ver = psci_get_version(); 231 + 232 + if (ver == PSCI_RET_NOT_SUPPORTED) { 233 + /* PSCI v0.2 mandates implementation of PSCI_ID_VERSION. 
*/ 234 + pr_err("PSCI firmware does not comply with the v0.2 spec.\n"); 235 + err = -EOPNOTSUPP; 236 + goto out_put_node; 237 + } else { 238 + pr_info("PSCIv%d.%d detected in firmware.\n", 239 + PSCI_VERSION_MAJOR(ver), 240 + PSCI_VERSION_MINOR(ver)); 241 + 242 + if (PSCI_VERSION_MAJOR(ver) == 0 && 243 + PSCI_VERSION_MINOR(ver) < 2) { 244 + err = -EINVAL; 245 + pr_err("Conflicting PSCI version detected.\n"); 246 + goto out_put_node; 247 + } 248 + } 249 + 250 + pr_info("Using standard PSCI v0.2 function IDs\n"); 251 + psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND; 252 + psci_ops.cpu_suspend = psci_cpu_suspend; 253 + 254 + psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF; 255 + psci_ops.cpu_off = psci_cpu_off; 256 + 257 + psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN64_CPU_ON; 258 + psci_ops.cpu_on = psci_cpu_on; 259 + 260 + psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE; 261 + psci_ops.migrate = psci_migrate; 262 + 263 + psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN64_AFFINITY_INFO; 264 + psci_ops.affinity_info = psci_affinity_info; 265 + 266 + psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] = 267 + PSCI_0_2_FN_MIGRATE_INFO_TYPE; 268 + psci_ops.migrate_info_type = psci_migrate_info_type; 269 + 270 + arm_pm_restart = psci_sys_reset; 271 + 272 + pm_power_off = psci_sys_poweroff; 273 + 274 + out_put_node: 275 + of_node_put(np); 276 + return err; 277 + } 278 + 279 + /* 280 + * PSCI < v0.2 get PSCI Function IDs via DT. 
281 + */ 282 + static int psci_0_1_init(struct device_node *np) 283 + { 284 + u32 id; 285 + int err; 286 + 287 + err = get_set_conduit_method(np); 288 + 289 + if (err) 290 + goto out_put_node; 291 + 292 + pr_info("Using PSCI v0.1 Function IDs from DT\n"); 221 293 222 294 if (!of_property_read_u32(np, "cpu_suspend", &id)) { 223 295 psci_function_id[PSCI_FN_CPU_SUSPEND] = id; ··· 330 224 331 225 out_put_node: 332 226 of_node_put(np); 333 - return; 227 + return err; 228 + } 229 + 230 + static const struct of_device_id psci_of_match[] __initconst = { 231 + { .compatible = "arm,psci", .data = psci_0_1_init}, 232 + { .compatible = "arm,psci-0.2", .data = psci_0_2_init}, 233 + {}, 234 + }; 235 + 236 + int __init psci_init(void) 237 + { 238 + struct device_node *np; 239 + const struct of_device_id *matched_np; 240 + psci_initcall_t init_fn; 241 + 242 + np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np); 243 + 244 + if (!np) 245 + return -ENODEV; 246 + 247 + init_fn = (psci_initcall_t)matched_np->data; 248 + return init_fn(np); 334 249 } 335 250 336 251 #ifdef CONFIG_SMP ··· 404 277 405 278 pr_crit("unable to power off CPU%u (%d)\n", cpu, ret); 406 279 } 280 + 281 + static int cpu_psci_cpu_kill(unsigned int cpu) 282 + { 283 + int err, i; 284 + 285 + if (!psci_ops.affinity_info) 286 + return 1; 287 + /* 288 + * cpu_kill could race with cpu_die and we can 289 + * potentially end up declaring this cpu undead 290 + * while it is dying. So, try again a few times. 291 + */ 292 + 293 + for (i = 0; i < 10; i++) { 294 + err = psci_ops.affinity_info(cpu_logical_map(cpu), 0); 295 + if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) { 296 + pr_info("CPU%d killed.\n", cpu); 297 + return 1; 298 + } 299 + 300 + msleep(10); 301 + pr_info("Retrying again to check for CPU kill\n"); 302 + } 303 + 304 + pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n", 305 + cpu, err); 306 + /* Make op_cpu_kill() fail. 
*/ 307 + return 0; 308 + } 407 309 #endif 408 310 409 311 const struct cpu_operations cpu_psci_ops = { ··· 443 287 #ifdef CONFIG_HOTPLUG_CPU 444 288 .cpu_disable = cpu_psci_cpu_disable, 445 289 .cpu_die = cpu_psci_cpu_die, 290 + .cpu_kill = cpu_psci_cpu_kill, 446 291 #endif 447 292 }; 448 293
+22
arch/arm64/kernel/smp.c
··· 228 228 return 0; 229 229 } 230 230 231 + static int op_cpu_kill(unsigned int cpu) 232 + { 233 + /* 234 + * If we have no means of synchronising with the dying CPU, then assume 235 + * that it is really dead. We can only wait for an arbitrary length of 236 + * time and hope that it's dead, so let's skip the wait and just hope. 237 + */ 238 + if (!cpu_ops[cpu]->cpu_kill) 239 + return 1; 240 + 241 + return cpu_ops[cpu]->cpu_kill(cpu); 242 + } 243 + 231 244 static DECLARE_COMPLETION(cpu_died); 232 245 233 246 /* ··· 254 241 return; 255 242 } 256 243 pr_notice("CPU%u: shutdown\n", cpu); 244 + 245 + /* 246 + * Now that the dying CPU is beyond the point of no return w.r.t. 247 + * in-kernel synchronisation, try to get the firwmare to help us to 248 + * verify that it has really left the kernel before we consider 249 + * clobbering anything it might still be using. 250 + */ 251 + if (!op_cpu_kill(cpu)) 252 + pr_warn("CPU%d may not have shut down cleanly\n", cpu); 257 253 } 258 254 259 255 /*
+2
arch/arm64/kvm/guest.c
··· 214 214 return KVM_ARM_TARGET_AEM_V8; 215 215 case ARM_CPU_PART_FOUNDATION: 216 216 return KVM_ARM_TARGET_FOUNDATION_V8; 217 + case ARM_CPU_PART_CORTEX_A53: 218 + return KVM_ARM_TARGET_CORTEX_A53; 217 219 case ARM_CPU_PART_CORTEX_A57: 218 220 return KVM_ARM_TARGET_CORTEX_A57; 219 221 };
+8 -4
arch/arm64/kvm/handle_exit.c
··· 30 30 31 31 static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) 32 32 { 33 - if (kvm_psci_call(vcpu)) 34 - return 1; 33 + int ret; 35 34 36 - kvm_inject_undefined(vcpu); 37 - return 1; 35 + ret = kvm_psci_call(vcpu); 36 + if (ret < 0) { 37 + kvm_inject_undefined(vcpu); 38 + return 1; 39 + } 40 + 41 + return ret; 38 42 } 39 43 40 44 static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
+2
arch/arm64/kvm/sys_regs_generic_v8.c
··· 88 88 &genericv8_target_table); 89 89 kvm_register_target_sys_reg_table(KVM_ARM_TARGET_FOUNDATION_V8, 90 90 &genericv8_target_table); 91 + kvm_register_target_sys_reg_table(KVM_ARM_TARGET_CORTEX_A53, 92 + &genericv8_target_table); 91 93 kvm_register_target_sys_reg_table(KVM_ARM_TARGET_CORTEX_A57, 92 94 &genericv8_target_table); 93 95 kvm_register_target_sys_reg_table(KVM_ARM_TARGET_XGENE_POTENZA,
+6 -6
arch/mips/Kconfig
··· 1756 1756 help 1757 1757 Select this option if building a guest kernel for KVM (Trap & Emulate) mode 1758 1758 1759 - config KVM_HOST_FREQ 1760 - int "KVM Host Processor Frequency (MHz)" 1759 + config KVM_GUEST_TIMER_FREQ 1760 + int "Count/Compare Timer Frequency (MHz)" 1761 1761 depends on KVM_GUEST 1762 - default 500 1762 + default 100 1763 1763 help 1764 - Select this option if building a guest kernel for KVM to skip 1765 - RTC emulation when determining guest CPU Frequency. Instead, the guest 1766 - processor frequency is automatically derived from the host frequency. 1764 + Set this to non-zero if building a guest kernel for KVM to skip RTC 1765 + emulation when determining guest CPU Frequency. Instead, the guest's 1766 + timer frequency is specified directly. 1767 1767 1768 1768 choice 1769 1769 prompt "Kernel page size"
+148 -35
arch/mips/include/asm/kvm_host.h
··· 19 19 #include <linux/threads.h> 20 20 #include <linux/spinlock.h> 21 21 22 + /* MIPS KVM register ids */ 23 + #define MIPS_CP0_32(_R, _S) \ 24 + (KVM_REG_MIPS | KVM_REG_SIZE_U32 | 0x10000 | (8 * (_R) + (_S))) 25 + 26 + #define MIPS_CP0_64(_R, _S) \ 27 + (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0x10000 | (8 * (_R) + (_S))) 28 + 29 + #define KVM_REG_MIPS_CP0_INDEX MIPS_CP0_32(0, 0) 30 + #define KVM_REG_MIPS_CP0_ENTRYLO0 MIPS_CP0_64(2, 0) 31 + #define KVM_REG_MIPS_CP0_ENTRYLO1 MIPS_CP0_64(3, 0) 32 + #define KVM_REG_MIPS_CP0_CONTEXT MIPS_CP0_64(4, 0) 33 + #define KVM_REG_MIPS_CP0_USERLOCAL MIPS_CP0_64(4, 2) 34 + #define KVM_REG_MIPS_CP0_PAGEMASK MIPS_CP0_32(5, 0) 35 + #define KVM_REG_MIPS_CP0_PAGEGRAIN MIPS_CP0_32(5, 1) 36 + #define KVM_REG_MIPS_CP0_WIRED MIPS_CP0_32(6, 0) 37 + #define KVM_REG_MIPS_CP0_HWRENA MIPS_CP0_32(7, 0) 38 + #define KVM_REG_MIPS_CP0_BADVADDR MIPS_CP0_64(8, 0) 39 + #define KVM_REG_MIPS_CP0_COUNT MIPS_CP0_32(9, 0) 40 + #define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) 41 + #define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) 42 + #define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0) 43 + #define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0) 44 + #define KVM_REG_MIPS_CP0_EPC MIPS_CP0_64(14, 0) 45 + #define KVM_REG_MIPS_CP0_EBASE MIPS_CP0_64(15, 1) 46 + #define KVM_REG_MIPS_CP0_CONFIG MIPS_CP0_32(16, 0) 47 + #define KVM_REG_MIPS_CP0_CONFIG1 MIPS_CP0_32(16, 1) 48 + #define KVM_REG_MIPS_CP0_CONFIG2 MIPS_CP0_32(16, 2) 49 + #define KVM_REG_MIPS_CP0_CONFIG3 MIPS_CP0_32(16, 3) 50 + #define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7) 51 + #define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0) 52 + #define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0) 53 + 22 54 23 55 #define KVM_MAX_VCPUS 1 24 56 #define KVM_USER_MEM_SLOTS 8 ··· 404 372 405 373 u32 io_gpr; /* GPR used as IO source/target */ 406 374 407 - /* Used to calibrate the virutal count register for the guest */ 408 - int32_t host_cp0_count; 375 + struct hrtimer comparecount_timer; 376 + /* Count timer control 
KVM register */ 377 + uint32_t count_ctl; 378 + /* Count bias from the raw time */ 379 + uint32_t count_bias; 380 + /* Frequency of timer in Hz */ 381 + uint32_t count_hz; 382 + /* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */ 383 + s64 count_dyn_bias; 384 + /* Resume time */ 385 + ktime_t count_resume; 386 + /* Period of timer tick in ns */ 387 + u64 count_period; 409 388 410 389 /* Bitmask of exceptions that are pending */ 411 390 unsigned long pending_exceptions; ··· 437 394 uint32_t guest_kernel_asid[NR_CPUS]; 438 395 struct mm_struct guest_kernel_mm, guest_user_mm; 439 396 440 - struct hrtimer comparecount_timer; 441 - 442 397 int last_sched_cpu; 443 398 444 399 /* WAIT executed */ ··· 451 410 #define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) 452 411 #define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) 453 412 #define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) 413 + #define kvm_write_c0_guest_userlocal(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2] = (val)) 454 414 #define kvm_read_c0_guest_pagemask(cop0) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0]) 455 415 #define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val)) 456 416 #define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0]) ··· 491 449 #define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0]) 492 450 #define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val)) 493 451 452 + /* 453 + * Some of the guest registers may be modified asynchronously (e.g. from a 454 + * hrtimer callback in hard irq context) and therefore need stronger atomicity 455 + * guarantees than other registers. 
456 + */ 457 + 458 + static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg, 459 + unsigned long val) 460 + { 461 + unsigned long temp; 462 + do { 463 + __asm__ __volatile__( 464 + " .set mips3 \n" 465 + " " __LL "%0, %1 \n" 466 + " or %0, %2 \n" 467 + " " __SC "%0, %1 \n" 468 + " .set mips0 \n" 469 + : "=&r" (temp), "+m" (*reg) 470 + : "r" (val)); 471 + } while (unlikely(!temp)); 472 + } 473 + 474 + static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg, 475 + unsigned long val) 476 + { 477 + unsigned long temp; 478 + do { 479 + __asm__ __volatile__( 480 + " .set mips3 \n" 481 + " " __LL "%0, %1 \n" 482 + " and %0, %2 \n" 483 + " " __SC "%0, %1 \n" 484 + " .set mips0 \n" 485 + : "=&r" (temp), "+m" (*reg) 486 + : "r" (~val)); 487 + } while (unlikely(!temp)); 488 + } 489 + 490 + static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, 491 + unsigned long change, 492 + unsigned long val) 493 + { 494 + unsigned long temp; 495 + do { 496 + __asm__ __volatile__( 497 + " .set mips3 \n" 498 + " " __LL "%0, %1 \n" 499 + " and %0, %2 \n" 500 + " or %0, %3 \n" 501 + " " __SC "%0, %1 \n" 502 + " .set mips0 \n" 503 + : "=&r" (temp), "+m" (*reg) 504 + : "r" (~change), "r" (val & change)); 505 + } while (unlikely(!temp)); 506 + } 507 + 494 508 #define kvm_set_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] |= (val)) 495 509 #define kvm_clear_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val)) 496 - #define kvm_set_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] |= (val)) 497 - #define kvm_clear_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val)) 510 + 511 + /* Cause can be modified asynchronously from hardirq hrtimer callback */ 512 + #define kvm_set_c0_guest_cause(cop0, val) \ 513 + _kvm_atomic_set_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val) 514 + #define kvm_clear_c0_guest_cause(cop0, val) \ 515 + _kvm_atomic_clear_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val) 498 516 #define 
kvm_change_c0_guest_cause(cop0, change, val) \ 499 - { \ 500 - kvm_clear_c0_guest_cause(cop0, change); \ 501 - kvm_set_c0_guest_cause(cop0, ((val) & (change))); \ 502 - } 517 + _kvm_atomic_change_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], \ 518 + change, val) 519 + 503 520 #define kvm_set_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] |= (val)) 504 521 #define kvm_clear_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] &= ~(val)) 505 522 #define kvm_change_c0_guest_ebase(cop0, change, val) \ ··· 569 468 570 469 571 470 struct kvm_mips_callbacks { 572 - int (*handle_cop_unusable) (struct kvm_vcpu *vcpu); 573 - int (*handle_tlb_mod) (struct kvm_vcpu *vcpu); 574 - int (*handle_tlb_ld_miss) (struct kvm_vcpu *vcpu); 575 - int (*handle_tlb_st_miss) (struct kvm_vcpu *vcpu); 576 - int (*handle_addr_err_st) (struct kvm_vcpu *vcpu); 577 - int (*handle_addr_err_ld) (struct kvm_vcpu *vcpu); 578 - int (*handle_syscall) (struct kvm_vcpu *vcpu); 579 - int (*handle_res_inst) (struct kvm_vcpu *vcpu); 580 - int (*handle_break) (struct kvm_vcpu *vcpu); 581 - int (*vm_init) (struct kvm *kvm); 582 - int (*vcpu_init) (struct kvm_vcpu *vcpu); 583 - int (*vcpu_setup) (struct kvm_vcpu *vcpu); 584 - gpa_t(*gva_to_gpa) (gva_t gva); 585 - void (*queue_timer_int) (struct kvm_vcpu *vcpu); 586 - void (*dequeue_timer_int) (struct kvm_vcpu *vcpu); 587 - void (*queue_io_int) (struct kvm_vcpu *vcpu, 588 - struct kvm_mips_interrupt *irq); 589 - void (*dequeue_io_int) (struct kvm_vcpu *vcpu, 590 - struct kvm_mips_interrupt *irq); 591 - int (*irq_deliver) (struct kvm_vcpu *vcpu, unsigned int priority, 592 - uint32_t cause); 593 - int (*irq_clear) (struct kvm_vcpu *vcpu, unsigned int priority, 594 - uint32_t cause); 471 + int (*handle_cop_unusable)(struct kvm_vcpu *vcpu); 472 + int (*handle_tlb_mod)(struct kvm_vcpu *vcpu); 473 + int (*handle_tlb_ld_miss)(struct kvm_vcpu *vcpu); 474 + int (*handle_tlb_st_miss)(struct kvm_vcpu *vcpu); 475 + int (*handle_addr_err_st)(struct kvm_vcpu *vcpu); 
476 + int (*handle_addr_err_ld)(struct kvm_vcpu *vcpu); 477 + int (*handle_syscall)(struct kvm_vcpu *vcpu); 478 + int (*handle_res_inst)(struct kvm_vcpu *vcpu); 479 + int (*handle_break)(struct kvm_vcpu *vcpu); 480 + int (*vm_init)(struct kvm *kvm); 481 + int (*vcpu_init)(struct kvm_vcpu *vcpu); 482 + int (*vcpu_setup)(struct kvm_vcpu *vcpu); 483 + gpa_t (*gva_to_gpa)(gva_t gva); 484 + void (*queue_timer_int)(struct kvm_vcpu *vcpu); 485 + void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); 486 + void (*queue_io_int)(struct kvm_vcpu *vcpu, 487 + struct kvm_mips_interrupt *irq); 488 + void (*dequeue_io_int)(struct kvm_vcpu *vcpu, 489 + struct kvm_mips_interrupt *irq); 490 + int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority, 491 + uint32_t cause); 492 + int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority, 493 + uint32_t cause); 494 + int (*get_one_reg)(struct kvm_vcpu *vcpu, 495 + const struct kvm_one_reg *reg, s64 *v); 496 + int (*set_one_reg)(struct kvm_vcpu *vcpu, 497 + const struct kvm_one_reg *reg, s64 v); 595 498 }; 596 499 extern struct kvm_mips_callbacks *kvm_mips_callbacks; 597 500 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); ··· 714 609 extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, 715 610 struct kvm_run *run); 716 611 717 - enum emulation_result kvm_mips_emulate_count(struct kvm_vcpu *vcpu); 612 + uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu); 613 + void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count); 614 + void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare); 615 + void kvm_mips_init_count(struct kvm_vcpu *vcpu); 616 + int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl); 617 + int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume); 618 + int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz); 619 + void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu); 620 + void kvm_mips_count_disable_cause(struct 
kvm_vcpu *vcpu); 621 + enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu); 718 622 719 623 enum emulation_result kvm_mips_check_privilege(unsigned long cause, 720 624 uint32_t *opc, ··· 760 646 struct kvm_vcpu *vcpu); 761 647 762 648 /* Misc */ 763 - extern void mips32_SyncICache(unsigned long addr, unsigned long size); 764 649 extern int kvm_mips_dump_stats(struct kvm_vcpu *vcpu); 765 650 extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); 766 651
+35
arch/mips/include/uapi/asm/kvm.h
··· 106 106 #define KVM_REG_MIPS_LO (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 33) 107 107 #define KVM_REG_MIPS_PC (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 34) 108 108 109 + /* KVM specific control registers */ 110 + 111 + /* 112 + * CP0_Count control 113 + * DC: Set 0: Master disable CP0_Count and set COUNT_RESUME to now 114 + * Set 1: Master re-enable CP0_Count with unchanged bias, handling timer 115 + * interrupts since COUNT_RESUME 116 + * This can be used to freeze the timer to get a consistent snapshot of 117 + * the CP0_Count and timer interrupt pending state, while also resuming 118 + * safely without losing time or guest timer interrupts. 119 + * Other: Reserved, do not change. 120 + */ 121 + #define KVM_REG_MIPS_COUNT_CTL (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \ 122 + 0x20000 | 0) 123 + #define KVM_REG_MIPS_COUNT_CTL_DC 0x00000001 124 + 125 + /* 126 + * CP0_Count resume monotonic nanoseconds 127 + * The monotonic nanosecond time of the last set of COUNT_CTL.DC (master 128 + * disable). Any reads and writes of Count related registers while 129 + * COUNT_CTL.DC=1 will appear to occur at this time. When COUNT_CTL.DC is 130 + * cleared again (master enable) any timer interrupts since this time will be 131 + * emulated. 132 + * Modifications to times in the future are rejected. 133 + */ 134 + #define KVM_REG_MIPS_COUNT_RESUME (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \ 135 + 0x20000 | 1) 136 + /* 137 + * CP0_Count rate in Hz 138 + * Specifies the rate of the CP0_Count timer in Hz. Modifications occur without 139 + * discontinuities in CP0_Count. 140 + */ 141 + #define KVM_REG_MIPS_COUNT_HZ (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \ 142 + 0x20000 | 2) 143 + 109 144 /* 110 145 * KVM MIPS specific structures and definitions 111 146 *
-32
arch/mips/kvm/kvm_locore.S
··· 611 611 .word _C_LABEL(MIPSX(GuestException)) # 29 612 612 .word _C_LABEL(MIPSX(GuestException)) # 30 613 613 .word _C_LABEL(MIPSX(GuestException)) # 31 614 - 615 - 616 - /* This routine makes changes to the instruction stream effective to the hardware. 617 - * It should be called after the instruction stream is written. 618 - * On return, the new instructions are effective. 619 - * Inputs: 620 - * a0 = Start address of new instruction stream 621 - * a1 = Size, in bytes, of new instruction stream 622 - */ 623 - 624 - #define HW_SYNCI_Step $1 625 - LEAF(MIPSX(SyncICache)) 626 - .set push 627 - .set mips32r2 628 - beq a1, zero, 20f 629 - nop 630 - REG_ADDU a1, a0, a1 631 - rdhwr v0, HW_SYNCI_Step 632 - beq v0, zero, 20f 633 - nop 634 - 10: 635 - synci 0(a0) 636 - REG_ADDU a0, a0, v0 637 - sltu v1, a0, a1 638 - bne v1, zero, 10b 639 - nop 640 - sync 641 - 20: 642 - jr.hb ra 643 - nop 644 - .set pop 645 - END(MIPSX(SyncICache))
+73 -72
arch/mips/kvm/kvm_mips.c
··· 61 61 return 0; 62 62 } 63 63 64 - gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 65 - { 66 - return gfn; 67 - } 68 - 69 64 /* XXXKYMA: We are simulatoring a processor that has the WII bit set in Config7, so we 70 65 * are "runnable" if interrupts are pending 71 66 */ ··· 125 130 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 126 131 { 127 132 if (atomic_inc_return(&kvm_mips_instance) == 1) { 128 - kvm_info("%s: 1st KVM instance, setup host TLB parameters\n", 129 - __func__); 133 + kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n", 134 + __func__); 130 135 on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1); 131 136 } 132 137 ··· 144 149 if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE) 145 150 kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]); 146 151 } 147 - 148 - if (kvm->arch.guest_pmap) 149 - kfree(kvm->arch.guest_pmap); 152 + kfree(kvm->arch.guest_pmap); 150 153 151 154 kvm_for_each_vcpu(i, vcpu, kvm) { 152 155 kvm_arch_vcpu_free(vcpu); ··· 179 186 180 187 /* If this is the last instance, restore wired count */ 181 188 if (atomic_dec_return(&kvm_mips_instance) == 0) { 182 - kvm_info("%s: last KVM instance, restoring TLB parameters\n", 183 - __func__); 189 + kvm_debug("%s: last KVM instance, restoring TLB parameters\n", 190 + __func__); 184 191 on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1); 185 192 } 186 193 } ··· 242 249 goto out; 243 250 } 244 251 245 - kvm_info 246 - ("Allocated space for Guest PMAP Table (%ld pages) @ %p\n", 247 - npages, kvm->arch.guest_pmap); 252 + kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n", 253 + npages, kvm->arch.guest_pmap); 248 254 249 255 /* Now setup the page table */ 250 256 for (i = 0; i < npages; i++) { ··· 288 296 if (err) 289 297 goto out_free_cpu; 290 298 291 - kvm_info("kvm @ %p: create cpu %d at %p\n", kvm, id, vcpu); 299 + kvm_debug("kvm @ %p: create cpu %d at %p\n", kvm, id, vcpu); 292 300 293 301 /* Allocate space for host mode exception handlers that handle 294 302 * guest mode 
exits ··· 296 304 if (cpu_has_veic || cpu_has_vint) { 297 305 size = 0x200 + VECTORSPACING * 64; 298 306 } else { 299 - size = 0x200; 307 + size = 0x4000; 300 308 } 301 309 302 310 /* Save Linux EBASE */ ··· 308 316 err = -ENOMEM; 309 317 goto out_free_cpu; 310 318 } 311 - kvm_info("Allocated %d bytes for KVM Exception Handlers @ %p\n", 312 - ALIGN(size, PAGE_SIZE), gebase); 319 + kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n", 320 + ALIGN(size, PAGE_SIZE), gebase); 313 321 314 322 /* Save new ebase */ 315 323 vcpu->arch.guest_ebase = gebase; ··· 334 342 335 343 /* General handler, relocate to unmapped space for sanity's sake */ 336 344 offset = 0x2000; 337 - kvm_info("Installing KVM Exception handlers @ %p, %#x bytes\n", 338 - gebase + offset, 339 - mips32_GuestExceptionEnd - mips32_GuestException); 345 + kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n", 346 + gebase + offset, 347 + mips32_GuestExceptionEnd - mips32_GuestException); 340 348 341 349 memcpy(gebase + offset, mips32_GuestException, 342 350 mips32_GuestExceptionEnd - mips32_GuestException); 343 351 344 352 /* Invalidate the icache for these ranges */ 345 - mips32_SyncICache((unsigned long) gebase, ALIGN(size, PAGE_SIZE)); 353 + local_flush_icache_range((unsigned long)gebase, 354 + (unsigned long)gebase + ALIGN(size, PAGE_SIZE)); 346 355 347 356 /* Allocate comm page for guest kernel, a TLB will be reserved for mapping GVA @ 0xFFFF8000 to this page */ 348 357 vcpu->arch.kseg0_commpage = kzalloc(PAGE_SIZE << 1, GFP_KERNEL); ··· 353 360 goto out_free_gebase; 354 361 } 355 362 356 - kvm_info("Allocated COMM page @ %p\n", vcpu->arch.kseg0_commpage); 363 + kvm_debug("Allocated COMM page @ %p\n", vcpu->arch.kseg0_commpage); 357 364 kvm_mips_commpage_init(vcpu); 358 365 359 366 /* Init */ 360 367 vcpu->arch.last_sched_cpu = -1; 361 368 362 369 /* Start off the timer */ 363 - kvm_mips_emulate_count(vcpu); 370 + kvm_mips_init_count(vcpu); 364 371 365 372 return vcpu; 366 373 ··· 
382 389 383 390 kvm_mips_dump_stats(vcpu); 384 391 385 - if (vcpu->arch.guest_ebase) 386 - kfree(vcpu->arch.guest_ebase); 387 - 388 - if (vcpu->arch.kseg0_commpage) 389 - kfree(vcpu->arch.kseg0_commpage); 390 - 392 + kfree(vcpu->arch.guest_ebase); 393 + kfree(vcpu->arch.kseg0_commpage); 391 394 } 392 395 393 396 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) ··· 412 423 vcpu->mmio_needed = 0; 413 424 } 414 425 426 + local_irq_disable(); 415 427 /* Check if we have any exceptions/interrupts pending */ 416 428 kvm_mips_deliver_interrupts(vcpu, 417 429 kvm_read_c0_guest_cause(vcpu->arch.cop0)); 418 430 419 - local_irq_disable(); 420 431 kvm_guest_enter(); 421 432 422 433 r = __kvm_mips_vcpu_run(run, vcpu); ··· 479 490 return -ENOIOCTLCMD; 480 491 } 481 492 482 - #define MIPS_CP0_32(_R, _S) \ 483 - (KVM_REG_MIPS | KVM_REG_SIZE_U32 | 0x10000 | (8 * (_R) + (_S))) 484 - 485 - #define MIPS_CP0_64(_R, _S) \ 486 - (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0x10000 | (8 * (_R) + (_S))) 487 - 488 - #define KVM_REG_MIPS_CP0_INDEX MIPS_CP0_32(0, 0) 489 - #define KVM_REG_MIPS_CP0_ENTRYLO0 MIPS_CP0_64(2, 0) 490 - #define KVM_REG_MIPS_CP0_ENTRYLO1 MIPS_CP0_64(3, 0) 491 - #define KVM_REG_MIPS_CP0_CONTEXT MIPS_CP0_64(4, 0) 492 - #define KVM_REG_MIPS_CP0_USERLOCAL MIPS_CP0_64(4, 2) 493 - #define KVM_REG_MIPS_CP0_PAGEMASK MIPS_CP0_32(5, 0) 494 - #define KVM_REG_MIPS_CP0_PAGEGRAIN MIPS_CP0_32(5, 1) 495 - #define KVM_REG_MIPS_CP0_WIRED MIPS_CP0_32(6, 0) 496 - #define KVM_REG_MIPS_CP0_HWRENA MIPS_CP0_32(7, 0) 497 - #define KVM_REG_MIPS_CP0_BADVADDR MIPS_CP0_64(8, 0) 498 - #define KVM_REG_MIPS_CP0_COUNT MIPS_CP0_32(9, 0) 499 - #define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) 500 - #define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) 501 - #define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0) 502 - #define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0) 503 - #define KVM_REG_MIPS_CP0_EBASE MIPS_CP0_64(15, 1) 504 - #define KVM_REG_MIPS_CP0_CONFIG MIPS_CP0_32(16, 0) 505 - #define KVM_REG_MIPS_CP0_CONFIG1 
MIPS_CP0_32(16, 1) 506 - #define KVM_REG_MIPS_CP0_CONFIG2 MIPS_CP0_32(16, 2) 507 - #define KVM_REG_MIPS_CP0_CONFIG3 MIPS_CP0_32(16, 3) 508 - #define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7) 509 - #define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0) 510 - #define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0) 511 - 512 493 static u64 kvm_mips_get_one_regs[] = { 513 494 KVM_REG_MIPS_R0, 514 495 KVM_REG_MIPS_R1, ··· 519 560 520 561 KVM_REG_MIPS_CP0_INDEX, 521 562 KVM_REG_MIPS_CP0_CONTEXT, 563 + KVM_REG_MIPS_CP0_USERLOCAL, 522 564 KVM_REG_MIPS_CP0_PAGEMASK, 523 565 KVM_REG_MIPS_CP0_WIRED, 566 + KVM_REG_MIPS_CP0_HWRENA, 524 567 KVM_REG_MIPS_CP0_BADVADDR, 568 + KVM_REG_MIPS_CP0_COUNT, 525 569 KVM_REG_MIPS_CP0_ENTRYHI, 570 + KVM_REG_MIPS_CP0_COMPARE, 526 571 KVM_REG_MIPS_CP0_STATUS, 527 572 KVM_REG_MIPS_CP0_CAUSE, 528 - /* EPC set via kvm_regs, et al. */ 573 + KVM_REG_MIPS_CP0_EPC, 529 574 KVM_REG_MIPS_CP0_CONFIG, 530 575 KVM_REG_MIPS_CP0_CONFIG1, 531 576 KVM_REG_MIPS_CP0_CONFIG2, 532 577 KVM_REG_MIPS_CP0_CONFIG3, 533 578 KVM_REG_MIPS_CP0_CONFIG7, 534 - KVM_REG_MIPS_CP0_ERROREPC 579 + KVM_REG_MIPS_CP0_ERROREPC, 580 + 581 + KVM_REG_MIPS_COUNT_CTL, 582 + KVM_REG_MIPS_COUNT_RESUME, 583 + KVM_REG_MIPS_COUNT_HZ, 535 584 }; 536 585 537 586 static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, 538 587 const struct kvm_one_reg *reg) 539 588 { 540 589 struct mips_coproc *cop0 = vcpu->arch.cop0; 590 + int ret; 541 591 s64 v; 542 592 543 593 switch (reg->id) { ··· 569 601 case KVM_REG_MIPS_CP0_CONTEXT: 570 602 v = (long)kvm_read_c0_guest_context(cop0); 571 603 break; 604 + case KVM_REG_MIPS_CP0_USERLOCAL: 605 + v = (long)kvm_read_c0_guest_userlocal(cop0); 606 + break; 572 607 case KVM_REG_MIPS_CP0_PAGEMASK: 573 608 v = (long)kvm_read_c0_guest_pagemask(cop0); 574 609 break; 575 610 case KVM_REG_MIPS_CP0_WIRED: 576 611 v = (long)kvm_read_c0_guest_wired(cop0); 612 + break; 613 + case KVM_REG_MIPS_CP0_HWRENA: 614 + v = (long)kvm_read_c0_guest_hwrena(cop0); 577 615 break; 578 616 case 
KVM_REG_MIPS_CP0_BADVADDR: 579 617 v = (long)kvm_read_c0_guest_badvaddr(cop0); ··· 587 613 case KVM_REG_MIPS_CP0_ENTRYHI: 588 614 v = (long)kvm_read_c0_guest_entryhi(cop0); 589 615 break; 616 + case KVM_REG_MIPS_CP0_COMPARE: 617 + v = (long)kvm_read_c0_guest_compare(cop0); 618 + break; 590 619 case KVM_REG_MIPS_CP0_STATUS: 591 620 v = (long)kvm_read_c0_guest_status(cop0); 592 621 break; 593 622 case KVM_REG_MIPS_CP0_CAUSE: 594 623 v = (long)kvm_read_c0_guest_cause(cop0); 624 + break; 625 + case KVM_REG_MIPS_CP0_EPC: 626 + v = (long)kvm_read_c0_guest_epc(cop0); 595 627 break; 596 628 case KVM_REG_MIPS_CP0_ERROREPC: 597 629 v = (long)kvm_read_c0_guest_errorepc(cop0); ··· 616 636 break; 617 637 case KVM_REG_MIPS_CP0_CONFIG7: 618 638 v = (long)kvm_read_c0_guest_config7(cop0); 639 + break; 640 + /* registers to be handled specially */ 641 + case KVM_REG_MIPS_CP0_COUNT: 642 + case KVM_REG_MIPS_COUNT_CTL: 643 + case KVM_REG_MIPS_COUNT_RESUME: 644 + case KVM_REG_MIPS_COUNT_HZ: 645 + ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); 646 + if (ret) 647 + return ret; 619 648 break; 620 649 default: 621 650 return -EINVAL; ··· 686 697 case KVM_REG_MIPS_CP0_CONTEXT: 687 698 kvm_write_c0_guest_context(cop0, v); 688 699 break; 700 + case KVM_REG_MIPS_CP0_USERLOCAL: 701 + kvm_write_c0_guest_userlocal(cop0, v); 702 + break; 689 703 case KVM_REG_MIPS_CP0_PAGEMASK: 690 704 kvm_write_c0_guest_pagemask(cop0, v); 691 705 break; 692 706 case KVM_REG_MIPS_CP0_WIRED: 693 707 kvm_write_c0_guest_wired(cop0, v); 708 + break; 709 + case KVM_REG_MIPS_CP0_HWRENA: 710 + kvm_write_c0_guest_hwrena(cop0, v); 694 711 break; 695 712 case KVM_REG_MIPS_CP0_BADVADDR: 696 713 kvm_write_c0_guest_badvaddr(cop0, v); ··· 707 712 case KVM_REG_MIPS_CP0_STATUS: 708 713 kvm_write_c0_guest_status(cop0, v); 709 714 break; 710 - case KVM_REG_MIPS_CP0_CAUSE: 711 - kvm_write_c0_guest_cause(cop0, v); 715 + case KVM_REG_MIPS_CP0_EPC: 716 + kvm_write_c0_guest_epc(cop0, v); 712 717 break; 713 718 case 
KVM_REG_MIPS_CP0_ERROREPC: 714 719 kvm_write_c0_guest_errorepc(cop0, v); 715 720 break; 721 + /* registers to be handled specially */ 722 + case KVM_REG_MIPS_CP0_COUNT: 723 + case KVM_REG_MIPS_CP0_COMPARE: 724 + case KVM_REG_MIPS_CP0_CAUSE: 725 + case KVM_REG_MIPS_COUNT_CTL: 726 + case KVM_REG_MIPS_COUNT_RESUME: 727 + case KVM_REG_MIPS_COUNT_HZ: 728 + return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); 716 729 default: 717 730 return -EINVAL; 718 731 } ··· 923 920 return -1; 924 921 925 922 printk("VCPU Register Dump:\n"); 926 - printk("\tpc = 0x%08lx\n", vcpu->arch.pc);; 923 + printk("\tpc = 0x%08lx\n", vcpu->arch.pc); 927 924 printk("\texceptions: %08lx\n", vcpu->arch.pending_exceptions); 928 925 929 926 for (i = 0; i < 32; i += 4) { ··· 972 969 return 0; 973 970 } 974 971 975 - void kvm_mips_comparecount_func(unsigned long data) 972 + static void kvm_mips_comparecount_func(unsigned long data) 976 973 { 977 974 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; 978 975 ··· 987 984 /* 988 985 * low level hrtimer wake routine. 989 986 */ 990 - enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) 987 + static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) 991 988 { 992 989 struct kvm_vcpu *vcpu; 993 990 994 991 vcpu = container_of(timer, struct kvm_vcpu, arch.comparecount_timer); 995 992 kvm_mips_comparecount_func((unsigned long) vcpu); 996 - hrtimer_forward_now(&vcpu->arch.comparecount_timer, 997 - ktime_set(0, MS_TO_NS(10))); 998 - return HRTIMER_RESTART; 993 + return kvm_mips_count_timeout(vcpu); 999 994 } 1000 995 1001 996 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+9 -6
arch/mips/kvm/kvm_mips_dyntrans.c
··· 16 16 #include <linux/vmalloc.h> 17 17 #include <linux/fs.h> 18 18 #include <linux/bootmem.h> 19 + #include <asm/cacheflush.h> 19 20 20 21 #include "kvm_mips_comm.h" 21 22 ··· 41 40 CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa 42 41 (vcpu, (unsigned long) opc)); 43 42 memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t)); 44 - mips32_SyncICache(kseg0_opc, 32); 43 + local_flush_icache_range(kseg0_opc, kseg0_opc + 32); 45 44 46 45 return result; 47 46 } ··· 67 66 CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa 68 67 (vcpu, (unsigned long) opc)); 69 68 memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t)); 70 - mips32_SyncICache(kseg0_opc, 32); 69 + local_flush_icache_range(kseg0_opc, kseg0_opc + 32); 71 70 72 71 return result; 73 72 } ··· 100 99 CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa 101 100 (vcpu, (unsigned long) opc)); 102 101 memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t)); 103 - mips32_SyncICache(kseg0_opc, 32); 102 + local_flush_icache_range(kseg0_opc, kseg0_opc + 32); 104 103 } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { 105 104 local_irq_save(flags); 106 105 memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t)); 107 - mips32_SyncICache((unsigned long) opc, 32); 106 + local_flush_icache_range((unsigned long)opc, 107 + (unsigned long)opc + 32); 108 108 local_irq_restore(flags); 109 109 } else { 110 110 kvm_err("%s: Invalid address: %p\n", __func__, opc); ··· 136 134 CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa 137 135 (vcpu, (unsigned long) opc)); 138 136 memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t)); 139 - mips32_SyncICache(kseg0_opc, 32); 137 + local_flush_icache_range(kseg0_opc, kseg0_opc + 32); 140 138 } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { 141 139 local_irq_save(flags); 142 140 memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t)); 143 - mips32_SyncICache((unsigned long) opc, 32); 141 + 
local_flush_icache_range((unsigned long)opc, 142 + (unsigned long)opc + 32); 144 143 local_irq_restore(flags); 145 144 } else { 146 145 kvm_err("%s: Invalid address: %p\n", __func__, opc);
+533 -26
arch/mips/kvm/kvm_mips_emul.c
··· 11 11 12 12 #include <linux/errno.h> 13 13 #include <linux/err.h> 14 + #include <linux/ktime.h> 14 15 #include <linux/kvm_host.h> 15 16 #include <linux/module.h> 16 17 #include <linux/vmalloc.h> ··· 229 228 return er; 230 229 } 231 230 232 - /* Everytime the compare register is written to, we need to decide when to fire 233 - * the timer that represents timer ticks to the GUEST. 231 + /** 232 + * kvm_mips_count_disabled() - Find whether the CP0_Count timer is disabled. 233 + * @vcpu: Virtual CPU. 234 234 * 235 + * Returns: 1 if the CP0_Count timer is disabled by either the guest 236 + * CP0_Cause.DC bit or the count_ctl.DC bit. 237 + * 0 otherwise (in which case CP0_Count timer is running). 235 238 */ 236 - enum emulation_result kvm_mips_emulate_count(struct kvm_vcpu *vcpu) 239 + static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu) 237 240 { 238 241 struct mips_coproc *cop0 = vcpu->arch.cop0; 239 - enum emulation_result er = EMULATE_DONE; 242 + return (vcpu->arch.count_ctl & KVM_REG_MIPS_COUNT_CTL_DC) || 243 + (kvm_read_c0_guest_cause(cop0) & CAUSEF_DC); 244 + } 240 245 241 - /* If COUNT is enabled */ 242 - if (!(kvm_read_c0_guest_cause(cop0) & CAUSEF_DC)) { 243 - hrtimer_try_to_cancel(&vcpu->arch.comparecount_timer); 244 - hrtimer_start(&vcpu->arch.comparecount_timer, 245 - ktime_set(0, MS_TO_NS(10)), HRTIMER_MODE_REL); 246 - } else { 247 - hrtimer_try_to_cancel(&vcpu->arch.comparecount_timer); 246 + /** 247 + * kvm_mips_ktime_to_count() - Scale ktime_t to a 32-bit count. 248 + * 249 + * Caches the dynamic nanosecond bias in vcpu->arch.count_dyn_bias. 250 + * 251 + * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). 
252 + */ 253 + static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now) 254 + { 255 + s64 now_ns, periods; 256 + u64 delta; 257 + 258 + now_ns = ktime_to_ns(now); 259 + delta = now_ns + vcpu->arch.count_dyn_bias; 260 + 261 + if (delta >= vcpu->arch.count_period) { 262 + /* If delta is out of safe range the bias needs adjusting */ 263 + periods = div64_s64(now_ns, vcpu->arch.count_period); 264 + vcpu->arch.count_dyn_bias = -periods * vcpu->arch.count_period; 265 + /* Recalculate delta with new bias */ 266 + delta = now_ns + vcpu->arch.count_dyn_bias; 248 267 } 249 268 250 - return er; 269 + /* 270 + * We've ensured that: 271 + * delta < count_period 272 + * 273 + * Therefore the intermediate delta*count_hz will never overflow since 274 + * at the boundary condition: 275 + * delta = count_period 276 + * delta = NSEC_PER_SEC * 2^32 / count_hz 277 + * delta * count_hz = NSEC_PER_SEC * 2^32 278 + */ 279 + return div_u64(delta * vcpu->arch.count_hz, NSEC_PER_SEC); 280 + } 281 + 282 + /** 283 + * kvm_mips_count_time() - Get effective current time. 284 + * @vcpu: Virtual CPU. 285 + * 286 + * Get effective monotonic ktime. This is usually a straightforward ktime_get(), 287 + * except when the master disable bit is set in count_ctl, in which case it is 288 + * count_resume, i.e. the time that the count was disabled. 289 + * 290 + * Returns: Effective monotonic ktime for CP0_Count. 291 + */ 292 + static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu) 293 + { 294 + if (unlikely(vcpu->arch.count_ctl & KVM_REG_MIPS_COUNT_CTL_DC)) 295 + return vcpu->arch.count_resume; 296 + 297 + return ktime_get(); 298 + } 299 + 300 + /** 301 + * kvm_mips_read_count_running() - Read the current count value as if running. 302 + * @vcpu: Virtual CPU. 303 + * @now: Kernel time to read CP0_Count at. 304 + * 305 + * Returns the current guest CP0_Count register at time @now and handles if the 306 + * timer interrupt is pending and hasn't been handled yet. 
307 + * 308 + * Returns: The current value of the guest CP0_Count register. 309 + */ 310 + static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) 311 + { 312 + ktime_t expires; 313 + int running; 314 + 315 + /* Is the hrtimer pending? */ 316 + expires = hrtimer_get_expires(&vcpu->arch.comparecount_timer); 317 + if (ktime_compare(now, expires) >= 0) { 318 + /* 319 + * Cancel it while we handle it so there's no chance of 320 + * interference with the timeout handler. 321 + */ 322 + running = hrtimer_cancel(&vcpu->arch.comparecount_timer); 323 + 324 + /* Nothing should be waiting on the timeout */ 325 + kvm_mips_callbacks->queue_timer_int(vcpu); 326 + 327 + /* 328 + * Restart the timer if it was running based on the expiry time 329 + * we read, so that we don't push it back 2 periods. 330 + */ 331 + if (running) { 332 + expires = ktime_add_ns(expires, 333 + vcpu->arch.count_period); 334 + hrtimer_start(&vcpu->arch.comparecount_timer, expires, 335 + HRTIMER_MODE_ABS); 336 + } 337 + } 338 + 339 + /* Return the biased and scaled guest CP0_Count */ 340 + return vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now); 341 + } 342 + 343 + /** 344 + * kvm_mips_read_count() - Read the current count value. 345 + * @vcpu: Virtual CPU. 346 + * 347 + * Read the current guest CP0_Count value, taking into account whether the timer 348 + * is stopped. 349 + * 350 + * Returns: The current guest CP0_Count value. 351 + */ 352 + uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu) 353 + { 354 + struct mips_coproc *cop0 = vcpu->arch.cop0; 355 + 356 + /* If count disabled just read static copy of count */ 357 + if (kvm_mips_count_disabled(vcpu)) 358 + return kvm_read_c0_guest_count(cop0); 359 + 360 + return kvm_mips_read_count_running(vcpu, ktime_get()); 361 + } 362 + 363 + /** 364 + * kvm_mips_freeze_hrtimer() - Safely stop the hrtimer. 365 + * @vcpu: Virtual CPU. 366 + * @count: Output pointer for CP0_Count value at point of freeze. 
367 + * 368 + * Freeze the hrtimer safely and return both the ktime and the CP0_Count value 369 + * at the point it was frozen. It is guaranteed that any pending interrupts at 370 + * the point it was frozen are handled, and none after that point. 371 + * 372 + * This is useful where the time/CP0_Count is needed in the calculation of the 373 + * new parameters. 374 + * 375 + * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). 376 + * 377 + * Returns: The ktime at the point of freeze. 378 + */ 379 + static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, 380 + uint32_t *count) 381 + { 382 + ktime_t now; 383 + 384 + /* stop hrtimer before finding time */ 385 + hrtimer_cancel(&vcpu->arch.comparecount_timer); 386 + now = ktime_get(); 387 + 388 + /* find count at this point and handle pending hrtimer */ 389 + *count = kvm_mips_read_count_running(vcpu, now); 390 + 391 + return now; 392 + } 393 + 394 + 395 + /** 396 + * kvm_mips_resume_hrtimer() - Resume hrtimer, updating expiry. 397 + * @vcpu: Virtual CPU. 398 + * @now: ktime at point of resume. 399 + * @count: CP0_Count at point of resume. 400 + * 401 + * Resumes the timer and updates the timer expiry based on @now and @count. 402 + * This can be used in conjunction with kvm_mips_freeze_timer() when timer 403 + * parameters need to be changed. 404 + * 405 + * It is guaranteed that a timer interrupt immediately after resume will be 406 + * handled, but not if CP_Compare is exactly at @count. That case is already 407 + * handled by kvm_mips_freeze_timer(). 408 + * 409 + * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). 
410 + */ 411 + static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, 412 + ktime_t now, uint32_t count) 413 + { 414 + struct mips_coproc *cop0 = vcpu->arch.cop0; 415 + uint32_t compare; 416 + u64 delta; 417 + ktime_t expire; 418 + 419 + /* Calculate timeout (wrap 0 to 2^32) */ 420 + compare = kvm_read_c0_guest_compare(cop0); 421 + delta = (u64)(uint32_t)(compare - count - 1) + 1; 422 + delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz); 423 + expire = ktime_add_ns(now, delta); 424 + 425 + /* Update hrtimer to use new timeout */ 426 + hrtimer_cancel(&vcpu->arch.comparecount_timer); 427 + hrtimer_start(&vcpu->arch.comparecount_timer, expire, HRTIMER_MODE_ABS); 428 + } 429 + 430 + /** 431 + * kvm_mips_update_hrtimer() - Update next expiry time of hrtimer. 432 + * @vcpu: Virtual CPU. 433 + * 434 + * Recalculates and updates the expiry time of the hrtimer. This can be used 435 + * after timer parameters have been altered which do not depend on the time that 436 + * the change occurs (in those cases kvm_mips_freeze_hrtimer() and 437 + * kvm_mips_resume_hrtimer() are used directly). 438 + * 439 + * It is guaranteed that no timer interrupts will be lost in the process. 440 + * 441 + * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). 442 + */ 443 + static void kvm_mips_update_hrtimer(struct kvm_vcpu *vcpu) 444 + { 445 + ktime_t now; 446 + uint32_t count; 447 + 448 + /* 449 + * freeze_hrtimer takes care of a timer interrupts <= count, and 450 + * resume_hrtimer the hrtimer takes care of a timer interrupts > count. 451 + */ 452 + now = kvm_mips_freeze_hrtimer(vcpu, &count); 453 + kvm_mips_resume_hrtimer(vcpu, now, count); 454 + } 455 + 456 + /** 457 + * kvm_mips_write_count() - Modify the count and update timer. 458 + * @vcpu: Virtual CPU. 459 + * @count: Guest CP0_Count value to set. 460 + * 461 + * Sets the CP0_Count value and updates the timer accordingly. 
462 + */ 463 + void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count) 464 + { 465 + struct mips_coproc *cop0 = vcpu->arch.cop0; 466 + ktime_t now; 467 + 468 + /* Calculate bias */ 469 + now = kvm_mips_count_time(vcpu); 470 + vcpu->arch.count_bias = count - kvm_mips_ktime_to_count(vcpu, now); 471 + 472 + if (kvm_mips_count_disabled(vcpu)) 473 + /* The timer's disabled, adjust the static count */ 474 + kvm_write_c0_guest_count(cop0, count); 475 + else 476 + /* Update timeout */ 477 + kvm_mips_resume_hrtimer(vcpu, now, count); 478 + } 479 + 480 + /** 481 + * kvm_mips_init_count() - Initialise timer. 482 + * @vcpu: Virtual CPU. 483 + * 484 + * Initialise the timer to a sensible frequency, namely 100MHz, zero it, and set 485 + * it going if it's enabled. 486 + */ 487 + void kvm_mips_init_count(struct kvm_vcpu *vcpu) 488 + { 489 + /* 100 MHz */ 490 + vcpu->arch.count_hz = 100*1000*1000; 491 + vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, 492 + vcpu->arch.count_hz); 493 + vcpu->arch.count_dyn_bias = 0; 494 + 495 + /* Starting at 0 */ 496 + kvm_mips_write_count(vcpu, 0); 497 + } 498 + 499 + /** 500 + * kvm_mips_set_count_hz() - Update the frequency of the timer. 501 + * @vcpu: Virtual CPU. 502 + * @count_hz: Frequency of CP0_Count timer in Hz. 503 + * 504 + * Change the frequency of the CP0_Count timer. This is done atomically so that 505 + * CP0_Count is continuous and no timer interrupt is lost. 506 + * 507 + * Returns: -EINVAL if @count_hz is out of range. 508 + * 0 on success. 509 + */ 510 + int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz) 511 + { 512 + struct mips_coproc *cop0 = vcpu->arch.cop0; 513 + int dc; 514 + ktime_t now; 515 + u32 count; 516 + 517 + /* ensure the frequency is in a sensible range... */ 518 + if (count_hz <= 0 || count_hz > NSEC_PER_SEC) 519 + return -EINVAL; 520 + /* ... 
and has actually changed */ 521 + if (vcpu->arch.count_hz == count_hz) 522 + return 0; 523 + 524 + /* Safely freeze timer so we can keep it continuous */ 525 + dc = kvm_mips_count_disabled(vcpu); 526 + if (dc) { 527 + now = kvm_mips_count_time(vcpu); 528 + count = kvm_read_c0_guest_count(cop0); 529 + } else { 530 + now = kvm_mips_freeze_hrtimer(vcpu, &count); 531 + } 532 + 533 + /* Update the frequency */ 534 + vcpu->arch.count_hz = count_hz; 535 + vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, count_hz); 536 + vcpu->arch.count_dyn_bias = 0; 537 + 538 + /* Calculate adjusted bias so dynamic count is unchanged */ 539 + vcpu->arch.count_bias = count - kvm_mips_ktime_to_count(vcpu, now); 540 + 541 + /* Update and resume hrtimer */ 542 + if (!dc) 543 + kvm_mips_resume_hrtimer(vcpu, now, count); 544 + return 0; 545 + } 546 + 547 + /** 548 + * kvm_mips_write_compare() - Modify compare and update timer. 549 + * @vcpu: Virtual CPU. 550 + * @compare: New CP0_Compare value. 551 + * 552 + * Update CP0_Compare to a new value and update the timeout. 553 + */ 554 + void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare) 555 + { 556 + struct mips_coproc *cop0 = vcpu->arch.cop0; 557 + 558 + /* if unchanged, must just be an ack */ 559 + if (kvm_read_c0_guest_compare(cop0) == compare) 560 + return; 561 + 562 + /* Update compare */ 563 + kvm_write_c0_guest_compare(cop0, compare); 564 + 565 + /* Update timeout if count enabled */ 566 + if (!kvm_mips_count_disabled(vcpu)) 567 + kvm_mips_update_hrtimer(vcpu); 568 + } 569 + 570 + /** 571 + * kvm_mips_count_disable() - Disable count. 572 + * @vcpu: Virtual CPU. 573 + * 574 + * Disable the CP0_Count timer. A timer interrupt on or before the final stop 575 + * time will be handled but not after. 576 + * 577 + * Assumes CP0_Count was previously enabled but now Guest.CP0_Cause.DC or 578 + * count_ctl.DC has been set (count disabled). 579 + * 580 + * Returns: The time that the timer was stopped. 
581 + */ 582 + static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu) 583 + { 584 + struct mips_coproc *cop0 = vcpu->arch.cop0; 585 + uint32_t count; 586 + ktime_t now; 587 + 588 + /* Stop hrtimer */ 589 + hrtimer_cancel(&vcpu->arch.comparecount_timer); 590 + 591 + /* Set the static count from the dynamic count, handling pending TI */ 592 + now = ktime_get(); 593 + count = kvm_mips_read_count_running(vcpu, now); 594 + kvm_write_c0_guest_count(cop0, count); 595 + 596 + return now; 597 + } 598 + 599 + /** 600 + * kvm_mips_count_disable_cause() - Disable count using CP0_Cause.DC. 601 + * @vcpu: Virtual CPU. 602 + * 603 + * Disable the CP0_Count timer and set CP0_Cause.DC. A timer interrupt on or 604 + * before the final stop time will be handled if the timer isn't disabled by 605 + * count_ctl.DC, but not after. 606 + * 607 + * Assumes CP0_Cause.DC is clear (count enabled). 608 + */ 609 + void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu) 610 + { 611 + struct mips_coproc *cop0 = vcpu->arch.cop0; 612 + 613 + kvm_set_c0_guest_cause(cop0, CAUSEF_DC); 614 + if (!(vcpu->arch.count_ctl & KVM_REG_MIPS_COUNT_CTL_DC)) 615 + kvm_mips_count_disable(vcpu); 616 + } 617 + 618 + /** 619 + * kvm_mips_count_enable_cause() - Enable count using CP0_Cause.DC. 620 + * @vcpu: Virtual CPU. 621 + * 622 + * Enable the CP0_Count timer and clear CP0_Cause.DC. A timer interrupt after 623 + * the start time will be handled if the timer isn't disabled by count_ctl.DC, 624 + * potentially before even returning, so the caller should be careful with 625 + * ordering of CP0_Cause modifications so as not to lose it. 626 + * 627 + * Assumes CP0_Cause.DC is set (count disabled). 628 + */ 629 + void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu) 630 + { 631 + struct mips_coproc *cop0 = vcpu->arch.cop0; 632 + uint32_t count; 633 + 634 + kvm_clear_c0_guest_cause(cop0, CAUSEF_DC); 635 + 636 + /* 637 + * Set the dynamic count to match the static count. 
638 + * This starts the hrtimer if count_ctl.DC allows it. 639 + * Otherwise it conveniently updates the biases. 640 + */ 641 + count = kvm_read_c0_guest_count(cop0); 642 + kvm_mips_write_count(vcpu, count); 643 + } 644 + 645 + /** 646 + * kvm_mips_set_count_ctl() - Update the count control KVM register. 647 + * @vcpu: Virtual CPU. 648 + * @count_ctl: Count control register new value. 649 + * 650 + * Set the count control KVM register. The timer is updated accordingly. 651 + * 652 + * Returns: -EINVAL if reserved bits are set. 653 + * 0 on success. 654 + */ 655 + int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl) 656 + { 657 + struct mips_coproc *cop0 = vcpu->arch.cop0; 658 + s64 changed = count_ctl ^ vcpu->arch.count_ctl; 659 + s64 delta; 660 + ktime_t expire, now; 661 + uint32_t count, compare; 662 + 663 + /* Only allow defined bits to be changed */ 664 + if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC)) 665 + return -EINVAL; 666 + 667 + /* Apply new value */ 668 + vcpu->arch.count_ctl = count_ctl; 669 + 670 + /* Master CP0_Count disable */ 671 + if (changed & KVM_REG_MIPS_COUNT_CTL_DC) { 672 + /* Is CP0_Cause.DC already disabling CP0_Count? */ 673 + if (kvm_read_c0_guest_cause(cop0) & CAUSEF_DC) { 674 + if (count_ctl & KVM_REG_MIPS_COUNT_CTL_DC) 675 + /* Just record the current time */ 676 + vcpu->arch.count_resume = ktime_get(); 677 + } else if (count_ctl & KVM_REG_MIPS_COUNT_CTL_DC) { 678 + /* disable timer and record current time */ 679 + vcpu->arch.count_resume = kvm_mips_count_disable(vcpu); 680 + } else { 681 + /* 682 + * Calculate timeout relative to static count at resume 683 + * time (wrap 0 to 2^32). 
684 + */ 685 + count = kvm_read_c0_guest_count(cop0); 686 + compare = kvm_read_c0_guest_compare(cop0); 687 + delta = (u64)(uint32_t)(compare - count - 1) + 1; 688 + delta = div_u64(delta * NSEC_PER_SEC, 689 + vcpu->arch.count_hz); 690 + expire = ktime_add_ns(vcpu->arch.count_resume, delta); 691 + 692 + /* Handle pending interrupt */ 693 + now = ktime_get(); 694 + if (ktime_compare(now, expire) >= 0) 695 + /* Nothing should be waiting on the timeout */ 696 + kvm_mips_callbacks->queue_timer_int(vcpu); 697 + 698 + /* Resume hrtimer without changing bias */ 699 + count = kvm_mips_read_count_running(vcpu, now); 700 + kvm_mips_resume_hrtimer(vcpu, now, count); 701 + } 702 + } 703 + 704 + return 0; 705 + } 706 + 707 + /** 708 + * kvm_mips_set_count_resume() - Update the count resume KVM register. 709 + * @vcpu: Virtual CPU. 710 + * @count_resume: Count resume register new value. 711 + * 712 + * Set the count resume KVM register. 713 + * 714 + * Returns: -EINVAL if out of valid range (0..now). 715 + * 0 on success. 716 + */ 717 + int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume) 718 + { 719 + /* 720 + * It doesn't make sense for the resume time to be in the future, as it 721 + * would be possible for the next interrupt to be more than a full 722 + * period in the future. 723 + */ 724 + if (count_resume < 0 || count_resume > ktime_to_ns(ktime_get())) 725 + return -EINVAL; 726 + 727 + vcpu->arch.count_resume = ns_to_ktime(count_resume); 728 + return 0; 729 + } 730 + 731 + /** 732 + * kvm_mips_count_timeout() - Push timer forward on timeout. 733 + * @vcpu: Virtual CPU. 734 + * 735 + * Handle an hrtimer event by push the hrtimer forward a period. 736 + * 737 + * Returns: The hrtimer_restart value to return to the hrtimer subsystem. 
738 + */ 739 + enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu) 740 + { 741 + /* Add the Count period to the current expiry time */ 742 + hrtimer_add_expires_ns(&vcpu->arch.comparecount_timer, 743 + vcpu->arch.count_period); 744 + return HRTIMER_RESTART; 251 745 } 252 746 253 747 enum emulation_result kvm_mips_emul_eret(struct kvm_vcpu *vcpu) ··· 967 471 #endif 968 472 /* Get reg */ 969 473 if ((rd == MIPS_CP0_COUNT) && (sel == 0)) { 970 - /* XXXKYMA: Run the Guest count register @ 1/4 the rate of the host */ 971 - vcpu->arch.gprs[rt] = (read_c0_count() >> 2); 474 + vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu); 972 475 } else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) { 973 476 vcpu->arch.gprs[rt] = 0x0; 974 477 #ifdef CONFIG_KVM_MIPS_DYN_TRANS ··· 1034 539 } 1035 540 /* Are we writing to COUNT */ 1036 541 else if ((rd == MIPS_CP0_COUNT) && (sel == 0)) { 1037 - /* Linux doesn't seem to write into COUNT, we throw an error 1038 - * if we notice a write to COUNT 1039 - */ 1040 - /*er = EMULATE_FAIL; */ 542 + kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]); 1041 543 goto done; 1042 544 } else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) { 1043 545 kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n", ··· 1044 552 /* If we are writing to COMPARE */ 1045 553 /* Clear pending timer interrupt, if any */ 1046 554 kvm_mips_callbacks->dequeue_timer_int(vcpu); 1047 - kvm_write_c0_guest_compare(cop0, 1048 - vcpu->arch.gprs[rt]); 555 + kvm_mips_write_compare(vcpu, 556 + vcpu->arch.gprs[rt]); 1049 557 } else if ((rd == MIPS_CP0_STATUS) && (sel == 0)) { 1050 558 kvm_write_c0_guest_status(cop0, 1051 559 vcpu->arch.gprs[rt]); ··· 1056 564 #ifdef CONFIG_KVM_MIPS_DYN_TRANS 1057 565 kvm_mips_trans_mtc0(inst, opc, vcpu); 1058 566 #endif 567 + } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) { 568 + uint32_t old_cause, new_cause; 569 + old_cause = kvm_read_c0_guest_cause(cop0); 570 + new_cause = vcpu->arch.gprs[rt]; 571 + /* Update R/W bits */ 572 + 
kvm_change_c0_guest_cause(cop0, 0x08800300, 573 + new_cause); 574 + /* DC bit enabling/disabling timer? */ 575 + if ((old_cause ^ new_cause) & CAUSEF_DC) { 576 + if (new_cause & CAUSEF_DC) 577 + kvm_mips_count_disable_cause(vcpu); 578 + else 579 + kvm_mips_count_enable_cause(vcpu); 580 + } 1059 581 } else { 1060 582 cop0->reg[rd][sel] = vcpu->arch.gprs[rt]; 1061 583 #ifdef CONFIG_KVM_MIPS_DYN_TRANS ··· 1393 887 1394 888 printk("%s: va: %#lx, unmapped: %#x\n", __func__, va, CKSEG0ADDR(pa)); 1395 889 1396 - mips32_SyncICache(CKSEG0ADDR(pa), 32); 890 + local_flush_icache_range(CKSEG0ADDR(pa), 32); 1397 891 return 0; 1398 892 } 1399 893 ··· 1831 1325 struct kvm_run *run, struct kvm_vcpu *vcpu) 1832 1326 { 1833 1327 enum emulation_result er = EMULATE_DONE; 1834 - 1835 1328 #ifdef DEBUG 1329 + struct mips_coproc *cop0 = vcpu->arch.cop0; 1330 + unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) | 1331 + (kvm_read_c0_guest_entryhi(cop0) & ASID_MASK); 1332 + int index; 1333 + 1836 1334 /* 1837 1335 * If address not in the guest TLB, then we are in trouble 1838 1336 */ ··· 2063 1553 current_cpu_data.icache.linesz); 2064 1554 break; 2065 1555 case 2: /* Read count register */ 2066 - printk("RDHWR: Cont register\n"); 2067 - arch->gprs[rt] = kvm_read_c0_guest_count(cop0); 1556 + arch->gprs[rt] = kvm_mips_read_count(vcpu); 2068 1557 break; 2069 1558 case 3: /* Count register resolution */ 2070 1559 switch (current_cpu_data.cputype) { ··· 2319 1810 er = EMULATE_FAIL; 2320 1811 } 2321 1812 } else { 2322 - #ifdef DEBUG 2323 1813 kvm_debug 2324 1814 ("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n", 2325 1815 tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1); 2326 - #endif 2327 1816 /* OK we have a Guest TLB entry, now inject it into the shadow host TLB */ 2328 1817 kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL, 2329 1818 NULL);
+40 -37
arch/mips/kvm/kvm_tlb.c
··· 222 222 return -1; 223 223 } 224 224 225 - if (idx < 0) { 226 - idx = read_c0_random() % current_cpu_data.tlbsize; 227 - write_c0_index(idx); 228 - mtc0_tlbw_hazard(); 229 - } 230 225 write_c0_entrylo0(entrylo0); 231 226 write_c0_entrylo1(entrylo1); 232 227 mtc0_tlbw_hazard(); 233 228 234 - tlb_write_indexed(); 229 + if (idx < 0) 230 + tlb_write_random(); 231 + else 232 + tlb_write_indexed(); 235 233 tlbw_use_hazard(); 236 234 237 - #ifdef DEBUG 238 - if (debug) { 239 - kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] " 240 - "entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n", 241 - vcpu->arch.pc, idx, read_c0_entryhi(), 242 - read_c0_entrylo0(), read_c0_entrylo1()); 243 - } 244 - #endif 235 + kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n", 236 + vcpu->arch.pc, idx, read_c0_entryhi(), 237 + read_c0_entrylo0(), read_c0_entrylo1()); 245 238 246 239 /* Flush D-cache */ 247 240 if (flush_dcache_mask) { ··· 341 348 mtc0_tlbw_hazard(); 342 349 tlbw_use_hazard(); 343 350 344 - #ifdef DEBUG 345 351 kvm_debug ("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n", 346 352 vcpu->arch.pc, read_c0_index(), read_c0_entryhi(), 347 353 read_c0_entrylo0(), read_c0_entrylo1()); 348 - #endif 349 354 350 355 /* Restore old ASID */ 351 356 write_c0_entryhi(old_entryhi); ··· 391 400 entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | 392 401 (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V); 393 402 394 - #ifdef DEBUG 395 403 kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, 396 404 tlb->tlb_lo0, tlb->tlb_lo1); 397 - #endif 398 405 399 406 return kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, 400 407 tlb->tlb_mask); ··· 413 424 } 414 425 } 415 426 416 - #ifdef DEBUG 417 427 kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n", 418 428 __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1); 419 - #endif 420 429 421 430 return index; 422 431 } ··· 
448 461 449 462 local_irq_restore(flags); 450 463 451 - #ifdef DEBUG 452 464 kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx); 453 - #endif 454 465 455 466 return idx; 456 467 } ··· 493 508 494 509 local_irq_restore(flags); 495 510 496 - #ifdef DEBUG 497 - if (idx > 0) { 511 + if (idx > 0) 498 512 kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__, 499 - (va & VPN2_MASK) | (vcpu->arch.asid_map[va & ASID_MASK] & ASID_MASK), idx); 500 - } 501 - #endif 513 + (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx); 502 514 503 515 return 0; 504 516 } ··· 640 658 local_irq_restore(flags); 641 659 } 642 660 661 + /** 662 + * kvm_mips_migrate_count() - Migrate timer. 663 + * @vcpu: Virtual CPU. 664 + * 665 + * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it 666 + * if it was running prior to being cancelled. 667 + * 668 + * Must be called when the VCPU is migrated to a different CPU to ensure that 669 + * timer expiry during guest execution interrupts the guest and causes the 670 + * interrupt to be delivered in a timely manner. 
671 + */ 672 + static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) 673 + { 674 + if (hrtimer_cancel(&vcpu->arch.comparecount_timer)) 675 + hrtimer_restart(&vcpu->arch.comparecount_timer); 676 + } 677 + 643 678 /* Restore ASID once we are scheduled back after preemption */ 644 679 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 645 680 { 646 681 unsigned long flags; 647 682 int newasid = 0; 648 683 649 - #ifdef DEBUG 650 684 kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); 651 - #endif 652 685 653 686 /* Alocate new kernel and user ASIDs if needed */ 654 687 ··· 679 682 vcpu->arch.guest_user_mm.context.asid[cpu]; 680 683 newasid++; 681 684 682 - kvm_info("[%d]: cpu_context: %#lx\n", cpu, 683 - cpu_context(cpu, current->mm)); 684 - kvm_info("[%d]: Allocated new ASID for Guest Kernel: %#x\n", 685 - cpu, vcpu->arch.guest_kernel_asid[cpu]); 686 - kvm_info("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, 687 - vcpu->arch.guest_user_asid[cpu]); 685 + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, 686 + cpu_context(cpu, current->mm)); 687 + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", 688 + cpu, vcpu->arch.guest_kernel_asid[cpu]); 689 + kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, 690 + vcpu->arch.guest_user_asid[cpu]); 688 691 } 689 692 690 693 if (vcpu->arch.last_sched_cpu != cpu) { 691 - kvm_info("[%d->%d]KVM VCPU[%d] switch\n", 692 - vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); 694 + kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", 695 + vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); 696 + /* 697 + * Migrate the timer interrupt to the current CPU so that it 698 + * always interrupts the guest and synchronously triggers a 699 + * guest timer interrupt. 700 + */ 701 + kvm_mips_migrate_count(vcpu); 693 702 } 694 703 695 704 if (!newasid) {
+74 -12
arch/mips/kvm/kvm_trap_emul.c
··· 32 32 gpa = KVM_INVALID_ADDR; 33 33 } 34 34 35 - #ifdef DEBUG 36 35 kvm_debug("%s: gva %#lx, gpa: %#llx\n", __func__, gva, gpa); 37 - #endif 38 36 39 37 return gpa; 40 38 } ··· 83 85 84 86 if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 85 87 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { 86 - #ifdef DEBUG 87 88 kvm_debug 88 89 ("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n", 89 90 cause, opc, badvaddr); 90 - #endif 91 91 er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu); 92 92 93 93 if (er == EMULATE_DONE) ··· 134 138 } 135 139 } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 136 140 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { 137 - #ifdef DEBUG 138 141 kvm_debug 139 142 ("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n", 140 143 cause, opc, badvaddr); 141 - #endif 142 144 er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); 143 145 if (er == EMULATE_DONE) 144 146 ret = RESUME_GUEST; ··· 182 188 } 183 189 } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 184 190 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { 185 - #ifdef DEBUG 186 191 kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n", 187 192 vcpu->arch.pc, badvaddr); 188 - #endif 189 193 190 194 /* User Address (UA) fault, this could happen if 191 195 * (1) TLB entry not present/valid in both Guest and shadow host TLBs, in this ··· 228 236 229 237 if (KVM_GUEST_KERNEL_MODE(vcpu) 230 238 && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { 231 - #ifdef DEBUG 232 239 kvm_debug("Emulate Store to MMIO space\n"); 233 - #endif 234 240 er = kvm_mips_emulate_inst(cause, opc, run, vcpu); 235 241 if (er == EMULATE_FAIL) { 236 242 printk("Emulate Store to MMIO space failed\n"); ··· 258 268 int ret = RESUME_GUEST; 259 269 260 270 if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { 261 - #ifdef DEBUG 262 271 kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr); 263 - #endif 264 272 er = kvm_mips_emulate_inst(cause, 
opc, run, vcpu); 265 273 if (er == EMULATE_FAIL) { 266 274 printk("Emulate Load from MMIO space failed\n"); ··· 389 401 return 0; 390 402 } 391 403 404 + static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, 405 + const struct kvm_one_reg *reg, 406 + s64 *v) 407 + { 408 + switch (reg->id) { 409 + case KVM_REG_MIPS_CP0_COUNT: 410 + *v = kvm_mips_read_count(vcpu); 411 + break; 412 + case KVM_REG_MIPS_COUNT_CTL: 413 + *v = vcpu->arch.count_ctl; 414 + break; 415 + case KVM_REG_MIPS_COUNT_RESUME: 416 + *v = ktime_to_ns(vcpu->arch.count_resume); 417 + break; 418 + case KVM_REG_MIPS_COUNT_HZ: 419 + *v = vcpu->arch.count_hz; 420 + break; 421 + default: 422 + return -EINVAL; 423 + } 424 + return 0; 425 + } 426 + 427 + static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, 428 + const struct kvm_one_reg *reg, 429 + s64 v) 430 + { 431 + struct mips_coproc *cop0 = vcpu->arch.cop0; 432 + int ret = 0; 433 + 434 + switch (reg->id) { 435 + case KVM_REG_MIPS_CP0_COUNT: 436 + kvm_mips_write_count(vcpu, v); 437 + break; 438 + case KVM_REG_MIPS_CP0_COMPARE: 439 + kvm_mips_write_compare(vcpu, v); 440 + break; 441 + case KVM_REG_MIPS_CP0_CAUSE: 442 + /* 443 + * If the timer is stopped or started (DC bit) it must look 444 + * atomic with changes to the interrupt pending bits (TI, IRQ5). 445 + * A timer interrupt should not happen in between. 
446 + */ 447 + if ((kvm_read_c0_guest_cause(cop0) ^ v) & CAUSEF_DC) { 448 + if (v & CAUSEF_DC) { 449 + /* disable timer first */ 450 + kvm_mips_count_disable_cause(vcpu); 451 + kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v); 452 + } else { 453 + /* enable timer last */ 454 + kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v); 455 + kvm_mips_count_enable_cause(vcpu); 456 + } 457 + } else { 458 + kvm_write_c0_guest_cause(cop0, v); 459 + } 460 + break; 461 + case KVM_REG_MIPS_COUNT_CTL: 462 + ret = kvm_mips_set_count_ctl(vcpu, v); 463 + break; 464 + case KVM_REG_MIPS_COUNT_RESUME: 465 + ret = kvm_mips_set_count_resume(vcpu, v); 466 + break; 467 + case KVM_REG_MIPS_COUNT_HZ: 468 + ret = kvm_mips_set_count_hz(vcpu, v); 469 + break; 470 + default: 471 + return -EINVAL; 472 + } 473 + return ret; 474 + } 475 + 392 476 static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { 393 477 /* exit handlers */ 394 478 .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable, ··· 483 423 .dequeue_io_int = kvm_mips_dequeue_io_int_cb, 484 424 .irq_deliver = kvm_mips_irq_deliver_cb, 485 425 .irq_clear = kvm_mips_irq_clear_cb, 426 + .get_one_reg = kvm_trap_emul_get_one_reg, 427 + .set_one_reg = kvm_trap_emul_set_one_reg, 486 428 }; 487 429 488 430 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
+1
arch/mips/mm/cache.c
··· 31 31 void (*flush_icache_range)(unsigned long start, unsigned long end); 32 32 EXPORT_SYMBOL_GPL(flush_icache_range); 33 33 void (*local_flush_icache_range)(unsigned long start, unsigned long end); 34 + EXPORT_SYMBOL_GPL(local_flush_icache_range); 34 35 35 36 void (*__flush_cache_vmap)(void); 36 37 void (*__flush_cache_vunmap)(void);
+2 -12
arch/mips/mti-malta/malta-time.c
··· 74 74 unsigned int giccount = 0, gicstart = 0; 75 75 #endif 76 76 77 - #if defined (CONFIG_KVM_GUEST) && defined (CONFIG_KVM_HOST_FREQ) 78 - unsigned int prid = read_c0_prid() & (PRID_COMP_MASK | PRID_IMP_MASK); 79 - 80 - /* 81 - * XXXKYMA: hardwire the CPU frequency to Host Freq/4 82 - */ 83 - count = (CONFIG_KVM_HOST_FREQ * 1000000) >> 3; 84 - if ((prid != (PRID_COMP_MIPS | PRID_IMP_20KC)) && 85 - (prid != (PRID_COMP_MIPS | PRID_IMP_25KF))) 86 - count *= 2; 87 - 88 - mips_hpt_frequency = count; 77 + #if defined(CONFIG_KVM_GUEST) && CONFIG_KVM_GUEST_TIMER_FREQ 78 + mips_hpt_frequency = CONFIG_KVM_GUEST_TIMER_FREQ * 1000000; 89 79 return; 90 80 #endif 91 81
+34
arch/powerpc/include/asm/disassemble.h
··· 81 81 { 82 82 return (inst >> 11) & 0x7fff; 83 83 } 84 + 85 + #define IS_XFORM(inst) (get_op(inst) == 31) 86 + #define IS_DSFORM(inst) (get_op(inst) >= 56) 87 + 88 + /* 89 + * Create a DSISR value from the instruction 90 + */ 91 + static inline unsigned make_dsisr(unsigned instr) 92 + { 93 + unsigned dsisr; 94 + 95 + 96 + /* bits 6:15 --> 22:31 */ 97 + dsisr = (instr & 0x03ff0000) >> 16; 98 + 99 + if (IS_XFORM(instr)) { 100 + /* bits 29:30 --> 15:16 */ 101 + dsisr |= (instr & 0x00000006) << 14; 102 + /* bit 25 --> 17 */ 103 + dsisr |= (instr & 0x00000040) << 8; 104 + /* bits 21:24 --> 18:21 */ 105 + dsisr |= (instr & 0x00000780) << 3; 106 + } else { 107 + /* bit 5 --> 17 */ 108 + dsisr |= (instr & 0x04000000) >> 12; 109 + /* bits 1: 4 --> 18:21 */ 110 + dsisr |= (instr & 0x78000000) >> 17; 111 + /* bits 30:31 --> 12:13 */ 112 + if (IS_DSFORM(instr)) 113 + dsisr |= (instr & 0x00000003) << 18; 114 + } 115 + 116 + return dsisr; 117 + } 84 118 #endif /* __ASM_PPC_DISASSEMBLE_H__ */
+10 -8
arch/powerpc/include/asm/kvm_asm.h
··· 102 102 #define BOOK3S_INTERRUPT_PERFMON 0xf00 103 103 #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 104 104 #define BOOK3S_INTERRUPT_VSX 0xf40 105 + #define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60 105 106 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80 106 107 107 108 #define BOOK3S_IRQPRIO_SYSTEM_RESET 0 ··· 115 114 #define BOOK3S_IRQPRIO_FP_UNAVAIL 7 116 115 #define BOOK3S_IRQPRIO_ALTIVEC 8 117 116 #define BOOK3S_IRQPRIO_VSX 9 118 - #define BOOK3S_IRQPRIO_SYSCALL 10 119 - #define BOOK3S_IRQPRIO_MACHINE_CHECK 11 120 - #define BOOK3S_IRQPRIO_DEBUG 12 121 - #define BOOK3S_IRQPRIO_EXTERNAL 13 122 - #define BOOK3S_IRQPRIO_DECREMENTER 14 123 - #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15 124 - #define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 16 125 - #define BOOK3S_IRQPRIO_MAX 17 117 + #define BOOK3S_IRQPRIO_FAC_UNAVAIL 10 118 + #define BOOK3S_IRQPRIO_SYSCALL 11 119 + #define BOOK3S_IRQPRIO_MACHINE_CHECK 12 120 + #define BOOK3S_IRQPRIO_DEBUG 13 121 + #define BOOK3S_IRQPRIO_EXTERNAL 14 122 + #define BOOK3S_IRQPRIO_DECREMENTER 15 123 + #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16 124 + #define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 17 125 + #define BOOK3S_IRQPRIO_MAX 18 126 126 127 127 #define BOOK3S_HFLAG_DCBZ32 0x1 128 128 #define BOOK3S_HFLAG_SLB 0x2
+2 -1
arch/powerpc/include/asm/kvm_book3s.h
··· 268 268 return vcpu->arch.pc; 269 269 } 270 270 271 + static inline u64 kvmppc_get_msr(struct kvm_vcpu *vcpu); 271 272 static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu) 272 273 { 273 - return (vcpu->arch.shared->msr & MSR_LE) != (MSR_KERNEL & MSR_LE); 274 + return (kvmppc_get_msr(vcpu) & MSR_LE) != (MSR_KERNEL & MSR_LE); 274 275 } 275 276 276 277 static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc)
+123 -23
arch/powerpc/include/asm/kvm_book3s_64.h
··· 77 77 return old == 0; 78 78 } 79 79 80 + static inline int __hpte_actual_psize(unsigned int lp, int psize) 81 + { 82 + int i, shift; 83 + unsigned int mask; 84 + 85 + /* start from 1 ignoring MMU_PAGE_4K */ 86 + for (i = 1; i < MMU_PAGE_COUNT; i++) { 87 + 88 + /* invalid penc */ 89 + if (mmu_psize_defs[psize].penc[i] == -1) 90 + continue; 91 + /* 92 + * encoding bits per actual page size 93 + * PTE LP actual page size 94 + * rrrr rrrz >=8KB 95 + * rrrr rrzz >=16KB 96 + * rrrr rzzz >=32KB 97 + * rrrr zzzz >=64KB 98 + * ....... 99 + */ 100 + shift = mmu_psize_defs[i].shift - LP_SHIFT; 101 + if (shift > LP_BITS) 102 + shift = LP_BITS; 103 + mask = (1 << shift) - 1; 104 + if ((lp & mask) == mmu_psize_defs[psize].penc[i]) 105 + return i; 106 + } 107 + return -1; 108 + } 109 + 80 110 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, 81 111 unsigned long pte_index) 82 112 { 83 - unsigned long rb, va_low; 113 + int b_psize, a_psize; 114 + unsigned int penc; 115 + unsigned long rb = 0, va_low, sllp; 116 + unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1); 84 117 118 + if (!(v & HPTE_V_LARGE)) { 119 + /* both base and actual psize is 4k */ 120 + b_psize = MMU_PAGE_4K; 121 + a_psize = MMU_PAGE_4K; 122 + } else { 123 + for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) { 124 + 125 + /* valid entries have a shift value */ 126 + if (!mmu_psize_defs[b_psize].shift) 127 + continue; 128 + 129 + a_psize = __hpte_actual_psize(lp, b_psize); 130 + if (a_psize != -1) 131 + break; 132 + } 133 + } 134 + /* 135 + * Ignore the top 14 bits of va 136 + * v have top two bits covering segment size, hence move 137 + * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits. 138 + * AVA field in v also have the lower 23 bits ignored. 139 + * For base page size 4K we need 14 .. 
65 bits (so need to 140 + * collect extra 11 bits) 141 + * For others we need 14..14+i 142 + */ 143 + /* This covers 14..54 bits of va*/ 85 144 rb = (v & ~0x7fUL) << 16; /* AVA field */ 145 + /* 146 + * AVA in v had cleared lower 23 bits. We need to derive 147 + * that from pteg index 148 + */ 86 149 va_low = pte_index >> 3; 87 150 if (v & HPTE_V_SECONDARY) 88 151 va_low = ~va_low; 89 - /* xor vsid from AVA */ 152 + /* 153 + * get the vpn bits from va_low using reverse of hashing. 154 + * In v we have va with 23 bits dropped and then left shifted 155 + * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need 156 + * right shift it with (SID_SHIFT - (23 - 7)) 157 + */ 90 158 if (!(v & HPTE_V_1TB_SEG)) 91 - va_low ^= v >> 12; 159 + va_low ^= v >> (SID_SHIFT - 16); 92 160 else 93 - va_low ^= v >> 24; 161 + va_low ^= v >> (SID_SHIFT_1T - 16); 94 162 va_low &= 0x7ff; 95 - if (v & HPTE_V_LARGE) { 96 - rb |= 1; /* L field */ 97 - if (cpu_has_feature(CPU_FTR_ARCH_206) && 98 - (r & 0xff000)) { 99 - /* non-16MB large page, must be 64k */ 100 - /* (masks depend on page size) */ 101 - rb |= 0x1000; /* page encoding in LP field */ 102 - rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ 103 - rb |= ((va_low << 4) & 0xf0); /* AVAL field (P7 doesn't seem to care) */ 104 - } 105 - } else { 106 - /* 4kB page */ 107 - rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ 163 + 164 + switch (b_psize) { 165 + case MMU_PAGE_4K: 166 + sllp = ((mmu_psize_defs[a_psize].sllp & SLB_VSID_L) >> 6) | 167 + ((mmu_psize_defs[a_psize].sllp & SLB_VSID_LP) >> 4); 168 + rb |= sllp << 5; /* AP field */ 169 + rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */ 170 + break; 171 + default: 172 + { 173 + int aval_shift; 174 + /* 175 + * remaining 7bits of AVA/LP fields 176 + * Also contain the rr bits of LP 177 + */ 178 + rb |= (va_low & 0x7f) << 16; 179 + /* 180 + * Now clear not needed LP bits based on actual psize 181 + */ 182 + rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1); 
183 + /* 184 + * AVAL field 58..77 - base_page_shift bits of va 185 + * we have space for 58..64 bits, Missing bits should 186 + * be zero filled. +1 is to take care of L bit shift 187 + */ 188 + aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1; 189 + rb |= ((va_low << aval_shift) & 0xfe); 190 + 191 + rb |= 1; /* L field */ 192 + penc = mmu_psize_defs[b_psize].penc[a_psize]; 193 + rb |= penc << 12; /* LP field */ 194 + break; 195 + } 108 196 } 109 197 rb |= (v >> 54) & 0x300; /* B field */ 110 198 return rb; ··· 200 112 201 113 static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) 202 114 { 115 + int size, a_psize; 116 + /* Look at the 8 bit LP value */ 117 + unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1); 118 + 203 119 /* only handle 4k, 64k and 16M pages for now */ 204 120 if (!(h & HPTE_V_LARGE)) 205 - return 1ul << 12; /* 4k page */ 206 - if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206)) 207 - return 1ul << 16; /* 64k page */ 208 - if ((l & 0xff000) == 0) 209 - return 1ul << 24; /* 16M page */ 210 - return 0; /* error */ 121 + return 1ul << 12; 122 + else { 123 + for (size = 0; size < MMU_PAGE_COUNT; size++) { 124 + /* valid entries have a shift value */ 125 + if (!mmu_psize_defs[size].shift) 126 + continue; 127 + 128 + a_psize = __hpte_actual_psize(lp, size); 129 + if (a_psize != -1) 130 + return 1ul << mmu_psize_defs[a_psize].shift; 131 + } 132 + 133 + } 134 + return 0; 211 135 } 212 136 213 137 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
+2
arch/powerpc/include/asm/kvm_book3s_asm.h
··· 104 104 #ifdef CONFIG_PPC_BOOK3S_64 105 105 u64 cfar; 106 106 u64 ppr; 107 + u64 host_fscr; 107 108 #endif 108 109 }; 109 110 ··· 134 133 u64 esid; 135 134 u64 vsid; 136 135 } slb[64]; /* guest SLB */ 136 + u64 shadow_fscr; 137 137 #endif 138 138 }; 139 139
-5
arch/powerpc/include/asm/kvm_booke.h
··· 108 108 { 109 109 return vcpu->arch.fault_dear; 110 110 } 111 - 112 - static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu) 113 - { 114 - return vcpu->arch.shared->msr; 115 - } 116 111 #endif /* __ASM_KVM_BOOKE_H__ */
+8 -1
arch/powerpc/include/asm/kvm_host.h
··· 449 449 ulong pc; 450 450 ulong ctr; 451 451 ulong lr; 452 + #ifdef CONFIG_PPC_BOOK3S 452 453 ulong tar; 454 + #endif 453 455 454 456 ulong xer; 455 457 u32 cr; ··· 477 475 ulong ppr; 478 476 ulong pspb; 479 477 ulong fscr; 478 + ulong shadow_fscr; 480 479 ulong ebbhr; 481 480 ulong ebbrr; 482 481 ulong bescr; ··· 565 562 #ifdef CONFIG_PPC_BOOK3S 566 563 ulong fault_dar; 567 564 u32 fault_dsisr; 565 + unsigned long intr_msr; 568 566 #endif 569 567 570 568 #ifdef CONFIG_BOOKE ··· 626 622 wait_queue_head_t cpu_run; 627 623 628 624 struct kvm_vcpu_arch_shared *shared; 625 + #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) 626 + bool shared_big_endian; 627 + #endif 629 628 unsigned long magic_page_pa; /* phys addr to map the magic page to */ 630 629 unsigned long magic_page_ea; /* effect. addr to map the magic page to */ 630 + bool disable_kernel_nx; 631 631 632 632 int irq_type; /* one of KVM_IRQ_* */ 633 633 int irq_cpu_id; ··· 662 654 spinlock_t tbacct_lock; 663 655 u64 busy_stolen; 664 656 u64 busy_preempt; 665 - unsigned long intr_msr; 666 657 #endif 667 658 }; 668 659
+79 -1
arch/powerpc/include/asm/kvm_ppc.h
··· 449 449 } 450 450 451 451 /* 452 + * Shared struct helpers. The shared struct can be little or big endian, 453 + * depending on the guest endianness. So expose helpers to all of them. 454 + */ 455 + static inline bool kvmppc_shared_big_endian(struct kvm_vcpu *vcpu) 456 + { 457 + #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) 458 + /* Only Book3S_64 PR supports bi-endian for now */ 459 + return vcpu->arch.shared_big_endian; 460 + #elif defined(CONFIG_PPC_BOOK3S_64) && defined(__LITTLE_ENDIAN__) 461 + /* Book3s_64 HV on little endian is always little endian */ 462 + return false; 463 + #else 464 + return true; 465 + #endif 466 + } 467 + 468 + #define SHARED_WRAPPER_GET(reg, size) \ 469 + static inline u##size kvmppc_get_##reg(struct kvm_vcpu *vcpu) \ 470 + { \ 471 + if (kvmppc_shared_big_endian(vcpu)) \ 472 + return be##size##_to_cpu(vcpu->arch.shared->reg); \ 473 + else \ 474 + return le##size##_to_cpu(vcpu->arch.shared->reg); \ 475 + } \ 476 + 477 + #define SHARED_WRAPPER_SET(reg, size) \ 478 + static inline void kvmppc_set_##reg(struct kvm_vcpu *vcpu, u##size val) \ 479 + { \ 480 + if (kvmppc_shared_big_endian(vcpu)) \ 481 + vcpu->arch.shared->reg = cpu_to_be##size(val); \ 482 + else \ 483 + vcpu->arch.shared->reg = cpu_to_le##size(val); \ 484 + } \ 485 + 486 + #define SHARED_WRAPPER(reg, size) \ 487 + SHARED_WRAPPER_GET(reg, size) \ 488 + SHARED_WRAPPER_SET(reg, size) \ 489 + 490 + SHARED_WRAPPER(critical, 64) 491 + SHARED_WRAPPER(sprg0, 64) 492 + SHARED_WRAPPER(sprg1, 64) 493 + SHARED_WRAPPER(sprg2, 64) 494 + SHARED_WRAPPER(sprg3, 64) 495 + SHARED_WRAPPER(srr0, 64) 496 + SHARED_WRAPPER(srr1, 64) 497 + SHARED_WRAPPER(dar, 64) 498 + SHARED_WRAPPER_GET(msr, 64) 499 + static inline void kvmppc_set_msr_fast(struct kvm_vcpu *vcpu, u64 val) 500 + { 501 + if (kvmppc_shared_big_endian(vcpu)) 502 + vcpu->arch.shared->msr = cpu_to_be64(val); 503 + else 504 + vcpu->arch.shared->msr = cpu_to_le64(val); 505 + } 506 + SHARED_WRAPPER(dsisr, 32) 
507 + SHARED_WRAPPER(int_pending, 32) 508 + SHARED_WRAPPER(sprg4, 64) 509 + SHARED_WRAPPER(sprg5, 64) 510 + SHARED_WRAPPER(sprg6, 64) 511 + SHARED_WRAPPER(sprg7, 64) 512 + 513 + static inline u32 kvmppc_get_sr(struct kvm_vcpu *vcpu, int nr) 514 + { 515 + if (kvmppc_shared_big_endian(vcpu)) 516 + return be32_to_cpu(vcpu->arch.shared->sr[nr]); 517 + else 518 + return le32_to_cpu(vcpu->arch.shared->sr[nr]); 519 + } 520 + 521 + static inline void kvmppc_set_sr(struct kvm_vcpu *vcpu, int nr, u32 val) 522 + { 523 + if (kvmppc_shared_big_endian(vcpu)) 524 + vcpu->arch.shared->sr[nr] = cpu_to_be32(val); 525 + else 526 + vcpu->arch.shared->sr[nr] = cpu_to_le32(val); 527 + } 528 + 529 + /* 452 530 * Please call after prepare_to_enter. This function puts the lazy ee and irq 453 531 * disabled tracking state back to normal mode, without actually enabling 454 532 * interrupts. ··· 563 485 msr_64bit = MSR_SF; 564 486 #endif 565 487 566 - if (!(vcpu->arch.shared->msr & msr_64bit)) 488 + if (!(kvmppc_get_msr(vcpu) & msr_64bit)) 567 489 ea = (uint32_t)ea; 568 490 569 491 return ea;
+7 -5
arch/powerpc/include/asm/reg.h
··· 670 670 #define MMCR0_PROBLEM_DISABLE MMCR0_FCP 671 671 #define MMCR0_FCM1 0x10000000UL /* freeze counters while MSR mark = 1 */ 672 672 #define MMCR0_FCM0 0x08000000UL /* freeze counters while MSR mark = 0 */ 673 - #define MMCR0_PMXE 0x04000000UL /* performance monitor exception enable */ 674 - #define MMCR0_FCECE 0x02000000UL /* freeze ctrs on enabled cond or event */ 673 + #define MMCR0_PMXE ASM_CONST(0x04000000) /* perf mon exception enable */ 674 + #define MMCR0_FCECE ASM_CONST(0x02000000) /* freeze ctrs on enabled cond or event */ 675 675 #define MMCR0_TBEE 0x00400000UL /* time base exception enable */ 676 676 #define MMCR0_BHRBA 0x00200000UL /* BHRB Access allowed in userspace */ 677 677 #define MMCR0_EBE 0x00100000UL /* Event based branch enable */ 678 678 #define MMCR0_PMCC 0x000c0000UL /* PMC control */ 679 679 #define MMCR0_PMCC_U6 0x00080000UL /* PMC1-6 are R/W by user (PR) */ 680 680 #define MMCR0_PMC1CE 0x00008000UL /* PMC1 count enable*/ 681 - #define MMCR0_PMCjCE 0x00004000UL /* PMCj count enable*/ 681 + #define MMCR0_PMCjCE ASM_CONST(0x00004000) /* PMCj count enable*/ 682 682 #define MMCR0_TRIGGER 0x00002000UL /* TRIGGER enable */ 683 - #define MMCR0_PMAO_SYNC 0x00000800UL /* PMU interrupt is synchronous */ 684 - #define MMCR0_PMAO 0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */ 683 + #define MMCR0_PMAO_SYNC ASM_CONST(0x00000800) /* PMU intr is synchronous */ 684 + #define MMCR0_C56RUN ASM_CONST(0x00000100) /* PMC5/6 count when RUN=0 */ 685 + /* performance monitor alert has occurred, set to 0 after handling exception */ 686 + #define MMCR0_PMAO ASM_CONST(0x00000080) 685 687 #define MMCR0_SHRFC 0x00000040UL /* SHRre freeze conditions between threads */ 686 688 #define MMCR0_FC56 0x00000010UL /* freeze counters 5 and 6 */ 687 689 #define MMCR0_FCTI 0x00000008UL /* freeze counters in tags inactive mode */
+1
arch/powerpc/include/asm/reg_booke.h
··· 583 583 584 584 /* Bit definitions for L1CSR0. */ 585 585 #define L1CSR0_CPE 0x00010000 /* Data Cache Parity Enable */ 586 + #define L1CSR0_CUL 0x00000400 /* Data Cache Unable to Lock */ 586 587 #define L1CSR0_CLFC 0x00000100 /* Cache Lock Bits Flash Clear */ 587 588 #define L1CSR0_DCFI 0x00000002 /* Data Cache Flash Invalidate */ 588 589 #define L1CSR0_CFI 0x00000002 /* Cache Flash Invalidate */
+1 -1
arch/powerpc/include/uapi/asm/kvm.h
··· 545 545 #define KVM_REG_PPC_TCSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1) 546 546 #define KVM_REG_PPC_PID (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2) 547 547 #define KVM_REG_PPC_ACOP (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3) 548 - #define KVM_REG_PPC_WORT (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb4) 549 548 550 549 #define KVM_REG_PPC_VRSAVE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4) 551 550 #define KVM_REG_PPC_LPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5) ··· 554 555 #define KVM_REG_PPC_ARCH_COMPAT (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7) 555 556 556 557 #define KVM_REG_PPC_DABRX (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb8) 558 + #define KVM_REG_PPC_WORT (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb9) 557 559 558 560 /* Transactional Memory checkpointed state: 559 561 * This is all GPRs, all VSX regs and a subset of SPRs
+6
arch/powerpc/include/uapi/asm/kvm_para.h
··· 82 82 83 83 #define KVM_FEATURE_MAGIC_PAGE 1 84 84 85 + /* Magic page flags from host to guest */ 86 + 85 87 #define KVM_MAGIC_FEAT_SR (1 << 0) 86 88 87 89 /* MASn, ESR, PIR, and high SPRGs */ 88 90 #define KVM_MAGIC_FEAT_MAS0_TO_SPRG7 (1 << 1) 91 + 92 + /* Magic page flags from guest to host */ 93 + 94 + #define MAGIC_PAGE_FLAG_NOT_MAPPED_NX (1 << 0) 89 95 90 96 91 97 #endif /* _UAPI__POWERPC_KVM_PARA_H__ */
+1 -33
arch/powerpc/kernel/align.c
··· 25 25 #include <asm/cputable.h> 26 26 #include <asm/emulated_ops.h> 27 27 #include <asm/switch_to.h> 28 + #include <asm/disassemble.h> 28 29 29 30 struct aligninfo { 30 31 unsigned char len; 31 32 unsigned char flags; 32 33 }; 33 34 34 - #define IS_XFORM(inst) (((inst) >> 26) == 31) 35 - #define IS_DSFORM(inst) (((inst) >> 26) >= 56) 36 35 37 36 #define INVALID { 0, 0 } 38 37 ··· 189 190 INVALID, /* 11 1 1110 */ 190 191 INVALID, /* 11 1 1111 */ 191 192 }; 192 - 193 - /* 194 - * Create a DSISR value from the instruction 195 - */ 196 - static inline unsigned make_dsisr(unsigned instr) 197 - { 198 - unsigned dsisr; 199 - 200 - 201 - /* bits 6:15 --> 22:31 */ 202 - dsisr = (instr & 0x03ff0000) >> 16; 203 - 204 - if (IS_XFORM(instr)) { 205 - /* bits 29:30 --> 15:16 */ 206 - dsisr |= (instr & 0x00000006) << 14; 207 - /* bit 25 --> 17 */ 208 - dsisr |= (instr & 0x00000040) << 8; 209 - /* bits 21:24 --> 18:21 */ 210 - dsisr |= (instr & 0x00000780) << 3; 211 - } else { 212 - /* bit 5 --> 17 */ 213 - dsisr |= (instr & 0x04000000) >> 12; 214 - /* bits 1: 4 --> 18:21 */ 215 - dsisr |= (instr & 0x78000000) >> 17; 216 - /* bits 30:31 --> 12:13 */ 217 - if (IS_DSFORM(instr)) 218 - dsisr |= (instr & 0x00000003) << 18; 219 - } 220 - 221 - return dsisr; 222 - } 223 193 224 194 /* 225 195 * The dcbz (data cache block zero) instruction
+10 -1
arch/powerpc/kernel/asm-offsets.c
··· 54 54 #endif 55 55 #if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S) 56 56 #include <asm/kvm_book3s.h> 57 + #include <asm/kvm_ppc.h> 57 58 #endif 58 59 59 60 #ifdef CONFIG_PPC32 ··· 446 445 DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); 447 446 DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); 448 447 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); 448 + #ifdef CONFIG_PPC_BOOK3S 449 449 DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar)); 450 + #endif 450 451 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); 451 452 DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); 452 453 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE ··· 470 467 DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); 471 468 DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); 472 469 DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); 470 + #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) 471 + DEFINE(VCPU_SHAREDBE, offsetof(struct kvm_vcpu, arch.shared_big_endian)); 472 + #endif 473 473 474 474 DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0)); 475 475 DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1)); ··· 499 493 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); 500 494 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); 501 495 DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty)); 502 - DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); 503 496 #endif 504 497 #ifdef CONFIG_PPC_BOOK3S 505 498 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); ··· 533 528 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); 534 529 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); 535 530 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); 531 + DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); 536 532 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); 
537 533 DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); 538 534 DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar)); 539 535 DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr)); 540 536 DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr)); 537 + DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr)); 541 538 DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb)); 542 539 DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr)); 543 540 DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr)); ··· 621 614 #ifdef CONFIG_PPC64 622 615 SVCPU_FIELD(SVCPU_SLB, slb); 623 616 SVCPU_FIELD(SVCPU_SLB_MAX, slb_max); 617 + SVCPU_FIELD(SVCPU_SHADOW_FSCR, shadow_fscr); 624 618 #endif 625 619 626 620 HSTATE_FIELD(HSTATE_HOST_R1, host_r1); ··· 657 649 #ifdef CONFIG_PPC_BOOK3S_64 658 650 HSTATE_FIELD(HSTATE_CFAR, cfar); 659 651 HSTATE_FIELD(HSTATE_PPR, ppr); 652 + HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr); 660 653 #endif /* CONFIG_PPC_BOOK3S_64 */ 661 654 662 655 #else /* CONFIG_PPC_BOOK3S */
+3 -2
arch/powerpc/kernel/epapr_paravirt.c
··· 47 47 return -1; 48 48 49 49 for (i = 0; i < (len / 4); i++) { 50 - patch_instruction(epapr_hypercall_start + i, insts[i]); 50 + u32 inst = be32_to_cpu(insts[i]); 51 + patch_instruction(epapr_hypercall_start + i, inst); 51 52 #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) 52 - patch_instruction(epapr_ev_idle_start + i, insts[i]); 53 + patch_instruction(epapr_ev_idle_start + i, inst); 53 54 #endif 54 55 } 55 56
+1 -1
arch/powerpc/kernel/kvm.c
··· 417 417 ulong out[8]; 418 418 419 419 in[0] = KVM_MAGIC_PAGE; 420 - in[1] = KVM_MAGIC_PAGE; 420 + in[1] = KVM_MAGIC_PAGE | MAGIC_PAGE_FLAG_NOT_MAPPED_NX; 421 421 422 422 epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE)); 423 423
+3
arch/powerpc/kernel/paca.c
··· 98 98 /* 99 99 * 3 persistent SLBs are registered here. The buffer will be zero 100 100 * initially, hence will all be invaild until we actually write them. 101 + * 102 + * If you make the number of persistent SLB entries dynamic, please also 103 + * update PR KVM to flush and restore them accordingly. 101 104 */ 102 105 static struct slb_shadow *slb_shadow; 103 106
+1 -1
arch/powerpc/kvm/Kconfig
··· 6 6 7 7 menuconfig VIRTUALIZATION 8 8 bool "Virtualization" 9 - depends on !CPU_LITTLE_ENDIAN 10 9 ---help--- 11 10 Say Y here to get to see options for using your Linux host to run 12 11 other operating systems inside virtual machines (guests). ··· 75 76 config KVM_BOOK3S_64_HV 76 77 tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" 77 78 depends on KVM_BOOK3S_64 79 + depends on !CPU_LITTLE_ENDIAN 78 80 select KVM_BOOK3S_HV_POSSIBLE 79 81 select MMU_NOTIFIER 80 82 select CMA
+70 -36
arch/powerpc/kvm/book3s.c
··· 85 85 if (is_kvmppc_hv_enabled(vcpu->kvm)) 86 86 return; 87 87 if (pending_now) 88 - vcpu->arch.shared->int_pending = 1; 88 + kvmppc_set_int_pending(vcpu, 1); 89 89 else if (old_pending) 90 - vcpu->arch.shared->int_pending = 0; 90 + kvmppc_set_int_pending(vcpu, 0); 91 91 } 92 92 93 93 static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) ··· 99 99 if (is_kvmppc_hv_enabled(vcpu->kvm)) 100 100 return false; 101 101 102 - crit_raw = vcpu->arch.shared->critical; 102 + crit_raw = kvmppc_get_critical(vcpu); 103 103 crit_r1 = kvmppc_get_gpr(vcpu, 1); 104 104 105 105 /* Truncate crit indicators in 32 bit mode */ 106 - if (!(vcpu->arch.shared->msr & MSR_SF)) { 106 + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { 107 107 crit_raw &= 0xffffffff; 108 108 crit_r1 &= 0xffffffff; 109 109 } ··· 111 111 /* Critical section when crit == r1 */ 112 112 crit = (crit_raw == crit_r1); 113 113 /* ... and we're in supervisor mode */ 114 - crit = crit && !(vcpu->arch.shared->msr & MSR_PR); 114 + crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR); 115 115 116 116 return crit; 117 117 } 118 118 119 119 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) 120 120 { 121 - vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); 122 - vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; 121 + kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); 122 + kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags); 123 123 kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); 124 124 vcpu->arch.mmu.reset_msr(vcpu); 125 125 } ··· 145 145 case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG; break; 146 146 case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC; break; 147 147 case 0xf40: prio = BOOK3S_IRQPRIO_VSX; break; 148 + case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL; break; 148 149 default: prio = BOOK3S_IRQPRIO_MAX; break; 149 150 } 150 151 ··· 226 225 227 226 switch (priority) { 228 227 case BOOK3S_IRQPRIO_DECREMENTER: 229 - deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; 228 + deliver = 
(kvmppc_get_msr(vcpu) & MSR_EE) && !crit; 230 229 vec = BOOK3S_INTERRUPT_DECREMENTER; 231 230 break; 232 231 case BOOK3S_IRQPRIO_EXTERNAL: 233 232 case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: 234 - deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; 233 + deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; 235 234 vec = BOOK3S_INTERRUPT_EXTERNAL; 236 235 break; 237 236 case BOOK3S_IRQPRIO_SYSTEM_RESET: ··· 275 274 break; 276 275 case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR: 277 276 vec = BOOK3S_INTERRUPT_PERFMON; 277 + break; 278 + case BOOK3S_IRQPRIO_FAC_UNAVAIL: 279 + vec = BOOK3S_INTERRUPT_FAC_UNAVAIL; 278 280 break; 279 281 default: 280 282 deliver = 0; ··· 347 343 { 348 344 ulong mp_pa = vcpu->arch.magic_page_pa; 349 345 350 - if (!(vcpu->arch.shared->msr & MSR_SF)) 346 + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) 351 347 mp_pa = (uint32_t)mp_pa; 352 348 353 349 /* Magic page override */ ··· 371 367 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, 372 368 bool iswrite, struct kvmppc_pte *pte) 373 369 { 374 - int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR)); 370 + int relocated = (kvmppc_get_msr(vcpu) & (data ? 
MSR_DR : MSR_IR)); 375 371 int r; 376 372 377 373 if (relocated) { ··· 502 498 regs->ctr = kvmppc_get_ctr(vcpu); 503 499 regs->lr = kvmppc_get_lr(vcpu); 504 500 regs->xer = kvmppc_get_xer(vcpu); 505 - regs->msr = vcpu->arch.shared->msr; 506 - regs->srr0 = vcpu->arch.shared->srr0; 507 - regs->srr1 = vcpu->arch.shared->srr1; 501 + regs->msr = kvmppc_get_msr(vcpu); 502 + regs->srr0 = kvmppc_get_srr0(vcpu); 503 + regs->srr1 = kvmppc_get_srr1(vcpu); 508 504 regs->pid = vcpu->arch.pid; 509 - regs->sprg0 = vcpu->arch.shared->sprg0; 510 - regs->sprg1 = vcpu->arch.shared->sprg1; 511 - regs->sprg2 = vcpu->arch.shared->sprg2; 512 - regs->sprg3 = vcpu->arch.shared->sprg3; 513 - regs->sprg4 = vcpu->arch.shared->sprg4; 514 - regs->sprg5 = vcpu->arch.shared->sprg5; 515 - regs->sprg6 = vcpu->arch.shared->sprg6; 516 - regs->sprg7 = vcpu->arch.shared->sprg7; 505 + regs->sprg0 = kvmppc_get_sprg0(vcpu); 506 + regs->sprg1 = kvmppc_get_sprg1(vcpu); 507 + regs->sprg2 = kvmppc_get_sprg2(vcpu); 508 + regs->sprg3 = kvmppc_get_sprg3(vcpu); 509 + regs->sprg4 = kvmppc_get_sprg4(vcpu); 510 + regs->sprg5 = kvmppc_get_sprg5(vcpu); 511 + regs->sprg6 = kvmppc_get_sprg6(vcpu); 512 + regs->sprg7 = kvmppc_get_sprg7(vcpu); 517 513 518 514 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 519 515 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); ··· 531 527 kvmppc_set_lr(vcpu, regs->lr); 532 528 kvmppc_set_xer(vcpu, regs->xer); 533 529 kvmppc_set_msr(vcpu, regs->msr); 534 - vcpu->arch.shared->srr0 = regs->srr0; 535 - vcpu->arch.shared->srr1 = regs->srr1; 536 - vcpu->arch.shared->sprg0 = regs->sprg0; 537 - vcpu->arch.shared->sprg1 = regs->sprg1; 538 - vcpu->arch.shared->sprg2 = regs->sprg2; 539 - vcpu->arch.shared->sprg3 = regs->sprg3; 540 - vcpu->arch.shared->sprg4 = regs->sprg4; 541 - vcpu->arch.shared->sprg5 = regs->sprg5; 542 - vcpu->arch.shared->sprg6 = regs->sprg6; 543 - vcpu->arch.shared->sprg7 = regs->sprg7; 530 + kvmppc_set_srr0(vcpu, regs->srr0); 531 + kvmppc_set_srr1(vcpu, regs->srr1); 532 + 
kvmppc_set_sprg0(vcpu, regs->sprg0); 533 + kvmppc_set_sprg1(vcpu, regs->sprg1); 534 + kvmppc_set_sprg2(vcpu, regs->sprg2); 535 + kvmppc_set_sprg3(vcpu, regs->sprg3); 536 + kvmppc_set_sprg4(vcpu, regs->sprg4); 537 + kvmppc_set_sprg5(vcpu, regs->sprg5); 538 + kvmppc_set_sprg6(vcpu, regs->sprg6); 539 + kvmppc_set_sprg7(vcpu, regs->sprg7); 544 540 545 541 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 546 542 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); ··· 574 570 r = 0; 575 571 switch (reg->id) { 576 572 case KVM_REG_PPC_DAR: 577 - val = get_reg_val(reg->id, vcpu->arch.shared->dar); 573 + val = get_reg_val(reg->id, kvmppc_get_dar(vcpu)); 578 574 break; 579 575 case KVM_REG_PPC_DSISR: 580 - val = get_reg_val(reg->id, vcpu->arch.shared->dsisr); 576 + val = get_reg_val(reg->id, kvmppc_get_dsisr(vcpu)); 581 577 break; 582 578 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: 583 579 i = reg->id - KVM_REG_PPC_FPR0; ··· 631 627 val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu)); 632 628 break; 633 629 #endif /* CONFIG_KVM_XICS */ 630 + case KVM_REG_PPC_FSCR: 631 + val = get_reg_val(reg->id, vcpu->arch.fscr); 632 + break; 633 + case KVM_REG_PPC_TAR: 634 + val = get_reg_val(reg->id, vcpu->arch.tar); 635 + break; 636 + case KVM_REG_PPC_EBBHR: 637 + val = get_reg_val(reg->id, vcpu->arch.ebbhr); 638 + break; 639 + case KVM_REG_PPC_EBBRR: 640 + val = get_reg_val(reg->id, vcpu->arch.ebbrr); 641 + break; 642 + case KVM_REG_PPC_BESCR: 643 + val = get_reg_val(reg->id, vcpu->arch.bescr); 644 + break; 634 645 default: 635 646 r = -EINVAL; 636 647 break; ··· 679 660 r = 0; 680 661 switch (reg->id) { 681 662 case KVM_REG_PPC_DAR: 682 - vcpu->arch.shared->dar = set_reg_val(reg->id, val); 663 + kvmppc_set_dar(vcpu, set_reg_val(reg->id, val)); 683 664 break; 684 665 case KVM_REG_PPC_DSISR: 685 - vcpu->arch.shared->dsisr = set_reg_val(reg->id, val); 666 + kvmppc_set_dsisr(vcpu, set_reg_val(reg->id, val)); 686 667 break; 687 668 case KVM_REG_PPC_FPR0 ... 
KVM_REG_PPC_FPR31: 688 669 i = reg->id - KVM_REG_PPC_FPR0; ··· 735 716 set_reg_val(reg->id, val)); 736 717 break; 737 718 #endif /* CONFIG_KVM_XICS */ 719 + case KVM_REG_PPC_FSCR: 720 + vcpu->arch.fscr = set_reg_val(reg->id, val); 721 + break; 722 + case KVM_REG_PPC_TAR: 723 + vcpu->arch.tar = set_reg_val(reg->id, val); 724 + break; 725 + case KVM_REG_PPC_EBBHR: 726 + vcpu->arch.ebbhr = set_reg_val(reg->id, val); 727 + break; 728 + case KVM_REG_PPC_EBBRR: 729 + vcpu->arch.ebbrr = set_reg_val(reg->id, val); 730 + break; 731 + case KVM_REG_PPC_BESCR: 732 + vcpu->arch.bescr = set_reg_val(reg->id, val); 733 + break; 738 734 default: 739 735 r = -EINVAL; 740 736 break;
+23 -18
arch/powerpc/kvm/book3s_32_mmu.c
··· 91 91 92 92 static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr) 93 93 { 94 - return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf]; 94 + return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf); 95 95 } 96 96 97 97 static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, ··· 131 131 pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash; 132 132 133 133 dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", 134 - kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg, 134 + kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg, 135 135 sr_vsid(sre)); 136 136 137 137 r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); ··· 160 160 else 161 161 bat = &vcpu_book3s->ibat[i]; 162 162 163 - if (vcpu->arch.shared->msr & MSR_PR) { 163 + if (kvmppc_get_msr(vcpu) & MSR_PR) { 164 164 if (!bat->vp) 165 165 continue; 166 166 } else { ··· 208 208 u32 sre; 209 209 hva_t ptegp; 210 210 u32 pteg[16]; 211 + u32 pte0, pte1; 211 212 u32 ptem = 0; 212 213 int i; 213 214 int found = 0; ··· 234 233 } 235 234 236 235 for (i=0; i<16; i+=2) { 237 - if (ptem == pteg[i]) { 236 + pte0 = be32_to_cpu(pteg[i]); 237 + pte1 = be32_to_cpu(pteg[i + 1]); 238 + if (ptem == pte0) { 238 239 u8 pp; 239 240 240 - pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF); 241 - pp = pteg[i+1] & 3; 241 + pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF); 242 + pp = pte1 & 3; 242 243 243 - if ((sr_kp(sre) && (vcpu->arch.shared->msr & MSR_PR)) || 244 - (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR))) 244 + if ((sr_kp(sre) && (kvmppc_get_msr(vcpu) & MSR_PR)) || 245 + (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR))) 245 246 pp |= 4; 246 247 247 248 pte->may_write = false; ··· 263 260 } 264 261 265 262 dprintk_pte("MMU: Found PTE -> %x %x - %x\n", 266 - pteg[i], pteg[i+1], pp); 263 + pte0, pte1, pp); 267 264 found = 1; 268 265 break; 269 266 } ··· 272 269 /* Update PTE C and A bits, so the guest's swapper knows we used the 273 270 page */ 274 271 if (found) { 275 - u32 pte_r = 
pteg[i+1]; 276 - char __user *addr = (char __user *) &pteg[i+1]; 272 + u32 pte_r = pte1; 273 + char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32)); 277 274 278 275 /* 279 276 * Use single-byte writes to update the HPTE, to ··· 299 296 to_book3s(vcpu)->sdr1, ptegp); 300 297 for (i=0; i<16; i+=2) { 301 298 dprintk_pte(" %02d: 0x%x - 0x%x (0x%x)\n", 302 - i, pteg[i], pteg[i+1], ptem); 299 + i, be32_to_cpu(pteg[i]), 300 + be32_to_cpu(pteg[i+1]), ptem); 303 301 } 304 302 } 305 303 ··· 320 316 /* Magic page override */ 321 317 if (unlikely(mp_ea) && 322 318 unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && 323 - !(vcpu->arch.shared->msr & MSR_PR)) { 319 + !(kvmppc_get_msr(vcpu) & MSR_PR)) { 324 320 pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); 325 321 pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff); 326 322 pte->raddr &= KVM_PAM; ··· 345 341 346 342 static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum) 347 343 { 348 - return vcpu->arch.shared->sr[srnum]; 344 + return kvmppc_get_sr(vcpu, srnum); 349 345 } 350 346 351 347 static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, 352 348 ulong value) 353 349 { 354 - vcpu->arch.shared->sr[srnum] = value; 350 + kvmppc_set_sr(vcpu, srnum, value); 355 351 kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT); 356 352 } 357 353 ··· 371 367 ulong ea = esid << SID_SHIFT; 372 368 u32 sr; 373 369 u64 gvsid = esid; 370 + u64 msr = kvmppc_get_msr(vcpu); 374 371 375 - if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { 372 + if (msr & (MSR_DR|MSR_IR)) { 376 373 sr = find_sr(vcpu, ea); 377 374 if (sr_valid(sr)) 378 375 gvsid = sr_vsid(sr); ··· 382 377 /* In case we only have one of MSR_IR or MSR_DR set, let's put 383 378 that in the real-mode context (and hope RM doesn't access 384 379 high memory) */ 385 - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { 380 + switch (msr & (MSR_DR|MSR_IR)) { 386 381 case 0: 387 382 *vsid = VSID_REAL | esid; 388 383 break; 
··· 402 397 BUG(); 403 398 } 404 399 405 - if (vcpu->arch.shared->msr & MSR_PR) 400 + if (msr & MSR_PR) 406 401 *vsid |= VSID_PR; 407 402 408 403 return 0;
+2 -2
arch/powerpc/kvm/book3s_32_mmu_host.c
··· 92 92 struct kvmppc_sid_map *map; 93 93 u16 sid_map_mask; 94 94 95 - if (vcpu->arch.shared->msr & MSR_PR) 95 + if (kvmppc_get_msr(vcpu) & MSR_PR) 96 96 gvsid |= VSID_PR; 97 97 98 98 sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); ··· 279 279 u16 sid_map_mask; 280 280 static int backwards_map = 0; 281 281 282 - if (vcpu->arch.shared->msr & MSR_PR) 282 + if (kvmppc_get_msr(vcpu) & MSR_PR) 283 283 gvsid |= VSID_PR; 284 284 285 285 /* We might get collisions that trap in preceding order, so let's
+23 -16
arch/powerpc/kvm/book3s_64_mmu.c
··· 38 38 39 39 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) 40 40 { 41 - kvmppc_set_msr(vcpu, MSR_SF); 41 + kvmppc_set_msr(vcpu, vcpu->arch.intr_msr); 42 42 } 43 43 44 44 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( ··· 226 226 /* Magic page override */ 227 227 if (unlikely(mp_ea) && 228 228 unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && 229 - !(vcpu->arch.shared->msr & MSR_PR)) { 229 + !(kvmppc_get_msr(vcpu) & MSR_PR)) { 230 230 gpte->eaddr = eaddr; 231 231 gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); 232 232 gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff); ··· 269 269 goto no_page_found; 270 270 } 271 271 272 - if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp) 272 + if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp) 273 273 key = 4; 274 - else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks) 274 + else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks) 275 275 key = 4; 276 276 277 277 for (i=0; i<16; i+=2) { 278 + u64 pte0 = be64_to_cpu(pteg[i]); 279 + u64 pte1 = be64_to_cpu(pteg[i + 1]); 280 + 278 281 /* Check all relevant fields of 1st dword */ 279 - if ((pteg[i] & v_mask) == v_val) { 282 + if ((pte0 & v_mask) == v_val) { 280 283 /* If large page bit is set, check pgsize encoding */ 281 284 if (slbe->large && 282 285 (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { 283 - pgsize = decode_pagesize(slbe, pteg[i+1]); 286 + pgsize = decode_pagesize(slbe, pte1); 284 287 if (pgsize < 0) 285 288 continue; 286 289 } ··· 300 297 goto do_second; 301 298 } 302 299 303 - v = pteg[i]; 304 - r = pteg[i+1]; 300 + v = be64_to_cpu(pteg[i]); 301 + r = be64_to_cpu(pteg[i+1]); 305 302 pp = (r & HPTE_R_PP) | key; 306 303 if (r & HPTE_R_PP0) 307 304 pp |= 8; ··· 313 310 gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); 314 311 gpte->page_size = pgsize; 315 312 gpte->may_execute = ((r & HPTE_R_N) ? 
false : true); 313 + if (unlikely(vcpu->arch.disable_kernel_nx) && 314 + !(kvmppc_get_msr(vcpu) & MSR_PR)) 315 + gpte->may_execute = true; 316 316 gpte->may_read = false; 317 317 gpte->may_write = false; 318 318 ··· 348 342 * non-PAPR platforms such as mac99, and this is 349 343 * what real hardware does. 350 344 */ 351 - char __user *addr = (char __user *) &pteg[i+1]; 345 + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); 352 346 r |= HPTE_R_R; 353 347 put_user(r >> 8, addr + 6); 354 348 } 355 349 if (iswrite && gpte->may_write && !(r & HPTE_R_C)) { 356 350 /* Set the dirty flag */ 357 351 /* Use a single byte write */ 358 - char __user *addr = (char __user *) &pteg[i+1]; 352 + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); 359 353 r |= HPTE_R_C; 360 354 put_user(r, addr + 7); 361 355 } ··· 485 479 vcpu->arch.slb[i].origv = 0; 486 480 } 487 481 488 - if (vcpu->arch.shared->msr & MSR_IR) { 482 + if (kvmppc_get_msr(vcpu) & MSR_IR) { 489 483 kvmppc_mmu_flush_segments(vcpu); 490 484 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); 491 485 } ··· 569 563 { 570 564 ulong mp_ea = vcpu->arch.magic_page_ea; 571 565 572 - return mp_ea && !(vcpu->arch.shared->msr & MSR_PR) && 566 + return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) && 573 567 (mp_ea >> SID_SHIFT) == esid; 574 568 } 575 569 #endif ··· 582 576 u64 gvsid = esid; 583 577 ulong mp_ea = vcpu->arch.magic_page_ea; 584 578 int pagesize = MMU_PAGE_64K; 579 + u64 msr = kvmppc_get_msr(vcpu); 585 580 586 - if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { 581 + if (msr & (MSR_DR|MSR_IR)) { 587 582 slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); 588 583 if (slb) { 589 584 gvsid = slb->vsid; ··· 597 590 } 598 591 } 599 592 600 - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { 593 + switch (msr & (MSR_DR|MSR_IR)) { 601 594 case 0: 602 595 gvsid = VSID_REAL | esid; 603 596 break; ··· 630 623 gvsid |= VSID_64K; 631 624 #endif 632 625 633 - if (vcpu->arch.shared->msr & MSR_PR) 626 + 
if (kvmppc_get_msr(vcpu) & MSR_PR) 634 627 gvsid |= VSID_PR; 635 628 636 629 *vsid = gvsid; ··· 640 633 /* Catch magic page case */ 641 634 if (unlikely(mp_ea) && 642 635 unlikely(esid == (mp_ea >> SID_SHIFT)) && 643 - !(vcpu->arch.shared->msr & MSR_PR)) { 636 + !(kvmppc_get_msr(vcpu) & MSR_PR)) { 644 637 *vsid = VSID_REAL | esid; 645 638 return 0; 646 639 }
+6 -9
arch/powerpc/kvm/book3s_64_mmu_host.c
··· 58 58 struct kvmppc_sid_map *map; 59 59 u16 sid_map_mask; 60 60 61 - if (vcpu->arch.shared->msr & MSR_PR) 61 + if (kvmppc_get_msr(vcpu) & MSR_PR) 62 62 gvsid |= VSID_PR; 63 63 64 64 sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); ··· 230 230 u16 sid_map_mask; 231 231 static int backwards_map = 0; 232 232 233 - if (vcpu->arch.shared->msr & MSR_PR) 233 + if (kvmppc_get_msr(vcpu) & MSR_PR) 234 234 gvsid |= VSID_PR; 235 235 236 236 /* We might get collisions that trap in preceding order, so let's ··· 271 271 int found_inval = -1; 272 272 int r; 273 273 274 - if (!svcpu->slb_max) 275 - svcpu->slb_max = 1; 276 - 277 274 /* Are we overwriting? */ 278 - for (i = 1; i < svcpu->slb_max; i++) { 275 + for (i = 0; i < svcpu->slb_max; i++) { 279 276 if (!(svcpu->slb[i].esid & SLB_ESID_V)) 280 277 found_inval = i; 281 278 else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { ··· 282 285 } 283 286 284 287 /* Found a spare entry that was invalidated before */ 285 - if (found_inval > 0) { 288 + if (found_inval >= 0) { 286 289 r = found_inval; 287 290 goto out; 288 291 } ··· 356 359 ulong seg_mask = -seg_size; 357 360 int i; 358 361 359 - for (i = 1; i < svcpu->slb_max; i++) { 362 + for (i = 0; i < svcpu->slb_max; i++) { 360 363 if ((svcpu->slb[i].esid & SLB_ESID_V) && 361 364 (svcpu->slb[i].esid & seg_mask) == ea) { 362 365 /* Invalidate this entry */ ··· 370 373 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) 371 374 { 372 375 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); 373 - svcpu->slb_max = 1; 376 + svcpu->slb_max = 0; 374 377 svcpu->slb[0].esid = 0; 375 378 svcpu_put(svcpu); 376 379 }
+79 -37
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 52 52 53 53 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) 54 54 { 55 - unsigned long hpt; 55 + unsigned long hpt = 0; 56 56 struct revmap_entry *rev; 57 57 struct page *page = NULL; 58 58 long order = KVM_DEFAULT_HPT_ORDER; ··· 64 64 } 65 65 66 66 kvm->arch.hpt_cma_alloc = 0; 67 - /* 68 - * try first to allocate it from the kernel page allocator. 69 - * We keep the CMA reserved for failed allocation. 70 - */ 71 - hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT | 72 - __GFP_NOWARN, order - PAGE_SHIFT); 73 - 74 - /* Next try to allocate from the preallocated pool */ 75 - if (!hpt) { 76 - VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); 77 - page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); 78 - if (page) { 79 - hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 80 - kvm->arch.hpt_cma_alloc = 1; 81 - } else 82 - --order; 67 + VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); 68 + page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); 69 + if (page) { 70 + hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 71 + kvm->arch.hpt_cma_alloc = 1; 83 72 } 84 73 85 74 /* Lastly try successively smaller sizes from the page allocator */ ··· 585 596 struct kvm *kvm = vcpu->kvm; 586 597 unsigned long *hptep, hpte[3], r; 587 598 unsigned long mmu_seq, psize, pte_size; 599 + unsigned long gpa_base, gfn_base; 588 600 unsigned long gpa, gfn, hva, pfn; 589 601 struct kvm_memory_slot *memslot; 590 602 unsigned long *rmap; ··· 624 634 625 635 /* Translate the logical address and get the page */ 626 636 psize = hpte_page_size(hpte[0], r); 627 - gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1)); 637 + gpa_base = r & HPTE_R_RPN & ~(psize - 1); 638 + gfn_base = gpa_base >> PAGE_SHIFT; 639 + gpa = gpa_base | (ea & (psize - 1)); 628 640 gfn = gpa >> PAGE_SHIFT; 629 641 memslot = gfn_to_memslot(kvm, gfn); 630 642 ··· 637 645 638 646 if (!kvm->arch.using_mmu_notifiers) 639 647 return -EFAULT; /* should never get here */ 648 + 649 + /* 650 + * This should never happen, because of 
the slot_is_aligned() 651 + * check in kvmppc_do_h_enter(). 652 + */ 653 + if (gfn_base < memslot->base_gfn) 654 + return -EFAULT; 640 655 641 656 /* used to check for invalidations in progress */ 642 657 mmu_seq = kvm->mmu_notifier_seq; ··· 737 738 goto out_unlock; 738 739 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 739 740 740 - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; 741 + /* Always put the HPTE in the rmap chain for the page base address */ 742 + rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; 741 743 lock_rmap(rmap); 742 744 743 745 /* Check if we might have been invalidated; let the guest retry if so */ ··· 1060 1060 kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); 1061 1061 } 1062 1062 1063 - static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) 1063 + static int vcpus_running(struct kvm *kvm) 1064 + { 1065 + return atomic_read(&kvm->arch.vcpus_running) != 0; 1066 + } 1067 + 1068 + /* 1069 + * Returns the number of system pages that are dirty. 1070 + * This can be more than 1 if we find a huge-page HPTE. 
1071 + */ 1072 + static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1064 1073 { 1065 1074 struct revmap_entry *rev = kvm->arch.revmap; 1066 1075 unsigned long head, i, j; 1076 + unsigned long n; 1077 + unsigned long v, r; 1067 1078 unsigned long *hptep; 1068 - int ret = 0; 1079 + int npages_dirty = 0; 1069 1080 1070 1081 retry: 1071 1082 lock_rmap(rmapp); 1072 1083 if (*rmapp & KVMPPC_RMAP_CHANGED) { 1073 1084 *rmapp &= ~KVMPPC_RMAP_CHANGED; 1074 - ret = 1; 1085 + npages_dirty = 1; 1075 1086 } 1076 1087 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1077 1088 unlock_rmap(rmapp); 1078 - return ret; 1089 + return npages_dirty; 1079 1090 } 1080 1091 1081 1092 i = head = *rmapp & KVMPPC_RMAP_INDEX; ··· 1094 1083 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); 1095 1084 j = rev[i].forw; 1096 1085 1097 - if (!(hptep[1] & HPTE_R_C)) 1086 + /* 1087 + * Checking the C (changed) bit here is racy since there 1088 + * is no guarantee about when the hardware writes it back. 1089 + * If the HPTE is not writable then it is stable since the 1090 + * page can't be written to, and we would have done a tlbie 1091 + * (which forces the hardware to complete any writeback) 1092 + * when making the HPTE read-only. 1093 + * If vcpus are running then this call is racy anyway 1094 + * since the page could get dirtied subsequently, so we 1095 + * expect there to be a further call which would pick up 1096 + * any delayed C bit writeback. 1097 + * Otherwise we need to do the tlbie even if C==0 in 1098 + * order to pick up any delayed writeback of C. 
1099 + */ 1100 + if (!(hptep[1] & HPTE_R_C) && 1101 + (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) 1098 1102 continue; 1099 1103 1100 1104 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { ··· 1121 1095 } 1122 1096 1123 1097 /* Now check and modify the HPTE */ 1124 - if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) { 1125 - /* need to make it temporarily absent to clear C */ 1126 - hptep[0] |= HPTE_V_ABSENT; 1127 - kvmppc_invalidate_hpte(kvm, hptep, i); 1128 - hptep[1] &= ~HPTE_R_C; 1129 - eieio(); 1130 - hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 1098 + if (!(hptep[0] & HPTE_V_VALID)) 1099 + continue; 1100 + 1101 + /* need to make it temporarily absent so C is stable */ 1102 + hptep[0] |= HPTE_V_ABSENT; 1103 + kvmppc_invalidate_hpte(kvm, hptep, i); 1104 + v = hptep[0]; 1105 + r = hptep[1]; 1106 + if (r & HPTE_R_C) { 1107 + hptep[1] = r & ~HPTE_R_C; 1131 1108 if (!(rev[i].guest_rpte & HPTE_R_C)) { 1132 1109 rev[i].guest_rpte |= HPTE_R_C; 1133 1110 note_hpte_modification(kvm, &rev[i]); 1134 1111 } 1135 - ret = 1; 1112 + n = hpte_page_size(v, r); 1113 + n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1114 + if (n > npages_dirty) 1115 + npages_dirty = n; 1116 + eieio(); 1136 1117 } 1137 - hptep[0] &= ~HPTE_V_HVLOCK; 1118 + v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); 1119 + v |= HPTE_V_VALID; 1120 + hptep[0] = v; 1138 1121 } while ((i = j) != head); 1139 1122 1140 1123 unlock_rmap(rmapp); 1141 - return ret; 1124 + return npages_dirty; 1142 1125 } 1143 1126 1144 1127 static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, ··· 1171 1136 long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, 1172 1137 unsigned long *map) 1173 1138 { 1174 - unsigned long i; 1139 + unsigned long i, j; 1175 1140 unsigned long *rmapp; 1176 1141 struct kvm_vcpu *vcpu; 1177 1142 1178 1143 preempt_disable(); 1179 1144 rmapp = memslot->arch.rmap; 1180 1145 for (i = 0; i < memslot->npages; ++i) { 1181 - if (kvm_test_clear_dirty(kvm, rmapp) && map) 1182 - 
__set_bit_le(i, map); 1146 + int npages = kvm_test_clear_dirty_npages(kvm, rmapp); 1147 + /* 1148 + * Note that if npages > 0 then i must be a multiple of npages, 1149 + * since we always put huge-page HPTEs in the rmap chain 1150 + * corresponding to their page base address. 1151 + */ 1152 + if (npages && map) 1153 + for (j = i; npages; ++j, --npages) 1154 + __set_bit_le(j, map); 1183 1155 ++rmapp; 1184 1156 } 1185 1157
+40 -47
arch/powerpc/kvm/book3s_64_slb.S
··· 17 17 * Authors: Alexander Graf <agraf@suse.de> 18 18 */ 19 19 20 - #ifdef __LITTLE_ENDIAN__ 21 - #error Need to fix SLB shadow accesses in little endian mode 22 - #endif 23 - 24 - #define SHADOW_SLB_ESID(num) (SLBSHADOW_SAVEAREA + (num * 0x10)) 25 - #define SHADOW_SLB_VSID(num) (SLBSHADOW_SAVEAREA + (num * 0x10) + 0x8) 26 - #define UNBOLT_SLB_ENTRY(num) \ 27 - ld r9, SHADOW_SLB_ESID(num)(r12); \ 28 - /* Invalid? Skip. */; \ 29 - rldicl. r0, r9, 37, 63; \ 30 - beq slb_entry_skip_ ## num; \ 31 - xoris r9, r9, SLB_ESID_V@h; \ 32 - std r9, SHADOW_SLB_ESID(num)(r12); \ 33 - slb_entry_skip_ ## num: 34 - 35 - #define REBOLT_SLB_ENTRY(num) \ 36 - ld r10, SHADOW_SLB_ESID(num)(r11); \ 37 - cmpdi r10, 0; \ 38 - beq slb_exit_skip_ ## num; \ 39 - oris r10, r10, SLB_ESID_V@h; \ 40 - ld r9, SHADOW_SLB_VSID(num)(r11); \ 41 - slbmte r9, r10; \ 42 - std r10, SHADOW_SLB_ESID(num)(r11); \ 43 - slb_exit_skip_ ## num: 20 + #define SHADOW_SLB_ENTRY_LEN 0x10 21 + #define OFFSET_ESID(x) (SHADOW_SLB_ENTRY_LEN * x) 22 + #define OFFSET_VSID(x) ((SHADOW_SLB_ENTRY_LEN * x) + 8) 44 23 45 24 /****************************************************************************** 46 25 * * ··· 43 64 * SVCPU[LR] = guest LR 44 65 */ 45 66 46 - /* Remove LPAR shadow entries */ 67 + BEGIN_FW_FTR_SECTION 47 68 48 - #if SLB_NUM_BOLTED == 3 69 + /* Declare SLB shadow as 0 entries big */ 49 70 50 - ld r12, PACA_SLBSHADOWPTR(r13) 71 + ld r11, PACA_SLBSHADOWPTR(r13) 72 + li r8, 0 73 + stb r8, 3(r11) 51 74 52 - /* Remove bolted entries */ 53 - UNBOLT_SLB_ENTRY(0) 54 - UNBOLT_SLB_ENTRY(1) 55 - UNBOLT_SLB_ENTRY(2) 56 - 57 - #else 58 - #error unknown number of bolted entries 59 - #endif 75 + END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR) 60 76 61 77 /* Flush SLB */ 62 78 ··· 74 100 75 101 ld r10, 0(r11) 76 102 77 - rldicl. r0, r10, 37, 63 103 + andis. 
r9, r10, SLB_ESID_V@h 78 104 beq slb_loop_enter_skip 79 105 80 106 ld r9, 8(r11) ··· 111 137 * 112 138 */ 113 139 114 - /* Restore bolted entries from the shadow and fix it along the way */ 140 + /* Remove all SLB entries that are in use. */ 115 141 116 - /* We don't store anything in entry 0, so we don't need to take care of it */ 142 + li r0, r0 143 + slbmte r0, r0 117 144 slbia 118 - isync 119 145 120 - #if SLB_NUM_BOLTED == 3 146 + /* Restore bolted entries from the shadow */ 121 147 122 148 ld r11, PACA_SLBSHADOWPTR(r13) 123 149 124 - REBOLT_SLB_ENTRY(0) 125 - REBOLT_SLB_ENTRY(1) 126 - REBOLT_SLB_ENTRY(2) 127 - 128 - #else 129 - #error unknown number of bolted entries 130 - #endif 150 + BEGIN_FW_FTR_SECTION 151 + 152 + /* Declare SLB shadow as SLB_NUM_BOLTED entries big */ 153 + 154 + li r8, SLB_NUM_BOLTED 155 + stb r8, 3(r11) 156 + 157 + END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR) 158 + 159 + /* Manually load all entries from shadow SLB */ 160 + 161 + li r8, SLBSHADOW_SAVEAREA 162 + li r7, SLBSHADOW_SAVEAREA + 8 163 + 164 + .rept SLB_NUM_BOLTED 165 + LDX_BE r10, r11, r8 166 + cmpdi r10, 0 167 + beq 1f 168 + LDX_BE r9, r11, r7 169 + slbmte r9, r10 170 + 1: addi r7, r7, SHADOW_SLB_ENTRY_LEN 171 + addi r8, r8, SHADOW_SLB_ENTRY_LEN 172 + .endr 173 + 174 + isync 175 + sync 131 176 132 177 slb_do_exit: 133 178
+104 -52
arch/powerpc/kvm/book3s_emulate.c
··· 80 80 return false; 81 81 82 82 /* Limit user space to its own small SPR set */ 83 - if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM) 83 + if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM) 84 84 return false; 85 85 86 86 return true; ··· 94 94 int rs = get_rs(inst); 95 95 int ra = get_ra(inst); 96 96 int rb = get_rb(inst); 97 + u32 inst_sc = 0x44000002; 97 98 98 99 switch (get_op(inst)) { 100 + case 0: 101 + emulated = EMULATE_FAIL; 102 + if ((kvmppc_get_msr(vcpu) & MSR_LE) && 103 + (inst == swab32(inst_sc))) { 104 + /* 105 + * This is the byte reversed syscall instruction of our 106 + * hypercall handler. Early versions of LE Linux didn't 107 + * swap the instructions correctly and ended up in 108 + * illegal instructions. 109 + * Just always fail hypercalls on these broken systems. 110 + */ 111 + kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED); 112 + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); 113 + emulated = EMULATE_DONE; 114 + } 115 + break; 99 116 case 19: 100 117 switch (get_xop(inst)) { 101 118 case OP_19_XOP_RFID: 102 119 case OP_19_XOP_RFI: 103 - kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0); 104 - kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); 120 + kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu)); 121 + kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu)); 105 122 *advance = 0; 106 123 break; 107 124 ··· 130 113 case 31: 131 114 switch (get_xop(inst)) { 132 115 case OP_31_XOP_MFMSR: 133 - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr); 116 + kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu)); 134 117 break; 135 118 case OP_31_XOP_MTMSRD: 136 119 { 137 120 ulong rs_val = kvmppc_get_gpr(vcpu, rs); 138 121 if (inst & 0x10000) { 139 - ulong new_msr = vcpu->arch.shared->msr; 122 + ulong new_msr = kvmppc_get_msr(vcpu); 140 123 new_msr &= ~(MSR_RI | MSR_EE); 141 124 new_msr |= rs_val & (MSR_RI | MSR_EE); 142 - vcpu->arch.shared->msr = new_msr; 125 + kvmppc_set_msr_fast(vcpu, new_msr); 143 126 } else 144 127 kvmppc_set_msr(vcpu, rs_val); 145 128 break; 
··· 196 179 ulong cmd = kvmppc_get_gpr(vcpu, 3); 197 180 int i; 198 181 199 - if ((vcpu->arch.shared->msr & MSR_PR) || 182 + if ((kvmppc_get_msr(vcpu) & MSR_PR) || 200 183 !vcpu->arch.papr_enabled) { 201 184 emulated = EMULATE_FAIL; 202 185 break; ··· 278 261 ra_val = kvmppc_get_gpr(vcpu, ra); 279 262 280 263 addr = (ra_val + rb_val) & ~31ULL; 281 - if (!(vcpu->arch.shared->msr & MSR_SF)) 264 + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) 282 265 addr &= 0xffffffff; 283 266 vaddr = addr; 284 267 285 268 r = kvmppc_st(vcpu, &addr, 32, zeros, true); 286 269 if ((r == -ENOENT) || (r == -EPERM)) { 287 270 *advance = 0; 288 - vcpu->arch.shared->dar = vaddr; 271 + kvmppc_set_dar(vcpu, vaddr); 289 272 vcpu->arch.fault_dar = vaddr; 290 273 291 274 dsisr = DSISR_ISSTORE; ··· 294 277 else if (r == -EPERM) 295 278 dsisr |= DSISR_PROTFAULT; 296 279 297 - vcpu->arch.shared->dsisr = dsisr; 280 + kvmppc_set_dsisr(vcpu, dsisr); 298 281 vcpu->arch.fault_dsisr = dsisr; 299 282 300 283 kvmppc_book3s_queue_irqprio(vcpu, ··· 373 356 to_book3s(vcpu)->sdr1 = spr_val; 374 357 break; 375 358 case SPRN_DSISR: 376 - vcpu->arch.shared->dsisr = spr_val; 359 + kvmppc_set_dsisr(vcpu, spr_val); 377 360 break; 378 361 case SPRN_DAR: 379 - vcpu->arch.shared->dar = spr_val; 362 + kvmppc_set_dar(vcpu, spr_val); 380 363 break; 381 364 case SPRN_HIOR: 382 365 to_book3s(vcpu)->hior = spr_val; ··· 455 438 case SPRN_GQR7: 456 439 to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val; 457 440 break; 441 + case SPRN_FSCR: 442 + vcpu->arch.fscr = spr_val; 443 + break; 444 + #ifdef CONFIG_PPC_BOOK3S_64 445 + case SPRN_BESCR: 446 + vcpu->arch.bescr = spr_val; 447 + break; 448 + case SPRN_EBBHR: 449 + vcpu->arch.ebbhr = spr_val; 450 + break; 451 + case SPRN_EBBRR: 452 + vcpu->arch.ebbrr = spr_val; 453 + break; 454 + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 455 + case SPRN_TFHAR: 456 + vcpu->arch.tfhar = spr_val; 457 + break; 458 + case SPRN_TEXASR: 459 + vcpu->arch.texasr = spr_val; 460 + break; 461 + case SPRN_TFIAR: 462 + 
vcpu->arch.tfiar = spr_val; 463 + break; 464 + #endif 465 + #endif 458 466 case SPRN_ICTC: 459 467 case SPRN_THRM1: 460 468 case SPRN_THRM2: ··· 497 455 case SPRN_WPAR_GEKKO: 498 456 case SPRN_MSSSR0: 499 457 case SPRN_DABR: 458 + #ifdef CONFIG_PPC_BOOK3S_64 459 + case SPRN_MMCRS: 460 + case SPRN_MMCRA: 461 + case SPRN_MMCR0: 462 + case SPRN_MMCR1: 463 + case SPRN_MMCR2: 464 + #endif 500 465 break; 501 466 unprivileged: 502 467 default: ··· 542 493 *spr_val = to_book3s(vcpu)->sdr1; 543 494 break; 544 495 case SPRN_DSISR: 545 - *spr_val = vcpu->arch.shared->dsisr; 496 + *spr_val = kvmppc_get_dsisr(vcpu); 546 497 break; 547 498 case SPRN_DAR: 548 - *spr_val = vcpu->arch.shared->dar; 499 + *spr_val = kvmppc_get_dar(vcpu); 549 500 break; 550 501 case SPRN_HIOR: 551 502 *spr_val = to_book3s(vcpu)->hior; ··· 587 538 case SPRN_GQR7: 588 539 *spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]; 589 540 break; 541 + case SPRN_FSCR: 542 + *spr_val = vcpu->arch.fscr; 543 + break; 544 + #ifdef CONFIG_PPC_BOOK3S_64 545 + case SPRN_BESCR: 546 + *spr_val = vcpu->arch.bescr; 547 + break; 548 + case SPRN_EBBHR: 549 + *spr_val = vcpu->arch.ebbhr; 550 + break; 551 + case SPRN_EBBRR: 552 + *spr_val = vcpu->arch.ebbrr; 553 + break; 554 + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 555 + case SPRN_TFHAR: 556 + *spr_val = vcpu->arch.tfhar; 557 + break; 558 + case SPRN_TEXASR: 559 + *spr_val = vcpu->arch.texasr; 560 + break; 561 + case SPRN_TFIAR: 562 + *spr_val = vcpu->arch.tfiar; 563 + break; 564 + #endif 565 + #endif 590 566 case SPRN_THRM1: 591 567 case SPRN_THRM2: 592 568 case SPRN_THRM3: ··· 627 553 case SPRN_WPAR_GEKKO: 628 554 case SPRN_MSSSR0: 629 555 case SPRN_DABR: 556 + #ifdef CONFIG_PPC_BOOK3S_64 557 + case SPRN_MMCRS: 558 + case SPRN_MMCRA: 559 + case SPRN_MMCR0: 560 + case SPRN_MMCR1: 561 + case SPRN_MMCR2: 562 + case SPRN_TIR: 563 + #endif 630 564 *spr_val = 0; 631 565 break; 632 566 default: ··· 651 569 652 570 u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) 653 
571 { 654 - u32 dsisr = 0; 655 - 656 - /* 657 - * This is what the spec says about DSISR bits (not mentioned = 0): 658 - * 659 - * 12:13 [DS] Set to bits 30:31 660 - * 15:16 [X] Set to bits 29:30 661 - * 17 [X] Set to bit 25 662 - * [D/DS] Set to bit 5 663 - * 18:21 [X] Set to bits 21:24 664 - * [D/DS] Set to bits 1:4 665 - * 22:26 Set to bits 6:10 (RT/RS/FRT/FRS) 666 - * 27:31 Set to bits 11:15 (RA) 667 - */ 668 - 669 - switch (get_op(inst)) { 670 - /* D-form */ 671 - case OP_LFS: 672 - case OP_LFD: 673 - case OP_STFD: 674 - case OP_STFS: 675 - dsisr |= (inst >> 12) & 0x4000; /* bit 17 */ 676 - dsisr |= (inst >> 17) & 0x3c00; /* bits 18:21 */ 677 - break; 678 - /* X-form */ 679 - case 31: 680 - dsisr |= (inst << 14) & 0x18000; /* bits 15:16 */ 681 - dsisr |= (inst << 8) & 0x04000; /* bit 17 */ 682 - dsisr |= (inst << 3) & 0x03c00; /* bits 18:21 */ 683 - break; 684 - default: 685 - printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); 686 - break; 687 - } 688 - 689 - dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */ 690 - 691 - return dsisr; 572 + return make_dsisr(inst); 692 573 } 693 574 694 575 ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) 695 576 { 577 + #ifdef CONFIG_PPC_BOOK3S_64 578 + /* 579 + * Linux's fix_alignment() assumes that DAR is valid, so can we 580 + */ 581 + return vcpu->arch.fault_dar; 582 + #else 696 583 ulong dar = 0; 697 584 ulong ra = get_ra(inst); 698 585 ulong rb = get_rb(inst); ··· 686 635 } 687 636 688 637 return dar; 638 + #endif 689 639 }
+1
arch/powerpc/kvm/book3s_exports.c
··· 18 18 */ 19 19 20 20 #include <linux/export.h> 21 + #include <asm/kvm_ppc.h> 21 22 #include <asm/kvm_book3s.h> 22 23 23 24 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+18 -30
arch/powerpc/kvm/book3s_hv.c
··· 879 879 case KVM_REG_PPC_IAMR: 880 880 *val = get_reg_val(id, vcpu->arch.iamr); 881 881 break; 882 - case KVM_REG_PPC_FSCR: 883 - *val = get_reg_val(id, vcpu->arch.fscr); 884 - break; 885 882 case KVM_REG_PPC_PSPB: 886 883 *val = get_reg_val(id, vcpu->arch.pspb); 887 - break; 888 - case KVM_REG_PPC_EBBHR: 889 - *val = get_reg_val(id, vcpu->arch.ebbhr); 890 - break; 891 - case KVM_REG_PPC_EBBRR: 892 - *val = get_reg_val(id, vcpu->arch.ebbrr); 893 - break; 894 - case KVM_REG_PPC_BESCR: 895 - *val = get_reg_val(id, vcpu->arch.bescr); 896 - break; 897 - case KVM_REG_PPC_TAR: 898 - *val = get_reg_val(id, vcpu->arch.tar); 899 884 break; 900 885 case KVM_REG_PPC_DPDES: 901 886 *val = get_reg_val(id, vcpu->arch.vcore->dpdes); ··· 1076 1091 case KVM_REG_PPC_IAMR: 1077 1092 vcpu->arch.iamr = set_reg_val(id, *val); 1078 1093 break; 1079 - case KVM_REG_PPC_FSCR: 1080 - vcpu->arch.fscr = set_reg_val(id, *val); 1081 - break; 1082 1094 case KVM_REG_PPC_PSPB: 1083 1095 vcpu->arch.pspb = set_reg_val(id, *val); 1084 - break; 1085 - case KVM_REG_PPC_EBBHR: 1086 - vcpu->arch.ebbhr = set_reg_val(id, *val); 1087 - break; 1088 - case KVM_REG_PPC_EBBRR: 1089 - vcpu->arch.ebbrr = set_reg_val(id, *val); 1090 - break; 1091 - case KVM_REG_PPC_BESCR: 1092 - vcpu->arch.bescr = set_reg_val(id, *val); 1093 - break; 1094 - case KVM_REG_PPC_TAR: 1095 - vcpu->arch.tar = set_reg_val(id, *val); 1096 1096 break; 1097 1097 case KVM_REG_PPC_DPDES: 1098 1098 vcpu->arch.vcore->dpdes = set_reg_val(id, *val); ··· 1250 1280 goto free_vcpu; 1251 1281 1252 1282 vcpu->arch.shared = &vcpu->arch.shregs; 1283 + #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 1284 + /* 1285 + * The shared struct is never shared on HV, 1286 + * so we can always use host endianness 1287 + */ 1288 + #ifdef __BIG_ENDIAN__ 1289 + vcpu->arch.shared_big_endian = true; 1290 + #else 1291 + vcpu->arch.shared_big_endian = false; 1292 + #endif 1293 + #endif 1253 1294 vcpu->arch.mmcr[0] = MMCR0_FC; 1254 1295 vcpu->arch.ctrl = CTRL_RUNLATCH; 1255 1296 
/* default to host PVR, since we can't spoof it */ ··· 1930 1949 * support pte_enc here 1931 1950 */ 1932 1951 (*sps)->enc[0].pte_enc = def->penc[linux_psize]; 1952 + /* 1953 + * Add 16MB MPSS support if host supports it 1954 + */ 1955 + if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { 1956 + (*sps)->enc[1].page_shift = 24; 1957 + (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; 1958 + } 1933 1959 (*sps)++; 1934 1960 } 1935 1961
+2 -1
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 42 42 43 43 /* 44 44 * If there is only one vcore, and it's currently running, 45 + * as indicated by local_paca->kvm_hstate.kvm_vcpu being set, 45 46 * we can use tlbiel as long as we mark all other physical 46 47 * cores as potentially having stale TLB entries for this lpid. 47 48 * If we're not using MMU notifiers, we never take pages away 48 49 * from the guest, so we can use tlbiel if requested. 49 50 * Otherwise, don't use tlbiel. 50 51 */ 51 - if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore) 52 + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu) 52 53 global = 0; 53 54 else if (kvm->arch.using_mmu_notifiers) 54 55 global = 1;
+58 -2
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 86 86 lbz r4, LPPACA_PMCINUSE(r3) 87 87 cmpwi r4, 0 88 88 beq 23f /* skip if not */ 89 + BEGIN_FTR_SECTION 90 + ld r3, HSTATE_MMCR(r13) 91 + andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO 92 + cmpwi r4, MMCR0_PMAO 93 + beql kvmppc_fix_pmao 94 + END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) 89 95 lwz r3, HSTATE_PMC(r13) 90 96 lwz r4, HSTATE_PMC + 4(r13) 91 97 lwz r5, HSTATE_PMC + 8(r13) ··· 743 737 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ 744 738 mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ 745 739 isync 740 + BEGIN_FTR_SECTION 741 + ld r3, VCPU_MMCR(r4) 742 + andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO 743 + cmpwi r5, MMCR0_PMAO 744 + beql kvmppc_fix_pmao 745 + END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) 746 746 lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ 747 747 lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ 748 748 lwz r6, VCPU_PMC + 8(r4) ··· 1451 1439 25: 1452 1440 /* Save PMU registers if requested */ 1453 1441 /* r8 and cr0.eq are live here */ 1442 + BEGIN_FTR_SECTION 1443 + /* 1444 + * POWER8 seems to have a hardware bug where setting 1445 + * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] 1446 + * when some counters are already negative doesn't seem 1447 + * to cause a performance monitor alert (and hence interrupt). 1448 + * The effect of this is that when saving the PMU state, 1449 + * if there is no PMU alert pending when we read MMCR0 1450 + * before freezing the counters, but one becomes pending 1451 + * before we read the counters, we lose it. 1452 + * To work around this, we need a way to freeze the counters 1453 + * before reading MMCR0. Normally, freezing the counters 1454 + * is done by writing MMCR0 (to set MMCR0[FC]) which 1455 + * unavoidably writes MMCR0[PMA0] as well. On POWER8, 1456 + * we can also freeze the counters using MMCR2, by writing 1457 + * 1s to all the counter freeze condition bits (there are 1458 + * 9 bits each for 6 counters). 
1459 + */ 1460 + li r3, -1 /* set all freeze bits */ 1461 + clrrdi r3, r3, 10 1462 + mfspr r10, SPRN_MMCR2 1463 + mtspr SPRN_MMCR2, r3 1464 + isync 1465 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 1454 1466 li r3, 1 1455 1467 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ 1456 1468 mfspr r4, SPRN_MMCR0 /* save MMCR0 */ ··· 1498 1462 std r4, VCPU_MMCR(r9) 1499 1463 std r5, VCPU_MMCR + 8(r9) 1500 1464 std r6, VCPU_MMCR + 16(r9) 1465 + BEGIN_FTR_SECTION 1466 + std r10, VCPU_MMCR + 24(r9) 1467 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 1501 1468 std r7, VCPU_SIAR(r9) 1502 1469 std r8, VCPU_SDAR(r9) 1503 1470 mfspr r3, SPRN_PMC1 ··· 1524 1485 stw r11, VCPU_PMC + 28(r9) 1525 1486 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 1526 1487 BEGIN_FTR_SECTION 1527 - mfspr r4, SPRN_MMCR2 1528 1488 mfspr r5, SPRN_SIER 1529 1489 mfspr r6, SPRN_SPMC1 1530 1490 mfspr r7, SPRN_SPMC2 1531 1491 mfspr r8, SPRN_MMCRS 1532 - std r4, VCPU_MMCR + 24(r9) 1533 1492 std r5, VCPU_SIER(r9) 1534 1493 stw r6, VCPU_PMC + 24(r9) 1535 1494 stw r7, VCPU_PMC + 28(r9) ··· 2264 2227 beq mc_cont 2265 2228 /* If not, deliver a machine check. SRR0/1 are already set */ 2266 2229 li r10, BOOK3S_INTERRUPT_MACHINE_CHECK 2230 + ld r11, VCPU_MSR(r9) 2267 2231 bl kvmppc_msr_interrupt 2268 2232 b fast_interrupt_c_return 2269 2233 ··· 2468 2430 /* ... if transactional, change to suspended */ 2469 2431 li r0, 1 2470 2432 1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG 2433 + blr 2434 + 2435 + /* 2436 + * This works around a hardware bug on POWER8E processors, where 2437 + * writing a 1 to the MMCR0[PMAO] bit doesn't generate a 2438 + * performance monitor interrupt. Instead, when we need to have 2439 + * an interrupt pending, we have to arrange for a counter to overflow. 
2440 + */ 2441 + kvmppc_fix_pmao: 2442 + li r3, 0 2443 + mtspr SPRN_MMCR2, r3 2444 + lis r3, (MMCR0_PMXE | MMCR0_FCECE)@h 2445 + ori r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN 2446 + mtspr SPRN_MMCR0, r3 2447 + lis r3, 0x7fff 2448 + ori r3, r3, 0xffff 2449 + mtspr SPRN_PMC6, r3 2450 + isync 2471 2451 blr
+21 -2
arch/powerpc/kvm/book3s_interrupts.S
··· 104 104 stb r3, HSTATE_RESTORE_HID5(r13) 105 105 106 106 /* Load up guest SPRG3 value, since it's user readable */ 107 - ld r3, VCPU_SHARED(r4) 108 - ld r3, VCPU_SHARED_SPRG3(r3) 107 + lwz r3, VCPU_SHAREDBE(r4) 108 + cmpwi r3, 0 109 + ld r5, VCPU_SHARED(r4) 110 + beq sprg3_little_endian 111 + sprg3_big_endian: 112 + #ifdef __BIG_ENDIAN__ 113 + ld r3, VCPU_SHARED_SPRG3(r5) 114 + #else 115 + addi r5, r5, VCPU_SHARED_SPRG3 116 + ldbrx r3, 0, r5 117 + #endif 118 + b after_sprg3_load 119 + sprg3_little_endian: 120 + #ifdef __LITTLE_ENDIAN__ 121 + ld r3, VCPU_SHARED_SPRG3(r5) 122 + #else 123 + addi r5, r5, VCPU_SHARED_SPRG3 124 + ldbrx r3, 0, r5 125 + #endif 126 + 127 + after_sprg3_load: 109 128 mtspr SPRN_SPRG3, r3 110 129 #endif /* CONFIG_PPC_BOOK3S_64 */ 111 130
+9 -7
arch/powerpc/kvm/book3s_paired_singles.c
··· 165 165 166 166 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) 167 167 { 168 - u64 dsisr; 169 - struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; 168 + u32 dsisr; 169 + u64 msr = kvmppc_get_msr(vcpu); 170 170 171 - shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0); 172 - shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0); 173 - shared->dar = eaddr; 171 + msr = kvmppc_set_field(msr, 33, 36, 0); 172 + msr = kvmppc_set_field(msr, 42, 47, 0); 173 + kvmppc_set_msr(vcpu, msr); 174 + kvmppc_set_dar(vcpu, eaddr); 174 175 /* Page Fault */ 175 176 dsisr = kvmppc_set_field(0, 33, 33, 1); 176 177 if (is_store) 177 - shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1); 178 + dsisr = kvmppc_set_field(dsisr, 38, 38, 1); 179 + kvmppc_set_dsisr(vcpu, dsisr); 178 180 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); 179 181 } 180 182 ··· 662 660 if (!kvmppc_inst_is_paired_single(vcpu, inst)) 663 661 return EMULATE_FAIL; 664 662 665 - if (!(vcpu->arch.shared->msr & MSR_FP)) { 663 + if (!(kvmppc_get_msr(vcpu) & MSR_FP)) { 666 664 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL); 667 665 return EMULATE_AGAIN; 668 666 }
+195 -43
arch/powerpc/kvm/book3s_pr.c
··· 53 53 54 54 static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, 55 55 ulong msr); 56 + static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); 56 57 57 58 /* Some compatibility defines */ 58 59 #ifdef CONFIG_PPC_BOOK3S_32 ··· 90 89 #endif 91 90 92 91 kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); 92 + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); 93 93 vcpu->cpu = -1; 94 94 } 95 95 ··· 117 115 svcpu->ctr = vcpu->arch.ctr; 118 116 svcpu->lr = vcpu->arch.lr; 119 117 svcpu->pc = vcpu->arch.pc; 118 + #ifdef CONFIG_PPC_BOOK3S_64 119 + svcpu->shadow_fscr = vcpu->arch.shadow_fscr; 120 + #endif 120 121 svcpu->in_use = true; 121 122 } 122 123 ··· 163 158 vcpu->arch.fault_dar = svcpu->fault_dar; 164 159 vcpu->arch.fault_dsisr = svcpu->fault_dsisr; 165 160 vcpu->arch.last_inst = svcpu->last_inst; 161 + #ifdef CONFIG_PPC_BOOK3S_64 162 + vcpu->arch.shadow_fscr = svcpu->shadow_fscr; 163 + #endif 166 164 svcpu->in_use = false; 167 165 168 166 out: ··· 254 246 255 247 static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) 256 248 { 257 - ulong smsr = vcpu->arch.shared->msr; 249 + ulong guest_msr = kvmppc_get_msr(vcpu); 250 + ulong smsr = guest_msr; 258 251 259 252 /* Guest MSR values */ 260 - smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE; 253 + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE; 261 254 /* Process MSR values */ 262 255 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; 263 256 /* External providers the guest reserved */ 264 - smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); 257 + smsr |= (guest_msr & vcpu->arch.guest_owned_ext); 265 258 /* 64-bit Process MSR values */ 266 259 #ifdef CONFIG_PPC_BOOK3S_64 267 260 smsr |= MSR_ISF | MSR_HV; ··· 272 263 273 264 static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) 274 265 { 275 - ulong old_msr = vcpu->arch.shared->msr; 266 + ulong old_msr = kvmppc_get_msr(vcpu); 276 267 277 268 #ifdef EXIT_DEBUG 278 269 printk(KERN_INFO "KVM: Set 
MSR to 0x%llx\n", msr); 279 270 #endif 280 271 281 272 msr &= to_book3s(vcpu)->msr_mask; 282 - vcpu->arch.shared->msr = msr; 273 + kvmppc_set_msr_fast(vcpu, msr); 283 274 kvmppc_recalc_shadow_msr(vcpu); 284 275 285 276 if (msr & MSR_POW) { ··· 290 281 291 282 /* Unset POW bit after we woke up */ 292 283 msr &= ~MSR_POW; 293 - vcpu->arch.shared->msr = msr; 284 + kvmppc_set_msr_fast(vcpu, msr); 294 285 } 295 286 } 296 287 297 - if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != 288 + if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) != 298 289 (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { 299 290 kvmppc_mmu_flush_segments(vcpu); 300 291 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); ··· 326 317 } 327 318 328 319 /* Preload FPU if it's enabled */ 329 - if (vcpu->arch.shared->msr & MSR_FP) 320 + if (kvmppc_get_msr(vcpu) & MSR_FP) 330 321 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 331 322 } 332 323 ··· 436 427 437 428 /* patch dcbz into reserved instruction, so we trap */ 438 429 for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) 439 - if ((page[i] & 0xff0007ff) == INS_DCBZ) 440 - page[i] &= 0xfffffff7; 430 + if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ) 431 + page[i] &= cpu_to_be32(0xfffffff7); 441 432 442 433 kunmap_atomic(page); 443 434 put_page(hpage); ··· 447 438 { 448 439 ulong mp_pa = vcpu->arch.magic_page_pa; 449 440 450 - if (!(vcpu->arch.shared->msr & MSR_SF)) 441 + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) 451 442 mp_pa = (uint32_t)mp_pa; 452 443 453 444 if (unlikely(mp_pa) && ··· 468 459 int page_found = 0; 469 460 struct kvmppc_pte pte; 470 461 bool is_mmio = false; 471 - bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; 472 - bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; 462 + bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; 463 + bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; 473 464 u64 vsid; 474 465 475 466 relocated = data ? 
dr : ir; ··· 489 480 pte.page_size = MMU_PAGE_64K; 490 481 } 491 482 492 - switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { 483 + switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { 493 484 case 0: 494 485 pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); 495 486 break; ··· 497 488 case MSR_IR: 498 489 vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); 499 490 500 - if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) 491 + if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR) 501 492 pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); 502 493 else 503 494 pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); ··· 520 511 521 512 if (page_found == -ENOENT) { 522 513 /* Page not found in guest PTE entries */ 523 - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); 524 - vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr; 525 - vcpu->arch.shared->msr |= 526 - vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL; 514 + u64 ssrr1 = vcpu->arch.shadow_srr1; 515 + u64 msr = kvmppc_get_msr(vcpu); 516 + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); 517 + kvmppc_set_dsisr(vcpu, vcpu->arch.fault_dsisr); 518 + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); 527 519 kvmppc_book3s_queue_irqprio(vcpu, vec); 528 520 } else if (page_found == -EPERM) { 529 521 /* Storage protection */ 530 - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); 531 - vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE; 532 - vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; 533 - vcpu->arch.shared->msr |= 534 - vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL; 522 + u32 dsisr = vcpu->arch.fault_dsisr; 523 + u64 ssrr1 = vcpu->arch.shadow_srr1; 524 + u64 msr = kvmppc_get_msr(vcpu); 525 + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); 526 + dsisr = (dsisr & ~DSISR_NOHPTE) | DSISR_PROTFAULT; 527 + kvmppc_set_dsisr(vcpu, dsisr); 528 + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); 535 529 kvmppc_book3s_queue_irqprio(vcpu, vec); 536 530 } else if 
(page_found == -EINVAL) { 537 531 /* Page not found in guest SLB */ 538 - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); 532 + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); 539 533 kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); 540 534 } else if (!is_mmio && 541 535 kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { ··· 618 606 kvmppc_recalc_shadow_msr(vcpu); 619 607 } 620 608 609 + /* Give up facility (TAR / EBB / DSCR) */ 610 + static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac) 611 + { 612 + #ifdef CONFIG_PPC_BOOK3S_64 613 + if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) { 614 + /* Facility not available to the guest, ignore giveup request*/ 615 + return; 616 + } 617 + 618 + switch (fac) { 619 + case FSCR_TAR_LG: 620 + vcpu->arch.tar = mfspr(SPRN_TAR); 621 + mtspr(SPRN_TAR, current->thread.tar); 622 + vcpu->arch.shadow_fscr &= ~FSCR_TAR; 623 + break; 624 + } 625 + #endif 626 + } 627 + 621 628 static int kvmppc_read_inst(struct kvm_vcpu *vcpu) 622 629 { 623 630 ulong srr0 = kvmppc_get_pc(vcpu); ··· 645 614 646 615 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); 647 616 if (ret == -ENOENT) { 648 - ulong msr = vcpu->arch.shared->msr; 617 + ulong msr = kvmppc_get_msr(vcpu); 649 618 650 619 msr = kvmppc_set_field(msr, 33, 33, 1); 651 620 msr = kvmppc_set_field(msr, 34, 36, 0); 652 - vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); 621 + msr = kvmppc_set_field(msr, 42, 47, 0); 622 + kvmppc_set_msr_fast(vcpu, msr); 653 623 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); 654 624 return EMULATE_AGAIN; 655 625 } ··· 683 651 if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) 684 652 return RESUME_GUEST; 685 653 686 - if (!(vcpu->arch.shared->msr & msr)) { 654 + if (!(kvmppc_get_msr(vcpu) & msr)) { 687 655 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 688 656 return RESUME_GUEST; 689 657 } ··· 715 683 #endif 716 684 717 685 if (msr & MSR_FP) { 686 + preempt_disable(); 718 687 enable_kernel_fp(); 719 688 
load_fp_state(&vcpu->arch.fp); 720 689 t->fp_save_area = &vcpu->arch.fp; 690 + preempt_enable(); 721 691 } 722 692 723 693 if (msr & MSR_VEC) { 724 694 #ifdef CONFIG_ALTIVEC 695 + preempt_disable(); 725 696 enable_kernel_altivec(); 726 697 load_vr_state(&vcpu->arch.vr); 727 698 t->vr_save_area = &vcpu->arch.vr; 699 + preempt_enable(); 728 700 #endif 729 701 } 730 702 ··· 752 716 return; 753 717 754 718 if (lost_ext & MSR_FP) { 719 + preempt_disable(); 755 720 enable_kernel_fp(); 756 721 load_fp_state(&vcpu->arch.fp); 722 + preempt_enable(); 757 723 } 758 724 #ifdef CONFIG_ALTIVEC 759 725 if (lost_ext & MSR_VEC) { 726 + preempt_disable(); 760 727 enable_kernel_altivec(); 761 728 load_vr_state(&vcpu->arch.vr); 729 + preempt_enable(); 762 730 } 763 731 #endif 764 732 current->thread.regs->msr |= lost_ext; 765 733 } 734 + 735 + #ifdef CONFIG_PPC_BOOK3S_64 736 + 737 + static void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac) 738 + { 739 + /* Inject the Interrupt Cause field and trigger a guest interrupt */ 740 + vcpu->arch.fscr &= ~(0xffULL << 56); 741 + vcpu->arch.fscr |= (fac << 56); 742 + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); 743 + } 744 + 745 + static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac) 746 + { 747 + enum emulation_result er = EMULATE_FAIL; 748 + 749 + if (!(kvmppc_get_msr(vcpu) & MSR_PR)) 750 + er = kvmppc_emulate_instruction(vcpu->run, vcpu); 751 + 752 + if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) { 753 + /* Couldn't emulate, trigger interrupt in guest */ 754 + kvmppc_trigger_fac_interrupt(vcpu, fac); 755 + } 756 + } 757 + 758 + /* Enable facilities (TAR, EBB, DSCR) for the guest */ 759 + static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac) 760 + { 761 + bool guest_fac_enabled; 762 + BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S)); 763 + 764 + /* 765 + * Not every facility is enabled by FSCR bits, check whether the 766 + * guest has this facility enabled at all. 
767 + */ 768 + switch (fac) { 769 + case FSCR_TAR_LG: 770 + case FSCR_EBB_LG: 771 + guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac)); 772 + break; 773 + case FSCR_TM_LG: 774 + guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM; 775 + break; 776 + default: 777 + guest_fac_enabled = false; 778 + break; 779 + } 780 + 781 + if (!guest_fac_enabled) { 782 + /* Facility not enabled by the guest */ 783 + kvmppc_trigger_fac_interrupt(vcpu, fac); 784 + return RESUME_GUEST; 785 + } 786 + 787 + switch (fac) { 788 + case FSCR_TAR_LG: 789 + /* TAR switching isn't lazy in Linux yet */ 790 + current->thread.tar = mfspr(SPRN_TAR); 791 + mtspr(SPRN_TAR, vcpu->arch.tar); 792 + vcpu->arch.shadow_fscr |= FSCR_TAR; 793 + break; 794 + default: 795 + kvmppc_emulate_fac(vcpu, fac); 796 + break; 797 + } 798 + 799 + return RESUME_GUEST; 800 + } 801 + #endif 766 802 767 803 int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, 768 804 unsigned int exit_nr) ··· 892 784 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); 893 785 r = RESUME_GUEST; 894 786 } else { 895 - vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000; 787 + u64 msr = kvmppc_get_msr(vcpu); 788 + msr |= shadow_srr1 & 0x58000000; 789 + kvmppc_set_msr_fast(vcpu, msr); 896 790 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 897 791 r = RESUME_GUEST; 898 792 } ··· 934 824 r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); 935 825 srcu_read_unlock(&vcpu->kvm->srcu, idx); 936 826 } else { 937 - vcpu->arch.shared->dar = dar; 938 - vcpu->arch.shared->dsisr = fault_dsisr; 827 + kvmppc_set_dar(vcpu, dar); 828 + kvmppc_set_dsisr(vcpu, fault_dsisr); 939 829 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 940 830 r = RESUME_GUEST; 941 831 } ··· 943 833 } 944 834 case BOOK3S_INTERRUPT_DATA_SEGMENT: 945 835 if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { 946 - vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); 836 + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); 947 837 
kvmppc_book3s_queue_irqprio(vcpu, 948 838 BOOK3S_INTERRUPT_DATA_SEGMENT); 949 839 } ··· 981 871 program_interrupt: 982 872 flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; 983 873 984 - if (vcpu->arch.shared->msr & MSR_PR) { 874 + if (kvmppc_get_msr(vcpu) & MSR_PR) { 985 875 #ifdef EXIT_DEBUG 986 876 printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); 987 877 #endif ··· 1023 913 case BOOK3S_INTERRUPT_SYSCALL: 1024 914 if (vcpu->arch.papr_enabled && 1025 915 (kvmppc_get_last_sc(vcpu) == 0x44000022) && 1026 - !(vcpu->arch.shared->msr & MSR_PR)) { 916 + !(kvmppc_get_msr(vcpu) & MSR_PR)) { 1027 917 /* SC 1 papr hypercalls */ 1028 918 ulong cmd = kvmppc_get_gpr(vcpu, 3); 1029 919 int i; ··· 1055 945 gprs[i] = kvmppc_get_gpr(vcpu, i); 1056 946 vcpu->arch.osi_needed = 1; 1057 947 r = RESUME_HOST_NV; 1058 - } else if (!(vcpu->arch.shared->msr & MSR_PR) && 948 + } else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && 1059 949 (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { 1060 950 /* KVM PV hypercalls */ 1061 951 kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); ··· 1096 986 } 1097 987 case BOOK3S_INTERRUPT_ALIGNMENT: 1098 988 if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { 1099 - vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, 1100 - kvmppc_get_last_inst(vcpu)); 1101 - vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, 1102 - kvmppc_get_last_inst(vcpu)); 989 + u32 last_inst = kvmppc_get_last_inst(vcpu); 990 + u32 dsisr; 991 + u64 dar; 992 + 993 + dsisr = kvmppc_alignment_dsisr(vcpu, last_inst); 994 + dar = kvmppc_alignment_dar(vcpu, last_inst); 995 + 996 + kvmppc_set_dsisr(vcpu, dsisr); 997 + kvmppc_set_dar(vcpu, dar); 998 + 1103 999 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 1104 1000 } 1105 1001 r = RESUME_GUEST; 1106 1002 break; 1003 + #ifdef CONFIG_PPC_BOOK3S_64 1004 + case BOOK3S_INTERRUPT_FAC_UNAVAIL: 1005 + kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56); 1006 + r = RESUME_GUEST; 1007 + 
break; 1008 + #endif 1107 1009 case BOOK3S_INTERRUPT_MACHINE_CHECK: 1108 1010 case BOOK3S_INTERRUPT_TRACE: 1109 1011 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); ··· 1176 1054 } 1177 1055 } else { 1178 1056 for (i = 0; i < 16; i++) 1179 - sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; 1057 + sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i); 1180 1058 1181 1059 for (i = 0; i < 8; i++) { 1182 1060 sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; ··· 1232 1110 case KVM_REG_PPC_HIOR: 1233 1111 *val = get_reg_val(id, to_book3s(vcpu)->hior); 1234 1112 break; 1113 + case KVM_REG_PPC_LPCR: 1114 + /* 1115 + * We are only interested in the LPCR_ILE bit 1116 + */ 1117 + if (vcpu->arch.intr_msr & MSR_LE) 1118 + *val = get_reg_val(id, LPCR_ILE); 1119 + else 1120 + *val = get_reg_val(id, 0); 1121 + break; 1235 1122 default: 1236 1123 r = -EINVAL; 1237 1124 break; 1238 1125 } 1239 1126 1240 1127 return r; 1128 + } 1129 + 1130 + static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr) 1131 + { 1132 + if (new_lpcr & LPCR_ILE) 1133 + vcpu->arch.intr_msr |= MSR_LE; 1134 + else 1135 + vcpu->arch.intr_msr &= ~MSR_LE; 1241 1136 } 1242 1137 1243 1138 static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, ··· 1266 1127 case KVM_REG_PPC_HIOR: 1267 1128 to_book3s(vcpu)->hior = set_reg_val(id, *val); 1268 1129 to_book3s(vcpu)->hior_explicit = true; 1130 + break; 1131 + case KVM_REG_PPC_LPCR: 1132 + kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); 1269 1133 break; 1270 1134 default: 1271 1135 r = -EINVAL; ··· 1312 1170 goto uninit_vcpu; 1313 1171 /* the real shared page fills the last 4k of our page */ 1314 1172 vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096); 1315 - 1316 1173 #ifdef CONFIG_PPC_BOOK3S_64 1174 + /* Always start the shared struct in native endian mode */ 1175 + #ifdef __BIG_ENDIAN__ 1176 + vcpu->arch.shared_big_endian = true; 1177 + #else 1178 + vcpu->arch.shared_big_endian = false; 1179 + #endif 1180 + 1317 1181 /* 1318 1182 * Default to the same as the 
host if we're on sufficiently 1319 1183 * recent machine that we have 1TB segments; ··· 1328 1180 vcpu->arch.pvr = 0x3C0301; 1329 1181 if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) 1330 1182 vcpu->arch.pvr = mfspr(SPRN_PVR); 1183 + vcpu->arch.intr_msr = MSR_SF; 1331 1184 #else 1332 1185 /* default to book3s_32 (750) */ 1333 1186 vcpu->arch.pvr = 0x84202; ··· 1336 1187 kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); 1337 1188 vcpu->arch.slb_nr = 64; 1338 1189 1339 - vcpu->arch.shadow_msr = MSR_USER64; 1190 + vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE; 1340 1191 1341 1192 err = kvmppc_mmu_init(vcpu); 1342 1193 if (err < 0) ··· 1413 1264 #endif 1414 1265 1415 1266 /* Preload FPU if it's enabled */ 1416 - if (vcpu->arch.shared->msr & MSR_FP) 1267 + if (kvmppc_get_msr(vcpu) & MSR_FP) 1417 1268 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 1418 1269 1419 1270 kvmppc_fix_ee_before_entry(); ··· 1425 1276 1426 1277 /* Make sure we save the guest FPU/Altivec/VSX state */ 1427 1278 kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); 1279 + 1280 + /* Make sure we save the guest TAR/EBB/DSCR state */ 1281 + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); 1428 1282 1429 1283 out: 1430 1284 vcpu->mode = OUTSIDE_GUEST_MODE;
+12 -4
arch/powerpc/kvm/book3s_pr_papr.c
··· 57 57 for (i = 0; ; ++i) { 58 58 if (i == 8) 59 59 goto done; 60 - if ((*hpte & HPTE_V_VALID) == 0) 60 + if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0) 61 61 break; 62 62 hpte += 2; 63 63 } ··· 67 67 goto done; 68 68 } 69 69 70 - hpte[0] = kvmppc_get_gpr(vcpu, 6); 71 - hpte[1] = kvmppc_get_gpr(vcpu, 7); 70 + hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6)); 71 + hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7)); 72 72 pteg_addr += i * HPTE_SIZE; 73 73 copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); 74 74 kvmppc_set_gpr(vcpu, 4, pte_index | i); ··· 93 93 pteg = get_pteg_addr(vcpu, pte_index); 94 94 mutex_lock(&vcpu->kvm->arch.hpt_mutex); 95 95 copy_from_user(pte, (void __user *)pteg, sizeof(pte)); 96 + pte[0] = be64_to_cpu(pte[0]); 97 + pte[1] = be64_to_cpu(pte[1]); 96 98 97 99 ret = H_NOT_FOUND; 98 100 if ((pte[0] & HPTE_V_VALID) == 0 || ··· 171 169 172 170 pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX); 173 171 copy_from_user(pte, (void __user *)pteg, sizeof(pte)); 172 + pte[0] = be64_to_cpu(pte[0]); 173 + pte[1] = be64_to_cpu(pte[1]); 174 174 175 175 /* tsl = AVPN */ 176 176 flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26; ··· 211 207 pteg = get_pteg_addr(vcpu, pte_index); 212 208 mutex_lock(&vcpu->kvm->arch.hpt_mutex); 213 209 copy_from_user(pte, (void __user *)pteg, sizeof(pte)); 210 + pte[0] = be64_to_cpu(pte[0]); 211 + pte[1] = be64_to_cpu(pte[1]); 214 212 215 213 ret = H_NOT_FOUND; 216 214 if ((pte[0] & HPTE_V_VALID) == 0 || ··· 231 225 232 226 rb = compute_tlbie_rb(v, r, pte_index); 233 227 vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); 228 + pte[0] = cpu_to_be64(pte[0]); 229 + pte[1] = cpu_to_be64(pte[1]); 234 230 copy_to_user((void __user *)pteg, pte, sizeof(pte)); 235 231 ret = H_SUCCESS; 236 232 ··· 278 270 case H_PUT_TCE: 279 271 return kvmppc_h_pr_put_tce(vcpu); 280 272 case H_CEDE: 281 - vcpu->arch.shared->msr |= MSR_EE; 273 + kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); 282 274 kvm_vcpu_block(vcpu); 283 275 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 284 276 vcpu->stat.halt_wakeup++;
+29
arch/powerpc/kvm/book3s_rtas.c
··· 205 205 return rc; 206 206 } 207 207 208 + static void kvmppc_rtas_swap_endian_in(struct rtas_args *args) 209 + { 210 + #ifdef __LITTLE_ENDIAN__ 211 + int i; 212 + 213 + args->token = be32_to_cpu(args->token); 214 + args->nargs = be32_to_cpu(args->nargs); 215 + args->nret = be32_to_cpu(args->nret); 216 + for (i = 0; i < args->nargs; i++) 217 + args->args[i] = be32_to_cpu(args->args[i]); 218 + #endif 219 + } 220 + 221 + static void kvmppc_rtas_swap_endian_out(struct rtas_args *args) 222 + { 223 + #ifdef __LITTLE_ENDIAN__ 224 + int i; 225 + 226 + for (i = 0; i < args->nret; i++) 227 + args->args[i] = cpu_to_be32(args->args[i]); 228 + args->token = cpu_to_be32(args->token); 229 + args->nargs = cpu_to_be32(args->nargs); 230 + args->nret = cpu_to_be32(args->nret); 231 + #endif 232 + } 233 + 208 234 int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) 209 235 { 210 236 struct rtas_token_definition *d; ··· 248 222 rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); 249 223 if (rc) 250 224 goto fail; 225 + 226 + kvmppc_rtas_swap_endian_in(&args); 251 227 252 228 /* 253 229 * args->rets is a pointer into args->args. Now that we've ··· 275 247 276 248 if (rc == 0) { 277 249 args.rets = orig_rets; 250 + kvmppc_rtas_swap_endian_out(&args); 278 251 rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args)); 279 252 if (rc) 280 253 goto fail;
+25
arch/powerpc/kvm/book3s_segment.S
··· 90 90 LOAD_GUEST_SEGMENTS 91 91 92 92 #ifdef CONFIG_PPC_BOOK3S_64 93 + BEGIN_FTR_SECTION 94 + /* Save host FSCR */ 95 + mfspr r8, SPRN_FSCR 96 + std r8, HSTATE_HOST_FSCR(r13) 97 + /* Set FSCR during guest execution */ 98 + ld r9, SVCPU_SHADOW_FSCR(r13) 99 + mtspr SPRN_FSCR, r9 100 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 101 + 93 102 /* Some guests may need to have dcbz set to 32 byte length. 94 103 * 95 104 * Usually we ensure that by patching the guest's instructions ··· 264 255 cmpwi r12, BOOK3S_INTERRUPT_H_EMUL_ASSIST 265 256 beq- ld_last_inst 266 257 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) 258 + BEGIN_FTR_SECTION 259 + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL 260 + beq- ld_last_inst 261 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 267 262 #endif 268 263 269 264 b no_ld_last_inst ··· 323 310 mtspr SPRN_HID5,r5 324 311 325 312 no_dcbz32_off: 313 + 314 + BEGIN_FTR_SECTION 315 + /* Save guest FSCR on a FAC_UNAVAIL interrupt */ 316 + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL 317 + bne+ no_fscr_save 318 + mfspr r7, SPRN_FSCR 319 + std r7, SVCPU_SHADOW_FSCR(r13) 320 + no_fscr_save: 321 + /* Restore host FSCR */ 322 + ld r8, HSTATE_HOST_FSCR(r13) 323 + mtspr SPRN_FSCR, r8 324 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 326 325 327 326 #endif /* CONFIG_PPC_BOOK3S_64 */ 328 327
+15
arch/powerpc/kvm/e500_emulate.c
··· 19 19 #include "booke.h" 20 20 #include "e500.h" 21 21 22 + #define XOP_DCBTLS 166 22 23 #define XOP_MSGSND 206 23 24 #define XOP_MSGCLR 238 24 25 #define XOP_TLBIVAX 786 ··· 104 103 return emulated; 105 104 } 106 105 106 + static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) 107 + { 108 + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 109 + 110 + /* Always fail to lock the cache */ 111 + vcpu_e500->l1csr0 |= L1CSR0_CUL; 112 + return EMULATE_DONE; 113 + } 114 + 107 115 int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, 108 116 unsigned int inst, int *advance) 109 117 { ··· 125 115 switch (get_op(inst)) { 126 116 case 31: 127 117 switch (get_xop(inst)) { 118 + 119 + case XOP_DCBTLS: 120 + emulated = kvmppc_e500_emul_dcbtls(vcpu); 121 + break; 128 122 129 123 #ifdef CONFIG_KVM_E500MC 130 124 case XOP_MSGSND: ··· 236 222 break; 237 223 case SPRN_L1CSR1: 238 224 vcpu_e500->l1csr1 = spr_val; 225 + vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR); 239 226 break; 240 227 case SPRN_HID0: 241 228 vcpu_e500->hid0 = spr_val;
+12 -12
arch/powerpc/kvm/emulate.c
··· 97 97 98 98 switch (sprn) { 99 99 case SPRN_SRR0: 100 - vcpu->arch.shared->srr0 = spr_val; 100 + kvmppc_set_srr0(vcpu, spr_val); 101 101 break; 102 102 case SPRN_SRR1: 103 - vcpu->arch.shared->srr1 = spr_val; 103 + kvmppc_set_srr1(vcpu, spr_val); 104 104 break; 105 105 106 106 /* XXX We need to context-switch the timebase for ··· 114 114 break; 115 115 116 116 case SPRN_SPRG0: 117 - vcpu->arch.shared->sprg0 = spr_val; 117 + kvmppc_set_sprg0(vcpu, spr_val); 118 118 break; 119 119 case SPRN_SPRG1: 120 - vcpu->arch.shared->sprg1 = spr_val; 120 + kvmppc_set_sprg1(vcpu, spr_val); 121 121 break; 122 122 case SPRN_SPRG2: 123 - vcpu->arch.shared->sprg2 = spr_val; 123 + kvmppc_set_sprg2(vcpu, spr_val); 124 124 break; 125 125 case SPRN_SPRG3: 126 - vcpu->arch.shared->sprg3 = spr_val; 126 + kvmppc_set_sprg3(vcpu, spr_val); 127 127 break; 128 128 129 129 /* PIR can legally be written, but we ignore it */ ··· 150 150 151 151 switch (sprn) { 152 152 case SPRN_SRR0: 153 - spr_val = vcpu->arch.shared->srr0; 153 + spr_val = kvmppc_get_srr0(vcpu); 154 154 break; 155 155 case SPRN_SRR1: 156 - spr_val = vcpu->arch.shared->srr1; 156 + spr_val = kvmppc_get_srr1(vcpu); 157 157 break; 158 158 case SPRN_PVR: 159 159 spr_val = vcpu->arch.pvr; ··· 173 173 break; 174 174 175 175 case SPRN_SPRG0: 176 - spr_val = vcpu->arch.shared->sprg0; 176 + spr_val = kvmppc_get_sprg0(vcpu); 177 177 break; 178 178 case SPRN_SPRG1: 179 - spr_val = vcpu->arch.shared->sprg1; 179 + spr_val = kvmppc_get_sprg1(vcpu); 180 180 break; 181 181 case SPRN_SPRG2: 182 - spr_val = vcpu->arch.shared->sprg2; 182 + spr_val = kvmppc_get_sprg2(vcpu); 183 183 break; 184 184 case SPRN_SPRG3: 185 - spr_val = vcpu->arch.shared->sprg3; 185 + spr_val = kvmppc_get_sprg3(vcpu); 186 186 break; 187 187 /* Note: SPRG4-7 are user-readable, so we don't get 188 188 * a trap. */
+4 -1
arch/powerpc/kvm/mpic.c
··· 126 126 u32 val, int idx); 127 127 static int openpic_cpu_read_internal(void *opaque, gpa_t addr, 128 128 u32 *ptr, int idx); 129 + static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, 130 + uint32_t val); 129 131 130 132 enum irq_type { 131 133 IRQ_TYPE_NORMAL = 0, ··· 530 528 /* Initialise IRQ sources */ 531 529 for (i = 0; i < opp->max_irq; i++) { 532 530 opp->src[i].ivpr = opp->ivpr_reset; 533 - opp->src[i].idr = opp->idr_reset; 534 531 535 532 switch (opp->src[i].type) { 536 533 case IRQ_TYPE_NORMAL: ··· 544 543 case IRQ_TYPE_FSLSPECIAL: 545 544 break; 546 545 } 546 + 547 + write_IRQreg_idr(opp, i, opp->idr_reset); 547 548 } 548 549 /* Initialise IRQ destinations */ 549 550 for (i = 0; i < MAX_CPU; i++) {
+53 -11
arch/powerpc/kvm/powerpc.c
··· 125 125 } 126 126 EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter); 127 127 128 + #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) 129 + static void kvmppc_swab_shared(struct kvm_vcpu *vcpu) 130 + { 131 + struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; 132 + int i; 133 + 134 + shared->sprg0 = swab64(shared->sprg0); 135 + shared->sprg1 = swab64(shared->sprg1); 136 + shared->sprg2 = swab64(shared->sprg2); 137 + shared->sprg3 = swab64(shared->sprg3); 138 + shared->srr0 = swab64(shared->srr0); 139 + shared->srr1 = swab64(shared->srr1); 140 + shared->dar = swab64(shared->dar); 141 + shared->msr = swab64(shared->msr); 142 + shared->dsisr = swab32(shared->dsisr); 143 + shared->int_pending = swab32(shared->int_pending); 144 + for (i = 0; i < ARRAY_SIZE(shared->sr); i++) 145 + shared->sr[i] = swab32(shared->sr[i]); 146 + } 147 + #endif 148 + 128 149 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 129 150 { 130 151 int nr = kvmppc_get_gpr(vcpu, 11); ··· 156 135 unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6); 157 136 unsigned long r2 = 0; 158 137 159 - if (!(vcpu->arch.shared->msr & MSR_SF)) { 138 + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { 160 139 /* 32 bit mode */ 161 140 param1 &= 0xffffffff; 162 141 param2 &= 0xffffffff; ··· 167 146 switch (nr) { 168 147 case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE): 169 148 { 170 - vcpu->arch.magic_page_pa = param1; 171 - vcpu->arch.magic_page_ea = param2; 149 + #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) 150 + /* Book3S can be little endian, find it out here */ 151 + int shared_big_endian = true; 152 + if (vcpu->arch.intr_msr & MSR_LE) 153 + shared_big_endian = false; 154 + if (shared_big_endian != vcpu->arch.shared_big_endian) 155 + kvmppc_swab_shared(vcpu); 156 + vcpu->arch.shared_big_endian = shared_big_endian; 157 + #endif 158 + 159 + if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) { 160 + /* 161 + * Older versions of the Linux magic page code had 162 + * a bug 
where they would map their trampoline code 163 + * NX. If that's the case, remove !PR NX capability. 164 + */ 165 + vcpu->arch.disable_kernel_nx = true; 166 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 167 + } 168 + 169 + vcpu->arch.magic_page_pa = param1 & ~0xfffULL; 170 + vcpu->arch.magic_page_ea = param2 & ~0xfffULL; 172 171 173 172 r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; 174 173 ··· 416 375 case KVM_CAP_SPAPR_TCE: 417 376 case KVM_CAP_PPC_ALLOC_HTAB: 418 377 case KVM_CAP_PPC_RTAS: 378 + case KVM_CAP_PPC_FIXUP_HCALL: 419 379 #ifdef CONFIG_KVM_XICS 420 380 case KVM_CAP_IRQ_XICS: 421 381 #endif ··· 1057 1015 u32 inst_nop = 0x60000000; 1058 1016 #ifdef CONFIG_KVM_BOOKE_HV 1059 1017 u32 inst_sc1 = 0x44000022; 1060 - pvinfo->hcall[0] = inst_sc1; 1061 - pvinfo->hcall[1] = inst_nop; 1062 - pvinfo->hcall[2] = inst_nop; 1063 - pvinfo->hcall[3] = inst_nop; 1018 + pvinfo->hcall[0] = cpu_to_be32(inst_sc1); 1019 + pvinfo->hcall[1] = cpu_to_be32(inst_nop); 1020 + pvinfo->hcall[2] = cpu_to_be32(inst_nop); 1021 + pvinfo->hcall[3] = cpu_to_be32(inst_nop); 1064 1022 #else 1065 1023 u32 inst_lis = 0x3c000000; 1066 1024 u32 inst_ori = 0x60000000; ··· 1076 1034 * sc 1077 1035 * nop 1078 1036 */ 1079 - pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask); 1080 - pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); 1081 - pvinfo->hcall[2] = inst_sc; 1082 - pvinfo->hcall[3] = inst_nop; 1037 + pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask)); 1038 + pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask)); 1039 + pvinfo->hcall[2] = cpu_to_be32(inst_sc); 1040 + pvinfo->hcall[3] = cpu_to_be32(inst_nop); 1083 1041 #endif 1084 1042 1085 1043 pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
+1 -1
arch/powerpc/kvm/trace_pr.h
··· 255 255 __entry->exit_nr = exit_nr; 256 256 __entry->pc = kvmppc_get_pc(vcpu); 257 257 __entry->dar = kvmppc_get_fault_dar(vcpu); 258 - __entry->msr = vcpu->arch.shared->msr; 258 + __entry->msr = kvmppc_get_msr(vcpu); 259 259 __entry->srr1 = vcpu->arch.shadow_srr1; 260 260 __entry->last_inst = vcpu->arch.last_inst; 261 261 ),
+1 -1
arch/powerpc/mm/slb.c
··· 97 97 static void __slb_flush_and_rebolt(void) 98 98 { 99 99 /* If you change this make sure you change SLB_NUM_BOLTED 100 - * appropriately too. */ 100 + * and PR KVM appropriately too. */ 101 101 unsigned long linear_llp, vmalloc_llp, lflags, vflags; 102 102 unsigned long ksp_esid_data, ksp_vsid_data; 103 103
+14
arch/s390/include/asm/ctl_reg.h
··· 57 57 void smp_ctl_set_bit(int cr, int bit); 58 58 void smp_ctl_clear_bit(int cr, int bit); 59 59 60 + union ctlreg0 { 61 + unsigned long val; 62 + struct { 63 + #ifdef CONFIG_64BIT 64 + unsigned long : 32; 65 + #endif 66 + unsigned long : 3; 67 + unsigned long lap : 1; /* Low-address-protection control */ 68 + unsigned long : 4; 69 + unsigned long edat : 1; /* Enhanced-DAT-enablement control */ 70 + unsigned long : 23; 71 + }; 72 + }; 73 + 60 74 #ifdef CONFIG_SMP 61 75 # define ctl_set_bit(cr, bit) smp_ctl_set_bit(cr, bit) 62 76 # define ctl_clear_bit(cr, bit) smp_ctl_clear_bit(cr, bit)
+149 -14
arch/s390/include/asm/kvm_host.h
··· 32 32 #define KVM_NR_IRQCHIPS 1 33 33 #define KVM_IRQCHIP_NUM_PINS 4096 34 34 35 + #define SIGP_CTRL_C 0x00800000 36 + 35 37 struct sca_entry { 36 - atomic_t scn; 38 + atomic_t ctrl; 37 39 __u32 reserved; 38 40 __u64 sda; 39 41 __u64 reserved2[2]; 40 42 } __attribute__((packed)); 41 43 44 + union ipte_control { 45 + unsigned long val; 46 + struct { 47 + unsigned long k : 1; 48 + unsigned long kh : 31; 49 + unsigned long kg : 32; 50 + }; 51 + }; 42 52 43 53 struct sca_block { 44 - __u64 ipte_control; 54 + union ipte_control ipte_control; 45 55 __u64 reserved[5]; 46 56 __u64 mcn; 47 57 __u64 reserved2; ··· 74 64 #define CPUSTAT_ZARCH 0x00000800 75 65 #define CPUSTAT_MCDS 0x00000100 76 66 #define CPUSTAT_SM 0x00000080 67 + #define CPUSTAT_IBS 0x00000040 77 68 #define CPUSTAT_G 0x00000008 78 69 #define CPUSTAT_GED 0x00000004 79 70 #define CPUSTAT_J 0x00000002 ··· 82 71 83 72 struct kvm_s390_sie_block { 84 73 atomic_t cpuflags; /* 0x0000 */ 85 - __u32 prefix; /* 0x0004 */ 74 + __u32 : 1; /* 0x0004 */ 75 + __u32 prefix : 18; 76 + __u32 : 13; 86 77 __u8 reserved08[4]; /* 0x0008 */ 87 78 #define PROG_IN_SIE (1<<0) 88 79 __u32 prog0c; /* 0x000c */ ··· 98 85 __u8 reserved40[4]; /* 0x0040 */ 99 86 #define LCTL_CR0 0x8000 100 87 #define LCTL_CR6 0x0200 88 + #define LCTL_CR9 0x0040 89 + #define LCTL_CR10 0x0020 90 + #define LCTL_CR11 0x0010 101 91 #define LCTL_CR14 0x0002 102 92 __u16 lctl; /* 0x0044 */ 103 93 __s16 icpua; /* 0x0046 */ 104 - #define ICTL_LPSW 0x00400000 94 + #define ICTL_PINT 0x20000000 95 + #define ICTL_LPSW 0x00400000 96 + #define ICTL_STCTL 0x00040000 97 + #define ICTL_ISKE 0x00004000 98 + #define ICTL_SSKE 0x00002000 99 + #define ICTL_RRBE 0x00001000 100 + #define ICTL_TPROT 0x00000200 105 101 __u32 ictl; /* 0x0048 */ 106 102 __u32 eca; /* 0x004c */ 103 + #define ICPT_INST 0x04 104 + #define ICPT_PROGI 0x08 105 + #define ICPT_INSTPROGI 0x0C 106 + #define ICPT_OPEREXC 0x2C 107 + #define ICPT_PARTEXEC 0x38 108 + #define ICPT_IOINST 0x40 107 109 __u8 
icptcode; /* 0x0050 */ 108 110 __u8 reserved51; /* 0x0051 */ 109 111 __u16 ihcpu; /* 0x0052 */ ··· 137 109 psw_t gpsw; /* 0x0090 */ 138 110 __u64 gg14; /* 0x00a0 */ 139 111 __u64 gg15; /* 0x00a8 */ 140 - __u8 reservedb0[30]; /* 0x00b0 */ 141 - __u16 iprcc; /* 0x00ce */ 142 - __u8 reservedd0[48]; /* 0x00d0 */ 112 + __u8 reservedb0[20]; /* 0x00b0 */ 113 + __u16 extcpuaddr; /* 0x00c4 */ 114 + __u16 eic; /* 0x00c6 */ 115 + __u32 reservedc8; /* 0x00c8 */ 116 + __u16 pgmilc; /* 0x00cc */ 117 + __u16 iprcc; /* 0x00ce */ 118 + __u32 dxc; /* 0x00d0 */ 119 + __u16 mcn; /* 0x00d4 */ 120 + __u8 perc; /* 0x00d6 */ 121 + __u8 peratmid; /* 0x00d7 */ 122 + __u64 peraddr; /* 0x00d8 */ 123 + __u8 eai; /* 0x00e0 */ 124 + __u8 peraid; /* 0x00e1 */ 125 + __u8 oai; /* 0x00e2 */ 126 + __u8 armid; /* 0x00e3 */ 127 + __u8 reservede4[4]; /* 0x00e4 */ 128 + __u64 tecmc; /* 0x00e8 */ 129 + __u8 reservedf0[16]; /* 0x00f0 */ 143 130 __u64 gcr[16]; /* 0x0100 */ 144 131 __u64 gbea; /* 0x0180 */ 145 132 __u8 reserved188[24]; /* 0x0188 */ ··· 189 146 u32 exit_instruction; 190 147 u32 instruction_lctl; 191 148 u32 instruction_lctlg; 149 + u32 instruction_stctl; 150 + u32 instruction_stctg; 192 151 u32 exit_program_interruption; 193 152 u32 exit_instr_and_program; 194 153 u32 deliver_external_call; ··· 209 164 u32 instruction_stpx; 210 165 u32 instruction_stap; 211 166 u32 instruction_storage_key; 167 + u32 instruction_ipte_interlock; 212 168 u32 instruction_stsch; 213 169 u32 instruction_chsc; 214 170 u32 instruction_stsi; ··· 229 183 u32 diagnose_9c; 230 184 }; 231 185 232 - #define PGM_OPERATION 0x01 233 - #define PGM_PRIVILEGED_OP 0x02 234 - #define PGM_EXECUTE 0x03 235 - #define PGM_PROTECTION 0x04 236 - #define PGM_ADDRESSING 0x05 237 - #define PGM_SPECIFICATION 0x06 238 - #define PGM_DATA 0x07 186 + #define PGM_OPERATION 0x01 187 + #define PGM_PRIVILEGED_OP 0x02 188 + #define PGM_EXECUTE 0x03 189 + #define PGM_PROTECTION 0x04 190 + #define PGM_ADDRESSING 0x05 191 + #define PGM_SPECIFICATION 
0x06 192 + #define PGM_DATA 0x07 193 + #define PGM_FIXED_POINT_OVERFLOW 0x08 194 + #define PGM_FIXED_POINT_DIVIDE 0x09 195 + #define PGM_DECIMAL_OVERFLOW 0x0a 196 + #define PGM_DECIMAL_DIVIDE 0x0b 197 + #define PGM_HFP_EXPONENT_OVERFLOW 0x0c 198 + #define PGM_HFP_EXPONENT_UNDERFLOW 0x0d 199 + #define PGM_HFP_SIGNIFICANCE 0x0e 200 + #define PGM_HFP_DIVIDE 0x0f 201 + #define PGM_SEGMENT_TRANSLATION 0x10 202 + #define PGM_PAGE_TRANSLATION 0x11 203 + #define PGM_TRANSLATION_SPEC 0x12 204 + #define PGM_SPECIAL_OPERATION 0x13 205 + #define PGM_OPERAND 0x15 206 + #define PGM_TRACE_TABEL 0x16 207 + #define PGM_SPACE_SWITCH 0x1c 208 + #define PGM_HFP_SQUARE_ROOT 0x1d 209 + #define PGM_PC_TRANSLATION_SPEC 0x1f 210 + #define PGM_AFX_TRANSLATION 0x20 211 + #define PGM_ASX_TRANSLATION 0x21 212 + #define PGM_LX_TRANSLATION 0x22 213 + #define PGM_EX_TRANSLATION 0x23 214 + #define PGM_PRIMARY_AUTHORITY 0x24 215 + #define PGM_SECONDARY_AUTHORITY 0x25 216 + #define PGM_LFX_TRANSLATION 0x26 217 + #define PGM_LSX_TRANSLATION 0x27 218 + #define PGM_ALET_SPECIFICATION 0x28 219 + #define PGM_ALEN_TRANSLATION 0x29 220 + #define PGM_ALE_SEQUENCE 0x2a 221 + #define PGM_ASTE_VALIDITY 0x2b 222 + #define PGM_ASTE_SEQUENCE 0x2c 223 + #define PGM_EXTENDED_AUTHORITY 0x2d 224 + #define PGM_LSTE_SEQUENCE 0x2e 225 + #define PGM_ASTE_INSTANCE 0x2f 226 + #define PGM_STACK_FULL 0x30 227 + #define PGM_STACK_EMPTY 0x31 228 + #define PGM_STACK_SPECIFICATION 0x32 229 + #define PGM_STACK_TYPE 0x33 230 + #define PGM_STACK_OPERATION 0x34 231 + #define PGM_ASCE_TYPE 0x38 232 + #define PGM_REGION_FIRST_TRANS 0x39 233 + #define PGM_REGION_SECOND_TRANS 0x3a 234 + #define PGM_REGION_THIRD_TRANS 0x3b 235 + #define PGM_MONITOR 0x40 236 + #define PGM_PER 0x80 237 + #define PGM_CRYPTO_OPERATION 0x119 239 238 240 239 struct kvm_s390_interrupt_info { 241 240 struct list_head list; ··· 320 229 unsigned int irq_count; 321 230 }; 322 231 232 + struct kvm_hw_wp_info_arch { 233 + unsigned long addr; 234 + unsigned long 
phys_addr; 235 + int len; 236 + char *old_data; 237 + }; 238 + 239 + struct kvm_hw_bp_info_arch { 240 + unsigned long addr; 241 + int len; 242 + }; 243 + 244 + /* 245 + * Only the upper 16 bits of kvm_guest_debug->control are arch specific. 246 + * Further KVM_GUESTDBG flags which an be used from userspace can be found in 247 + * arch/s390/include/uapi/asm/kvm.h 248 + */ 249 + #define KVM_GUESTDBG_EXIT_PENDING 0x10000000 250 + 251 + #define guestdbg_enabled(vcpu) \ 252 + (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) 253 + #define guestdbg_sstep_enabled(vcpu) \ 254 + (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 255 + #define guestdbg_hw_bp_enabled(vcpu) \ 256 + (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 257 + #define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \ 258 + (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING)) 259 + 260 + struct kvm_guestdbg_info_arch { 261 + unsigned long cr0; 262 + unsigned long cr9; 263 + unsigned long cr10; 264 + unsigned long cr11; 265 + struct kvm_hw_bp_info_arch *hw_bp_info; 266 + struct kvm_hw_wp_info_arch *hw_wp_info; 267 + int nr_hw_bp; 268 + int nr_hw_wp; 269 + unsigned long last_bp; 270 + }; 323 271 324 272 struct kvm_vcpu_arch { 325 273 struct kvm_s390_sie_block *sie_block; ··· 368 238 struct kvm_s390_local_interrupt local_int; 369 239 struct hrtimer ckc_timer; 370 240 struct tasklet_struct tasklet; 241 + struct kvm_s390_pgm_info pgm; 371 242 union { 372 243 struct cpuid cpu_id; 373 244 u64 stidp_data; 374 245 }; 375 246 struct gmap *gmap; 247 + struct kvm_guestdbg_info_arch guestdbg; 376 248 #define KVM_S390_PFAULT_TOKEN_INVALID (-1UL) 377 249 unsigned long pfault_token; 378 250 unsigned long pfault_select; ··· 417 285 struct gmap *gmap; 418 286 int css_support; 419 287 int use_irqchip; 288 + int use_cmma; 420 289 struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; 290 + wait_queue_head_t ipte_wq; 291 + spinlock_t start_stop_lock; 421 292 }; 422 293 423 294 #define KVM_HVA_ERR_BAD (-1UL)
+6 -4
arch/s390/include/asm/lowcore.h
··· 56 56 __u16 pgm_code; /* 0x008e */ 57 57 __u32 trans_exc_code; /* 0x0090 */ 58 58 __u16 mon_class_num; /* 0x0094 */ 59 - __u16 per_perc_atmid; /* 0x0096 */ 59 + __u8 per_code; /* 0x0096 */ 60 + __u8 per_atmid; /* 0x0097 */ 60 61 __u32 per_address; /* 0x0098 */ 61 62 __u32 monitor_code; /* 0x009c */ 62 63 __u8 exc_access_id; /* 0x00a0 */ 63 64 __u8 per_access_id; /* 0x00a1 */ 64 65 __u8 op_access_id; /* 0x00a2 */ 65 - __u8 ar_access_id; /* 0x00a3 */ 66 + __u8 ar_mode_id; /* 0x00a3 */ 66 67 __u8 pad_0x00a4[0x00b8-0x00a4]; /* 0x00a4 */ 67 68 __u16 subchannel_id; /* 0x00b8 */ 68 69 __u16 subchannel_nr; /* 0x00ba */ ··· 196 195 __u16 pgm_code; /* 0x008e */ 197 196 __u32 data_exc_code; /* 0x0090 */ 198 197 __u16 mon_class_num; /* 0x0094 */ 199 - __u16 per_perc_atmid; /* 0x0096 */ 198 + __u8 per_code; /* 0x0096 */ 199 + __u8 per_atmid; /* 0x0097 */ 200 200 __u64 per_address; /* 0x0098 */ 201 201 __u8 exc_access_id; /* 0x00a0 */ 202 202 __u8 per_access_id; /* 0x00a1 */ 203 203 __u8 op_access_id; /* 0x00a2 */ 204 - __u8 ar_access_id; /* 0x00a3 */ 204 + __u8 ar_mode_id; /* 0x00a3 */ 205 205 __u8 pad_0x00a4[0x00a8-0x00a4]; /* 0x00a4 */ 206 206 __u64 trans_exc_code; /* 0x00a8 */ 207 207 __u64 monitor_code; /* 0x00b0 */
+2
arch/s390/include/asm/mmu.h
··· 16 16 unsigned long vdso_base; 17 17 /* The mmu context has extended page tables. */ 18 18 unsigned int has_pgste:1; 19 + /* The mmu context uses storage keys. */ 20 + unsigned int use_skey:1; 19 21 } mm_context_t; 20 22 21 23 #define INIT_MM_CONTEXT(name) \
+1
arch/s390/include/asm/mmu_context.h
··· 23 23 mm->context.asce_bits |= _ASCE_TYPE_REGION3; 24 24 #endif 25 25 mm->context.has_pgste = 0; 26 + mm->context.use_skey = 0; 26 27 mm->context.asce_limit = STACK_TOP_MAX; 27 28 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); 28 29 return 0;
+2 -1
arch/s390/include/asm/pgalloc.h
··· 22 22 void page_table_free(struct mm_struct *, unsigned long *); 23 23 void page_table_free_rcu(struct mmu_gather *, unsigned long *); 24 24 25 - void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long); 25 + void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long, 26 + bool init_skey); 26 27 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, 27 28 unsigned long key, bool nq); 28 29
+81 -88
arch/s390/include/asm/pgtable.h
··· 309 309 #define PGSTE_HC_BIT 0x00200000UL 310 310 #define PGSTE_GR_BIT 0x00040000UL 311 311 #define PGSTE_GC_BIT 0x00020000UL 312 - #define PGSTE_IN_BIT 0x00008000UL /* IPTE notify bit */ 312 + #define PGSTE_UC_BIT 0x00008000UL /* user dirty (migration) */ 313 + #define PGSTE_IN_BIT 0x00004000UL /* IPTE notify bit */ 313 314 314 315 #else /* CONFIG_64BIT */ 315 316 ··· 392 391 #define PGSTE_HC_BIT 0x0020000000000000UL 393 392 #define PGSTE_GR_BIT 0x0004000000000000UL 394 393 #define PGSTE_GC_BIT 0x0002000000000000UL 395 - #define PGSTE_IN_BIT 0x0000800000000000UL /* IPTE notify bit */ 394 + #define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ 395 + #define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ 396 396 397 397 #endif /* CONFIG_64BIT */ 398 398 ··· 468 466 #endif 469 467 return 0; 470 468 } 469 + 470 + static inline int mm_use_skey(struct mm_struct *mm) 471 + { 472 + #ifdef CONFIG_PGSTE 473 + if (mm->context.use_skey) 474 + return 1; 475 + #endif 476 + return 0; 477 + } 478 + 471 479 /* 472 480 * pgd/pmd/pte query functions 473 481 */ ··· 711 699 #endif 712 700 } 713 701 714 - static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste) 702 + static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste, 703 + struct mm_struct *mm) 715 704 { 716 705 #ifdef CONFIG_PGSTE 717 706 unsigned long address, bits, skey; 718 707 719 - if (pte_val(*ptep) & _PAGE_INVALID) 708 + if (!mm_use_skey(mm) || pte_val(*ptep) & _PAGE_INVALID) 720 709 return pgste; 721 710 address = pte_val(*ptep) & PAGE_MASK; 722 711 skey = (unsigned long) page_get_storage_key(address); 723 712 bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); 724 - if (!(pgste_val(pgste) & PGSTE_HC_BIT) && (bits & _PAGE_CHANGED)) { 725 - /* Transfer dirty + referenced bit to host bits in pgste */ 726 - pgste_val(pgste) |= bits << 52; 727 - page_set_storage_key(address, skey ^ bits, 0); 728 - } else if (!(pgste_val(pgste) & PGSTE_HR_BIT) && 729 - (bits & _PAGE_REFERENCED)) 
{ 730 - /* Transfer referenced bit to host bit in pgste */ 731 - pgste_val(pgste) |= PGSTE_HR_BIT; 732 - page_reset_referenced(address); 733 - } 734 713 /* Transfer page changed & referenced bit to guest bits in pgste */ 735 714 pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ 736 715 /* Copy page access key and fetch protection bit to pgste */ ··· 732 729 733 730 } 734 731 735 - static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste) 736 - { 737 - #ifdef CONFIG_PGSTE 738 - if (pte_val(*ptep) & _PAGE_INVALID) 739 - return pgste; 740 - /* Get referenced bit from storage key */ 741 - if (page_reset_referenced(pte_val(*ptep) & PAGE_MASK)) 742 - pgste_val(pgste) |= PGSTE_HR_BIT | PGSTE_GR_BIT; 743 - #endif 744 - return pgste; 745 - } 746 - 747 - static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry) 732 + static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, 733 + struct mm_struct *mm) 748 734 { 749 735 #ifdef CONFIG_PGSTE 750 736 unsigned long address; 751 737 unsigned long nkey; 752 738 753 - if (pte_val(entry) & _PAGE_INVALID) 739 + if (!mm_use_skey(mm) || pte_val(entry) & _PAGE_INVALID) 754 740 return; 755 741 VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); 756 742 address = pte_val(entry) & PAGE_MASK; ··· 749 757 * key C/R to 0. 750 758 */ 751 759 nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; 760 + nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; 752 761 page_set_storage_key(address, nkey, 0); 753 762 #endif 754 763 } 755 764 756 - static inline void pgste_set_pte(pte_t *ptep, pte_t entry) 765 + static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) 757 766 { 758 - if (!MACHINE_HAS_ESOP && 759 - (pte_val(entry) & _PAGE_PRESENT) && 760 - (pte_val(entry) & _PAGE_WRITE)) { 761 - /* 762 - * Without enhanced suppression-on-protection force 763 - * the dirty bit on for all writable ptes. 
764 - */ 765 - pte_val(entry) |= _PAGE_DIRTY; 766 - pte_val(entry) &= ~_PAGE_PROTECT; 767 + if ((pte_val(entry) & _PAGE_PRESENT) && 768 + (pte_val(entry) & _PAGE_WRITE) && 769 + !(pte_val(entry) & _PAGE_INVALID)) { 770 + if (!MACHINE_HAS_ESOP) { 771 + /* 772 + * Without enhanced suppression-on-protection force 773 + * the dirty bit on for all writable ptes. 774 + */ 775 + pte_val(entry) |= _PAGE_DIRTY; 776 + pte_val(entry) &= ~_PAGE_PROTECT; 777 + } 778 + if (!(pte_val(entry) & _PAGE_PROTECT)) 779 + /* This pte allows write access, set user-dirty */ 780 + pgste_val(pgste) |= PGSTE_UC_BIT; 767 781 } 768 782 *ptep = entry; 783 + return pgste; 769 784 } 770 785 771 786 /** ··· 838 839 unsigned long gmap_fault(unsigned long address, struct gmap *); 839 840 void gmap_discard(unsigned long from, unsigned long to, struct gmap *); 840 841 void __gmap_zap(unsigned long address, struct gmap *); 842 + bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *); 843 + 841 844 842 845 void gmap_register_ipte_notifier(struct gmap_notifier *); 843 846 void gmap_unregister_ipte_notifier(struct gmap_notifier *); ··· 871 870 if (mm_has_pgste(mm)) { 872 871 pgste = pgste_get_lock(ptep); 873 872 pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; 874 - pgste_set_key(ptep, pgste, entry); 875 - pgste_set_pte(ptep, entry); 873 + pgste_set_key(ptep, pgste, entry, mm); 874 + pgste = pgste_set_pte(ptep, pgste, entry); 876 875 pgste_set_unlock(ptep, pgste); 877 876 } else { 878 877 if (!(pte_val(entry) & _PAGE_INVALID) && MACHINE_HAS_EDAT1) ··· 1018 1017 } 1019 1018 #endif 1020 1019 1021 - /* 1022 - * Get (and clear) the user dirty bit for a pte. 
1023 - */ 1024 - static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm, 1025 - pte_t *ptep) 1026 - { 1027 - pgste_t pgste; 1028 - int dirty = 0; 1029 - 1030 - if (mm_has_pgste(mm)) { 1031 - pgste = pgste_get_lock(ptep); 1032 - pgste = pgste_update_all(ptep, pgste); 1033 - dirty = !!(pgste_val(pgste) & PGSTE_HC_BIT); 1034 - pgste_val(pgste) &= ~PGSTE_HC_BIT; 1035 - pgste_set_unlock(ptep, pgste); 1036 - return dirty; 1037 - } 1038 - return dirty; 1039 - } 1040 - 1041 - /* 1042 - * Get (and clear) the user referenced bit for a pte. 1043 - */ 1044 - static inline int ptep_test_and_clear_user_young(struct mm_struct *mm, 1045 - pte_t *ptep) 1046 - { 1047 - pgste_t pgste; 1048 - int young = 0; 1049 - 1050 - if (mm_has_pgste(mm)) { 1051 - pgste = pgste_get_lock(ptep); 1052 - pgste = pgste_update_young(ptep, pgste); 1053 - young = !!(pgste_val(pgste) & PGSTE_HR_BIT); 1054 - pgste_val(pgste) &= ~PGSTE_HR_BIT; 1055 - pgste_set_unlock(ptep, pgste); 1056 - } 1057 - return young; 1058 - } 1059 - 1060 1020 static inline void __ptep_ipte(unsigned long address, pte_t *ptep) 1061 1021 { 1062 1022 unsigned long pto = (unsigned long) ptep; ··· 1080 1118 atomic_sub(0x10000, &mm->context.attach_count); 1081 1119 } 1082 1120 1121 + /* 1122 + * Get (and clear) the user dirty bit for a pte. 
1123 + */ 1124 + static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm, 1125 + unsigned long addr, 1126 + pte_t *ptep) 1127 + { 1128 + pgste_t pgste; 1129 + pte_t pte; 1130 + int dirty; 1131 + 1132 + if (!mm_has_pgste(mm)) 1133 + return 0; 1134 + pgste = pgste_get_lock(ptep); 1135 + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); 1136 + pgste_val(pgste) &= ~PGSTE_UC_BIT; 1137 + pte = *ptep; 1138 + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { 1139 + pgste = pgste_ipte_notify(mm, ptep, pgste); 1140 + __ptep_ipte(addr, ptep); 1141 + if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) 1142 + pte_val(pte) |= _PAGE_PROTECT; 1143 + else 1144 + pte_val(pte) |= _PAGE_INVALID; 1145 + *ptep = pte; 1146 + } 1147 + pgste_set_unlock(ptep, pgste); 1148 + return dirty; 1149 + } 1150 + 1083 1151 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 1084 1152 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, 1085 1153 unsigned long addr, pte_t *ptep) ··· 1129 1137 pte = pte_mkold(pte); 1130 1138 1131 1139 if (mm_has_pgste(vma->vm_mm)) { 1132 - pgste_set_pte(ptep, pte); 1140 + pgste = pgste_set_pte(ptep, pgste, pte); 1133 1141 pgste_set_unlock(ptep, pgste); 1134 1142 } else 1135 1143 *ptep = pte; ··· 1174 1182 pte_val(*ptep) = _PAGE_INVALID; 1175 1183 1176 1184 if (mm_has_pgste(mm)) { 1177 - pgste = pgste_update_all(&pte, pgste); 1185 + pgste = pgste_update_all(&pte, pgste, mm); 1178 1186 pgste_set_unlock(ptep, pgste); 1179 1187 } 1180 1188 return pte; ··· 1197 1205 ptep_flush_lazy(mm, address, ptep); 1198 1206 1199 1207 if (mm_has_pgste(mm)) { 1200 - pgste = pgste_update_all(&pte, pgste); 1208 + pgste = pgste_update_all(&pte, pgste, mm); 1201 1209 pgste_set(ptep, pgste); 1202 1210 } 1203 1211 return pte; ··· 1211 1219 1212 1220 if (mm_has_pgste(mm)) { 1213 1221 pgste = pgste_get(ptep); 1214 - pgste_set_key(ptep, pgste, pte); 1215 - pgste_set_pte(ptep, pte); 1222 + pgste_set_key(ptep, pgste, pte, mm); 1223 + pgste = pgste_set_pte(ptep, pgste, pte); 
1216 1224 pgste_set_unlock(ptep, pgste); 1217 1225 } else 1218 1226 *ptep = pte; ··· 1238 1246 if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == 1239 1247 _PGSTE_GPS_USAGE_UNUSED) 1240 1248 pte_val(pte) |= _PAGE_UNUSED; 1241 - pgste = pgste_update_all(&pte, pgste); 1249 + pgste = pgste_update_all(&pte, pgste, vma->vm_mm); 1242 1250 pgste_set_unlock(ptep, pgste); 1243 1251 } 1244 1252 return pte; ··· 1270 1278 pte_val(*ptep) = _PAGE_INVALID; 1271 1279 1272 1280 if (!full && mm_has_pgste(mm)) { 1273 - pgste = pgste_update_all(&pte, pgste); 1281 + pgste = pgste_update_all(&pte, pgste, mm); 1274 1282 pgste_set_unlock(ptep, pgste); 1275 1283 } 1276 1284 return pte; ··· 1293 1301 pte = pte_wrprotect(pte); 1294 1302 1295 1303 if (mm_has_pgste(mm)) { 1296 - pgste_set_pte(ptep, pte); 1304 + pgste = pgste_set_pte(ptep, pgste, pte); 1297 1305 pgste_set_unlock(ptep, pgste); 1298 1306 } else 1299 1307 *ptep = pte; ··· 1318 1326 ptep_flush_direct(vma->vm_mm, address, ptep); 1319 1327 1320 1328 if (mm_has_pgste(vma->vm_mm)) { 1321 - pgste_set_pte(ptep, entry); 1329 + pgste = pgste_set_pte(ptep, pgste, entry); 1322 1330 pgste_set_unlock(ptep, pgste); 1323 1331 } else 1324 1332 *ptep = entry; ··· 1726 1734 extern int vmem_add_mapping(unsigned long start, unsigned long size); 1727 1735 extern int vmem_remove_mapping(unsigned long start, unsigned long size); 1728 1736 extern int s390_enable_sie(void); 1737 + extern void s390_enable_skey(void); 1729 1738 1730 1739 /* 1731 1740 * No page table caches to initialise
+44
arch/s390/include/asm/ptrace.h
··· 22 22 PSW_DEFAULT_KEY | PSW_MASK_BASE | PSW_MASK_MCHECK | \ 23 23 PSW_MASK_PSTATE | PSW_ASC_PRIMARY) 24 24 25 + struct psw_bits { 26 + unsigned long long : 1; 27 + unsigned long long r : 1; /* PER-Mask */ 28 + unsigned long long : 3; 29 + unsigned long long t : 1; /* DAT Mode */ 30 + unsigned long long i : 1; /* Input/Output Mask */ 31 + unsigned long long e : 1; /* External Mask */ 32 + unsigned long long key : 4; /* PSW Key */ 33 + unsigned long long : 1; 34 + unsigned long long m : 1; /* Machine-Check Mask */ 35 + unsigned long long w : 1; /* Wait State */ 36 + unsigned long long p : 1; /* Problem State */ 37 + unsigned long long as : 2; /* Address Space Control */ 38 + unsigned long long cc : 2; /* Condition Code */ 39 + unsigned long long pm : 4; /* Program Mask */ 40 + unsigned long long ri : 1; /* Runtime Instrumentation */ 41 + unsigned long long : 6; 42 + unsigned long long eaba : 2; /* Addressing Mode */ 43 + #ifdef CONFIG_64BIT 44 + unsigned long long : 31; 45 + unsigned long long ia : 64;/* Instruction Address */ 46 + #else 47 + unsigned long long ia : 31;/* Instruction Address */ 48 + #endif 49 + }; 50 + 51 + enum { 52 + PSW_AMODE_24BIT = 0, 53 + PSW_AMODE_31BIT = 1, 54 + PSW_AMODE_64BIT = 3 55 + }; 56 + 57 + enum { 58 + PSW_AS_PRIMARY = 0, 59 + PSW_AS_ACCREG = 1, 60 + PSW_AS_SECONDARY = 2, 61 + PSW_AS_HOME = 3 62 + }; 63 + 64 + #define psw_bits(__psw) (*({ \ 65 + typecheck(psw_t, __psw); \ 66 + &(*(struct psw_bits *)(&(__psw))); \ 67 + })) 68 + 25 69 /* 26 70 * The pt_regs struct defines the way the registers are stored on 27 71 * the stack during a system call.
+7 -1
arch/s390/include/asm/sclp.h
··· 28 28 29 29 struct sclp_cpu_entry { 30 30 u8 address; 31 - u8 reserved0[13]; 31 + u8 reserved0[2]; 32 + u8 : 3; 33 + u8 siif : 1; 34 + u8 : 4; 35 + u8 reserved2[10]; 32 36 u8 type; 33 37 u8 reserved1; 34 38 } __attribute__((packed)); ··· 65 61 int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode); 66 62 unsigned long sclp_get_hsa_size(void); 67 63 void sclp_early_detect(void); 64 + int sclp_has_siif(void); 65 + unsigned int sclp_get_ibc(void); 68 66 69 67 #endif /* _ASM_S390_SCLP_H */
+28
arch/s390/include/uapi/asm/kvm.h
··· 15 15 #include <linux/types.h> 16 16 17 17 #define __KVM_S390 18 + #define __KVM_HAVE_GUEST_DEBUG 18 19 19 20 /* Device control API: s390-specific devices */ 20 21 #define KVM_DEV_FLIC_GET_ALL_IRQS 1 ··· 55 54 __u64 addr; 56 55 }; 57 56 57 + /* kvm attr_group on vm fd */ 58 + #define KVM_S390_VM_MEM_CTRL 0 59 + 60 + /* kvm attributes for mem_ctrl */ 61 + #define KVM_S390_VM_MEM_ENABLE_CMMA 0 62 + #define KVM_S390_VM_MEM_CLR_CMMA 1 63 + 58 64 /* for KVM_GET_REGS and KVM_SET_REGS */ 59 65 struct kvm_regs { 60 66 /* general purpose regs for s390 */ ··· 80 72 __u64 fprs[16]; 81 73 }; 82 74 75 + #define KVM_GUESTDBG_USE_HW_BP 0x00010000 76 + 77 + #define KVM_HW_BP 1 78 + #define KVM_HW_WP_WRITE 2 79 + #define KVM_SINGLESTEP 4 80 + 83 81 struct kvm_debug_exit_arch { 82 + __u64 addr; 83 + __u8 type; 84 + __u8 pad[7]; /* Should be set to 0 */ 85 + }; 86 + 87 + struct kvm_hw_breakpoint { 88 + __u64 addr; 89 + __u64 phys_addr; 90 + __u64 len; 91 + __u8 type; 92 + __u8 pad[7]; /* Should be set to 0 */ 84 93 }; 85 94 86 95 /* for KVM_SET_GUEST_DEBUG */ 87 96 struct kvm_guest_debug_arch { 97 + __u32 nr_hw_bp; 98 + __u32 pad; /* Should be set to 0 */ 99 + struct kvm_hw_breakpoint __user *hw_bp; 88 100 }; 89 101 90 102 #define KVM_SYNC_PREFIX (1UL << 0)
+245
arch/s390/include/uapi/asm/sie.h
··· 1 + #ifndef _UAPI_ASM_S390_SIE_H 2 + #define _UAPI_ASM_S390_SIE_H 3 + 4 + #include <asm/sigp.h> 5 + 6 + #define diagnose_codes \ 7 + { 0x10, "DIAG (0x10) release pages" }, \ 8 + { 0x44, "DIAG (0x44) time slice end" }, \ 9 + { 0x9c, "DIAG (0x9c) time slice end directed" }, \ 10 + { 0x204, "DIAG (0x204) logical-cpu utilization" }, \ 11 + { 0x258, "DIAG (0x258) page-reference services" }, \ 12 + { 0x308, "DIAG (0x308) ipl functions" }, \ 13 + { 0x500, "DIAG (0x500) KVM virtio functions" }, \ 14 + { 0x501, "DIAG (0x501) KVM breakpoint" } 15 + 16 + #define sigp_order_codes \ 17 + { SIGP_SENSE, "SIGP sense" }, \ 18 + { SIGP_EXTERNAL_CALL, "SIGP external call" }, \ 19 + { SIGP_EMERGENCY_SIGNAL, "SIGP emergency signal" }, \ 20 + { SIGP_STOP, "SIGP stop" }, \ 21 + { SIGP_STOP_AND_STORE_STATUS, "SIGP stop and store status" }, \ 22 + { SIGP_SET_ARCHITECTURE, "SIGP set architecture" }, \ 23 + { SIGP_SET_PREFIX, "SIGP set prefix" }, \ 24 + { SIGP_SENSE_RUNNING, "SIGP sense running" }, \ 25 + { SIGP_RESTART, "SIGP restart" }, \ 26 + { SIGP_INITIAL_CPU_RESET, "SIGP initial cpu reset" }, \ 27 + { SIGP_STORE_STATUS_AT_ADDRESS, "SIGP store status at address" } 28 + 29 + #define icpt_prog_codes \ 30 + { 0x0001, "Prog Operation" }, \ 31 + { 0x0002, "Prog Privileged Operation" }, \ 32 + { 0x0003, "Prog Execute" }, \ 33 + { 0x0004, "Prog Protection" }, \ 34 + { 0x0005, "Prog Addressing" }, \ 35 + { 0x0006, "Prog Specification" }, \ 36 + { 0x0007, "Prog Data" }, \ 37 + { 0x0008, "Prog Fixedpoint overflow" }, \ 38 + { 0x0009, "Prog Fixedpoint divide" }, \ 39 + { 0x000A, "Prog Decimal overflow" }, \ 40 + { 0x000B, "Prog Decimal divide" }, \ 41 + { 0x000C, "Prog HFP exponent overflow" }, \ 42 + { 0x000D, "Prog HFP exponent underflow" }, \ 43 + { 0x000E, "Prog HFP significance" }, \ 44 + { 0x000F, "Prog HFP divide" }, \ 45 + { 0x0010, "Prog Segment translation" }, \ 46 + { 0x0011, "Prog Page translation" }, \ 47 + { 0x0012, "Prog Translation specification" }, \ 48 + { 0x0013, "Prog 
Special operation" }, \ 49 + { 0x0015, "Prog Operand" }, \ 50 + { 0x0016, "Prog Trace table" }, \ 51 + { 0x0017, "Prog ASNtranslation specification" }, \ 52 + { 0x001C, "Prog Spaceswitch event" }, \ 53 + { 0x001D, "Prog HFP square root" }, \ 54 + { 0x001F, "Prog PCtranslation specification" }, \ 55 + { 0x0020, "Prog AFX translation" }, \ 56 + { 0x0021, "Prog ASX translation" }, \ 57 + { 0x0022, "Prog LX translation" }, \ 58 + { 0x0023, "Prog EX translation" }, \ 59 + { 0x0024, "Prog Primary authority" }, \ 60 + { 0x0025, "Prog Secondary authority" }, \ 61 + { 0x0026, "Prog LFXtranslation exception" }, \ 62 + { 0x0027, "Prog LSXtranslation exception" }, \ 63 + { 0x0028, "Prog ALET specification" }, \ 64 + { 0x0029, "Prog ALEN translation" }, \ 65 + { 0x002A, "Prog ALE sequence" }, \ 66 + { 0x002B, "Prog ASTE validity" }, \ 67 + { 0x002C, "Prog ASTE sequence" }, \ 68 + { 0x002D, "Prog Extended authority" }, \ 69 + { 0x002E, "Prog LSTE sequence" }, \ 70 + { 0x002F, "Prog ASTE instance" }, \ 71 + { 0x0030, "Prog Stack full" }, \ 72 + { 0x0031, "Prog Stack empty" }, \ 73 + { 0x0032, "Prog Stack specification" }, \ 74 + { 0x0033, "Prog Stack type" }, \ 75 + { 0x0034, "Prog Stack operation" }, \ 76 + { 0x0039, "Prog Region first translation" }, \ 77 + { 0x003A, "Prog Region second translation" }, \ 78 + { 0x003B, "Prog Region third translation" }, \ 79 + { 0x0040, "Prog Monitor event" }, \ 80 + { 0x0080, "Prog PER event" }, \ 81 + { 0x0119, "Prog Crypto operation" } 82 + 83 + #define exit_code_ipa0(ipa0, opcode, mnemonic) \ 84 + { (ipa0 << 8 | opcode), #ipa0 " " mnemonic } 85 + #define exit_code(opcode, mnemonic) \ 86 + { opcode, mnemonic } 87 + 88 + #define icpt_insn_codes \ 89 + exit_code_ipa0(0x01, 0x01, "PR"), \ 90 + exit_code_ipa0(0x01, 0x04, "PTFF"), \ 91 + exit_code_ipa0(0x01, 0x07, "SCKPF"), \ 92 + exit_code_ipa0(0xAA, 0x00, "RINEXT"), \ 93 + exit_code_ipa0(0xAA, 0x01, "RION"), \ 94 + exit_code_ipa0(0xAA, 0x02, "TRIC"), \ 95 + exit_code_ipa0(0xAA, 0x03, "RIOFF"), 
\ 96 + exit_code_ipa0(0xAA, 0x04, "RIEMIT"), \ 97 + exit_code_ipa0(0xB2, 0x02, "STIDP"), \ 98 + exit_code_ipa0(0xB2, 0x04, "SCK"), \ 99 + exit_code_ipa0(0xB2, 0x05, "STCK"), \ 100 + exit_code_ipa0(0xB2, 0x06, "SCKC"), \ 101 + exit_code_ipa0(0xB2, 0x07, "STCKC"), \ 102 + exit_code_ipa0(0xB2, 0x08, "SPT"), \ 103 + exit_code_ipa0(0xB2, 0x09, "STPT"), \ 104 + exit_code_ipa0(0xB2, 0x0d, "PTLB"), \ 105 + exit_code_ipa0(0xB2, 0x10, "SPX"), \ 106 + exit_code_ipa0(0xB2, 0x11, "STPX"), \ 107 + exit_code_ipa0(0xB2, 0x12, "STAP"), \ 108 + exit_code_ipa0(0xB2, 0x14, "SIE"), \ 109 + exit_code_ipa0(0xB2, 0x16, "SETR"), \ 110 + exit_code_ipa0(0xB2, 0x17, "STETR"), \ 111 + exit_code_ipa0(0xB2, 0x18, "PC"), \ 112 + exit_code_ipa0(0xB2, 0x20, "SERVC"), \ 113 + exit_code_ipa0(0xB2, 0x28, "PT"), \ 114 + exit_code_ipa0(0xB2, 0x29, "ISKE"), \ 115 + exit_code_ipa0(0xB2, 0x2a, "RRBE"), \ 116 + exit_code_ipa0(0xB2, 0x2b, "SSKE"), \ 117 + exit_code_ipa0(0xB2, 0x2c, "TB"), \ 118 + exit_code_ipa0(0xB2, 0x2e, "PGIN"), \ 119 + exit_code_ipa0(0xB2, 0x2f, "PGOUT"), \ 120 + exit_code_ipa0(0xB2, 0x30, "CSCH"), \ 121 + exit_code_ipa0(0xB2, 0x31, "HSCH"), \ 122 + exit_code_ipa0(0xB2, 0x32, "MSCH"), \ 123 + exit_code_ipa0(0xB2, 0x33, "SSCH"), \ 124 + exit_code_ipa0(0xB2, 0x34, "STSCH"), \ 125 + exit_code_ipa0(0xB2, 0x35, "TSCH"), \ 126 + exit_code_ipa0(0xB2, 0x36, "TPI"), \ 127 + exit_code_ipa0(0xB2, 0x37, "SAL"), \ 128 + exit_code_ipa0(0xB2, 0x38, "RSCH"), \ 129 + exit_code_ipa0(0xB2, 0x39, "STCRW"), \ 130 + exit_code_ipa0(0xB2, 0x3a, "STCPS"), \ 131 + exit_code_ipa0(0xB2, 0x3b, "RCHP"), \ 132 + exit_code_ipa0(0xB2, 0x3c, "SCHM"), \ 133 + exit_code_ipa0(0xB2, 0x40, "BAKR"), \ 134 + exit_code_ipa0(0xB2, 0x48, "PALB"), \ 135 + exit_code_ipa0(0xB2, 0x4c, "TAR"), \ 136 + exit_code_ipa0(0xB2, 0x50, "CSP"), \ 137 + exit_code_ipa0(0xB2, 0x54, "MVPG"), \ 138 + exit_code_ipa0(0xB2, 0x58, "BSG"), \ 139 + exit_code_ipa0(0xB2, 0x5a, "BSA"), \ 140 + exit_code_ipa0(0xB2, 0x5f, "CHSC"), \ 141 + exit_code_ipa0(0xB2, 
0x74, "SIGA"), \ 142 + exit_code_ipa0(0xB2, 0x76, "XSCH"), \ 143 + exit_code_ipa0(0xB2, 0x78, "STCKE"), \ 144 + exit_code_ipa0(0xB2, 0x7c, "STCKF"), \ 145 + exit_code_ipa0(0xB2, 0x7d, "STSI"), \ 146 + exit_code_ipa0(0xB2, 0xb0, "STFLE"), \ 147 + exit_code_ipa0(0xB2, 0xb1, "STFL"), \ 148 + exit_code_ipa0(0xB2, 0xb2, "LPSWE"), \ 149 + exit_code_ipa0(0xB2, 0xf8, "TEND"), \ 150 + exit_code_ipa0(0xB2, 0xfc, "TABORT"), \ 151 + exit_code_ipa0(0xB9, 0x1e, "KMAC"), \ 152 + exit_code_ipa0(0xB9, 0x28, "PCKMO"), \ 153 + exit_code_ipa0(0xB9, 0x2a, "KMF"), \ 154 + exit_code_ipa0(0xB9, 0x2b, "KMO"), \ 155 + exit_code_ipa0(0xB9, 0x2d, "KMCTR"), \ 156 + exit_code_ipa0(0xB9, 0x2e, "KM"), \ 157 + exit_code_ipa0(0xB9, 0x2f, "KMC"), \ 158 + exit_code_ipa0(0xB9, 0x3e, "KIMD"), \ 159 + exit_code_ipa0(0xB9, 0x3f, "KLMD"), \ 160 + exit_code_ipa0(0xB9, 0x8a, "CSPG"), \ 161 + exit_code_ipa0(0xB9, 0x8d, "EPSW"), \ 162 + exit_code_ipa0(0xB9, 0x8e, "IDTE"), \ 163 + exit_code_ipa0(0xB9, 0x8f, "CRDTE"), \ 164 + exit_code_ipa0(0xB9, 0x9c, "EQBS"), \ 165 + exit_code_ipa0(0xB9, 0xa2, "PTF"), \ 166 + exit_code_ipa0(0xB9, 0xab, "ESSA"), \ 167 + exit_code_ipa0(0xB9, 0xae, "RRBM"), \ 168 + exit_code_ipa0(0xB9, 0xaf, "PFMF"), \ 169 + exit_code_ipa0(0xE3, 0x03, "LRAG"), \ 170 + exit_code_ipa0(0xE3, 0x13, "LRAY"), \ 171 + exit_code_ipa0(0xE3, 0x25, "NTSTG"), \ 172 + exit_code_ipa0(0xE5, 0x00, "LASP"), \ 173 + exit_code_ipa0(0xE5, 0x01, "TPROT"), \ 174 + exit_code_ipa0(0xE5, 0x60, "TBEGIN"), \ 175 + exit_code_ipa0(0xE5, 0x61, "TBEGINC"), \ 176 + exit_code_ipa0(0xEB, 0x25, "STCTG"), \ 177 + exit_code_ipa0(0xEB, 0x2f, "LCTLG"), \ 178 + exit_code_ipa0(0xEB, 0x60, "LRIC"), \ 179 + exit_code_ipa0(0xEB, 0x61, "STRIC"), \ 180 + exit_code_ipa0(0xEB, 0x62, "MRIC"), \ 181 + exit_code_ipa0(0xEB, 0x8a, "SQBS"), \ 182 + exit_code_ipa0(0xC8, 0x01, "ECTG"), \ 183 + exit_code(0x0a, "SVC"), \ 184 + exit_code(0x80, "SSM"), \ 185 + exit_code(0x82, "LPSW"), \ 186 + exit_code(0x83, "DIAG"), \ 187 + exit_code(0xae, "SIGP"), \ 
188 + exit_code(0xac, "STNSM"), \ 189 + exit_code(0xad, "STOSM"), \ 190 + exit_code(0xb1, "LRA"), \ 191 + exit_code(0xb6, "STCTL"), \ 192 + exit_code(0xb7, "LCTL"), \ 193 + exit_code(0xee, "PLO") 194 + 195 + #define sie_intercept_code \ 196 + { 0x00, "Host interruption" }, \ 197 + { 0x04, "Instruction" }, \ 198 + { 0x08, "Program interruption" }, \ 199 + { 0x0c, "Instruction and program interruption" }, \ 200 + { 0x10, "External request" }, \ 201 + { 0x14, "External interruption" }, \ 202 + { 0x18, "I/O request" }, \ 203 + { 0x1c, "Wait state" }, \ 204 + { 0x20, "Validity" }, \ 205 + { 0x28, "Stop request" }, \ 206 + { 0x2c, "Operation exception" }, \ 207 + { 0x38, "Partial-execution" }, \ 208 + { 0x3c, "I/O interruption" }, \ 209 + { 0x40, "I/O instruction" }, \ 210 + { 0x48, "Timing subset" } 211 + 212 + /* 213 + * This is the simple interceptable instructions decoder. 214 + * 215 + * It will be used as userspace interface and it can be used in places 216 + * that does not allow to use general decoder functions, 217 + * such as trace events declarations. 218 + * 219 + * Some userspace tools may want to parse this code 220 + * and would be confused by switch(), if() and other statements, 221 + * but they can understand conditional operator. 222 + */ 223 + #define INSN_DECODE_IPA0(ipa0, insn, rshift, mask) \ 224 + (insn >> 56) == (ipa0) ? \ 225 + ((ipa0 << 8) | ((insn >> rshift) & mask)) : 226 + 227 + #define INSN_DECODE(insn) (insn >> 56) 228 + 229 + /* 230 + * The macro icpt_insn_decoder() takes an intercepted instruction 231 + * and returns a key, which can be used to find a mnemonic name 232 + * of the instruction in the icpt_insn_codes table. 
233 + */ 234 + #define icpt_insn_decoder(insn) \ 235 + INSN_DECODE_IPA0(0x01, insn, 48, 0xff) \ 236 + INSN_DECODE_IPA0(0xaa, insn, 48, 0x0f) \ 237 + INSN_DECODE_IPA0(0xb2, insn, 48, 0xff) \ 238 + INSN_DECODE_IPA0(0xb9, insn, 48, 0xff) \ 239 + INSN_DECODE_IPA0(0xe3, insn, 48, 0xff) \ 240 + INSN_DECODE_IPA0(0xe5, insn, 48, 0xff) \ 241 + INSN_DECODE_IPA0(0xeb, insn, 16, 0xff) \ 242 + INSN_DECODE_IPA0(0xc8, insn, 48, 0x0f) \ 243 + INSN_DECODE(insn) 244 + 245 + #endif /* _UAPI_ASM_S390_SIE_H */
+11 -3
arch/s390/kernel/asm-offsets.c
··· 90 90 DEFINE(__LC_PGM_ILC, offsetof(struct _lowcore, pgm_ilc)); 91 91 DEFINE(__LC_PGM_INT_CODE, offsetof(struct _lowcore, pgm_code)); 92 92 DEFINE(__LC_TRANS_EXC_CODE, offsetof(struct _lowcore, trans_exc_code)); 93 - DEFINE(__LC_PER_CAUSE, offsetof(struct _lowcore, per_perc_atmid)); 93 + DEFINE(__LC_MON_CLASS_NR, offsetof(struct _lowcore, mon_class_num)); 94 + DEFINE(__LC_PER_CODE, offsetof(struct _lowcore, per_code)); 95 + DEFINE(__LC_PER_ATMID, offsetof(struct _lowcore, per_atmid)); 94 96 DEFINE(__LC_PER_ADDRESS, offsetof(struct _lowcore, per_address)); 95 - DEFINE(__LC_PER_PAID, offsetof(struct _lowcore, per_access_id)); 96 - DEFINE(__LC_AR_MODE_ID, offsetof(struct _lowcore, ar_access_id)); 97 + DEFINE(__LC_EXC_ACCESS_ID, offsetof(struct _lowcore, exc_access_id)); 98 + DEFINE(__LC_PER_ACCESS_ID, offsetof(struct _lowcore, per_access_id)); 99 + DEFINE(__LC_OP_ACCESS_ID, offsetof(struct _lowcore, op_access_id)); 100 + DEFINE(__LC_AR_MODE_ID, offsetof(struct _lowcore, ar_mode_id)); 101 + DEFINE(__LC_MON_CODE, offsetof(struct _lowcore, monitor_code)); 97 102 DEFINE(__LC_SUBCHANNEL_ID, offsetof(struct _lowcore, subchannel_id)); 98 103 DEFINE(__LC_SUBCHANNEL_NR, offsetof(struct _lowcore, subchannel_nr)); 99 104 DEFINE(__LC_IO_INT_PARM, offsetof(struct _lowcore, io_int_parm)); 100 105 DEFINE(__LC_IO_INT_WORD, offsetof(struct _lowcore, io_int_word)); 101 106 DEFINE(__LC_STFL_FAC_LIST, offsetof(struct _lowcore, stfl_fac_list)); 102 107 DEFINE(__LC_MCCK_CODE, offsetof(struct _lowcore, mcck_interruption_code)); 108 + DEFINE(__LC_MCCK_EXT_DAM_CODE, offsetof(struct _lowcore, external_damage_code)); 103 109 DEFINE(__LC_RST_OLD_PSW, offsetof(struct _lowcore, restart_old_psw)); 104 110 DEFINE(__LC_EXT_OLD_PSW, offsetof(struct _lowcore, external_old_psw)); 105 111 DEFINE(__LC_SVC_OLD_PSW, offsetof(struct _lowcore, svc_old_psw)); ··· 163 157 #ifdef CONFIG_32BIT 164 158 DEFINE(SAVE_AREA_BASE, offsetof(struct _lowcore, extended_save_area_addr)); 165 159 #else /* CONFIG_32BIT */ 
160 + DEFINE(__LC_DATA_EXC_CODE, offsetof(struct _lowcore, data_exc_code)); 161 + DEFINE(__LC_MCCK_FAIL_STOR_ADDR, offsetof(struct _lowcore, failing_storage_address)); 166 162 DEFINE(__LC_EXT_PARAMS2, offsetof(struct _lowcore, ext_params2)); 167 163 DEFINE(SAVE_AREA_BASE, offsetof(struct _lowcore, floating_pt_save_area)); 168 164 DEFINE(__LC_PASTE, offsetof(struct _lowcore, paste));
+2 -2
arch/s390/kernel/entry.S
··· 389 389 jz pgm_kprobe 390 390 oi __PT_FLAGS+3(%r11),_PIF_PER_TRAP 391 391 mvc __THREAD_per_address(4,%r1),__LC_PER_ADDRESS 392 - mvc __THREAD_per_cause(2,%r1),__LC_PER_CAUSE 393 - mvc __THREAD_per_paid(1,%r1),__LC_PER_PAID 392 + mvc __THREAD_per_cause(2,%r1),__LC_PER_CODE 393 + mvc __THREAD_per_paid(1,%r1),__LC_PER_ACCESS_ID 394 394 0: REENABLE_IRQS 395 395 xc __SF_BACKCHAIN(4,%r15),__SF_BACKCHAIN(%r15) 396 396 l %r1,BASED(.Ljump_table)
+2 -2
arch/s390/kernel/entry64.S
··· 420 420 jz pgm_kprobe 421 421 oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP 422 422 mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS 423 - mvc __THREAD_per_cause(2,%r14),__LC_PER_CAUSE 424 - mvc __THREAD_per_paid(1,%r14),__LC_PER_PAID 423 + mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE 424 + mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID 425 425 0: REENABLE_IRQS 426 426 xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) 427 427 larl %r1,pgm_check_table
+3 -1
arch/s390/kvm/Makefile
··· 11 11 12 12 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm 13 13 14 - kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o diag.o 14 + kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o 15 + kvm-objs += diag.o gaccess.o guestdbg.o 16 + 15 17 obj-$(CONFIG_KVM) += kvm.o
+6 -13
arch/s390/kvm/diag.c
··· 23 23 static int diag_release_pages(struct kvm_vcpu *vcpu) 24 24 { 25 25 unsigned long start, end; 26 - unsigned long prefix = vcpu->arch.sie_block->prefix; 26 + unsigned long prefix = kvm_s390_get_prefix(vcpu); 27 27 28 28 start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; 29 29 end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; ··· 64 64 int rc; 65 65 u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4; 66 66 u16 ry = (vcpu->arch.sie_block->ipa & 0x0f); 67 - unsigned long hva_token = KVM_HVA_ERR_BAD; 68 67 69 68 if (vcpu->run->s.regs.gprs[rx] & 7) 70 69 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 71 - if (copy_from_guest(vcpu, &parm, vcpu->run->s.regs.gprs[rx], sizeof(parm))) 72 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 70 + rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm)); 71 + if (rc) 72 + return kvm_s390_inject_prog_cond(vcpu, rc); 73 73 if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258) 74 74 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 75 75 ··· 89 89 parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) 90 90 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 91 91 92 - hva_token = gfn_to_hva(vcpu->kvm, gpa_to_gfn(parm.token_addr)); 93 - if (kvm_is_error_hva(hva_token)) 92 + if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr)) 94 93 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 95 94 96 95 vcpu->arch.pfault_token = parm.token_addr; ··· 166 167 167 168 VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode); 168 169 switch (subcode) { 169 - case 0: 170 - case 1: 171 - page_table_reset_pgste(current->mm, 0, TASK_SIZE); 172 - return -EOPNOTSUPP; 173 170 case 3: 174 171 vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; 175 - page_table_reset_pgste(current->mm, 0, TASK_SIZE); 176 172 break; 177 173 case 4: 178 174 vcpu->run->s390_reset_flags = 0; 179 - page_table_reset_pgste(current->mm, 0, 
TASK_SIZE); 180 175 break; 181 176 default: 182 177 return -EOPNOTSUPP; 183 178 } 184 179 185 - atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 180 + kvm_s390_vcpu_stop(vcpu); 186 181 vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; 187 182 vcpu->run->s390_reset_flags |= KVM_S390_RESET_IPL; 188 183 vcpu->run->s390_reset_flags |= KVM_S390_RESET_CPU_INIT;
+726
arch/s390/kvm/gaccess.c
/*
 * guest access functions
 *
 * Copyright IBM Corp. 2014
 *
 */

#include <linux/vmalloc.h>
#include <linux/err.h>
#include <asm/pgtable.h>
#include "kvm-s390.h"
#include "gaccess.h"

/* Address-space-control element (ASCE), as loaded from a control register. */
union asce {
	unsigned long val;
	struct {
		unsigned long origin : 52; /* Region- or Segment-Table Origin */
		unsigned long	 : 2;
		unsigned long g  : 1; /* Subspace Group Control */
		unsigned long p  : 1; /* Private Space Control */
		unsigned long s  : 1; /* Storage-Alteration-Event Control */
		unsigned long x  : 1; /* Space-Switch-Event Control */
		unsigned long r  : 1; /* Real-Space Control */
		unsigned long	 : 1;
		unsigned long dt : 2; /* Designation-Type Control */
		unsigned long tl : 2; /* Region- or Segment-Table Length */
	};
};

/* Values of the ASCE designation-type (dt) field. */
enum {
	ASCE_TYPE_SEGMENT = 0,
	ASCE_TYPE_REGION3 = 1,
	ASCE_TYPE_REGION2 = 2,
	ASCE_TYPE_REGION1 = 3
};

union region1_table_entry {
	unsigned long val;
	struct {
		unsigned long rto: 52;/* Region-Table Origin */
		unsigned long	 : 2;
		unsigned long p  : 1; /* DAT-Protection Bit */
		unsigned long	 : 1;
		unsigned long tf : 2; /* Region-Second-Table Offset */
		unsigned long i  : 1; /* Region-Invalid Bit */
		unsigned long	 : 1;
		unsigned long tt : 2; /* Table-Type Bits */
		unsigned long tl : 2; /* Region-Second-Table Length */
	};
};

union region2_table_entry {
	unsigned long val;
	struct {
		unsigned long rto: 52;/* Region-Table Origin */
		unsigned long	 : 2;
		unsigned long p  : 1; /* DAT-Protection Bit */
		unsigned long	 : 1;
		unsigned long tf : 2; /* Region-Third-Table Offset */
		unsigned long i  : 1; /* Region-Invalid Bit */
		unsigned long	 : 1;
		unsigned long tt : 2; /* Table-Type Bits */
		unsigned long tl : 2; /* Region-Third-Table Length */
	};
};

/* Region-third-table entry, format-control 0: points to a segment table. */
struct region3_table_entry_fc0 {
	unsigned long sto: 52;/* Segment-Table Origin */
	unsigned long	 : 1;
	unsigned long fc : 1; /* Format-Control */
	unsigned long p  : 1; /* DAT-Protection Bit */
	unsigned long	 : 1;
	unsigned long tf : 2; /* Segment-Table Offset */
	unsigned long i  : 1; /* Region-Invalid Bit */
	unsigned long cr : 1; /* Common-Region Bit */
	unsigned long tt : 2; /* Table-Type Bits */
	unsigned long tl : 2; /* Segment-Table Length */
};

/* Region-third-table entry, format-control 1: maps a large frame directly. */
struct region3_table_entry_fc1 {
	unsigned long rfaa : 33; /* Region-Frame Absolute Address */
	unsigned long	 : 14;
	unsigned long av : 1; /* ACCF-Validity Control */
	unsigned long acc: 4; /* Access-Control Bits */
	unsigned long f  : 1; /* Fetch-Protection Bit */
	unsigned long fc : 1; /* Format-Control */
	unsigned long p  : 1; /* DAT-Protection Bit */
	unsigned long co : 1; /* Change-Recording Override */
	unsigned long	 : 2;
	unsigned long i  : 1; /* Region-Invalid Bit */
	unsigned long cr : 1; /* Common-Region Bit */
	unsigned long tt : 2; /* Table-Type Bits */
	unsigned long	 : 2;
};

union region3_table_entry {
	unsigned long val;
	struct region3_table_entry_fc0 fc0;
	struct region3_table_entry_fc1 fc1;
	struct {
		unsigned long	 : 53;
		unsigned long fc : 1; /* Format-Control */
		unsigned long	 : 4;
		unsigned long i  : 1; /* Region-Invalid Bit */
		unsigned long cr : 1; /* Common-Region Bit */
		unsigned long tt : 2; /* Table-Type Bits */
		unsigned long	 : 2;
	};
};

/* Segment-table entry, format-control 0: points to a page table. */
struct segment_entry_fc0 {
	unsigned long pto: 53;/* Page-Table Origin */
	unsigned long fc : 1; /* Format-Control */
	unsigned long p  : 1; /* DAT-Protection Bit */
	unsigned long	 : 3;
	unsigned long i  : 1; /* Segment-Invalid Bit */
	unsigned long cs : 1; /* Common-Segment Bit */
	unsigned long tt : 2; /* Table-Type Bits */
	unsigned long	 : 2;
};

/* Segment-table entry, format-control 1: maps a segment frame directly. */
struct segment_entry_fc1 {
	unsigned long sfaa : 44; /* Segment-Frame Absolute Address */
	unsigned long	 : 3;
	unsigned long av : 1; /* ACCF-Validity Control */
	unsigned long acc: 4; /* Access-Control Bits */
	unsigned long f  : 1; /* Fetch-Protection Bit */
	unsigned long fc : 1; /* Format-Control */
	unsigned long p  : 1; /* DAT-Protection Bit */
	unsigned long co : 1; /* Change-Recording Override */
	unsigned long	 : 2;
	unsigned long i  : 1; /* Segment-Invalid Bit */
	unsigned long cs : 1; /* Common-Segment Bit */
	unsigned long tt : 2; /* Table-Type Bits */
	unsigned long	 : 2;
};

union segment_table_entry {
	unsigned long val;
	struct segment_entry_fc0 fc0;
	struct segment_entry_fc1 fc1;
	struct {
		unsigned long	 : 53;
		unsigned long fc : 1; /* Format-Control */
		unsigned long	 : 4;
		unsigned long i  : 1; /* Segment-Invalid Bit */
		unsigned long cs : 1; /* Common-Segment Bit */
		unsigned long tt : 2; /* Table-Type Bits */
		unsigned long	 : 2;
	};
};

/* Values of the table-type (tt) field found in each table entry. */
enum {
	TABLE_TYPE_SEGMENT = 0,
	TABLE_TYPE_REGION3 = 1,
	TABLE_TYPE_REGION2 = 2,
	TABLE_TYPE_REGION1 = 3
};

union page_table_entry {
	unsigned long val;
	struct {
		unsigned long pfra : 52; /* Page-Frame Real Address */
		unsigned long z  : 1; /* Zero Bit */
		unsigned long i  : 1; /* Page-Invalid Bit */
		unsigned long p  : 1; /* DAT-Protection Bit */
		unsigned long co : 1; /* Change-Recording Override */
		unsigned long	 : 8;
	};
};

/*
 * vaddress union in order to easily decode a virtual address into its
 * region first index, region second index etc. parts.
 *
 * The second, anonymous struct exposes only the two leftmost bits of each
 * index (rfx01, rsx01, ...); these are compared against the table-length
 * and table-offset fields (tl/tf) during the table walk below.
 */
union vaddress {
	unsigned long addr;
	struct {
		unsigned long rfx : 11;
		unsigned long rsx : 11;
		unsigned long rtx : 11;
		unsigned long sx  : 11;
		unsigned long px  : 8;
		unsigned long bx  : 12;
	};
	struct {
		unsigned long rfx01 : 2;
		unsigned long	    : 9;
		unsigned long rsx01 : 2;
		unsigned long	    : 9;
		unsigned long rtx01 : 2;
		unsigned long	    : 9;
		unsigned long sx01  : 2;
		unsigned long	    : 29;
	};
};

/*
 * raddress union which will contain the result (real or absolute address)
 * after a page table walk. The rfaa, sfaa and pfra members are used to
 * simply assign them the value of a region, segment or page table entry.
 */
union raddress {
	unsigned long addr;
	unsigned long rfaa : 33; /* Region-Frame Absolute Address */
	unsigned long sfaa : 44; /* Segment-Frame Absolute Address */
	unsigned long pfra : 52; /* Page-Frame Real Address */
};

/* Number of holders of the simple (non-SIIF) ipte lock. */
static int ipte_lock_count;
/* Serializes ipte_lock_count and the k bit updates in ipte_lock_simple(). */
static DEFINE_MUTEX(ipte_mutex);

/*
 * Tell whether the guest's IPTE interlock is currently held.
 * If the SIIF lock variant is in use (sie_block->eca & 1, see
 * ipte_lock_siif()) the holder count kh lives in the SCA's ipte_control;
 * otherwise the host-side ipte_lock_count is authoritative.
 */
int ipte_lock_held(struct kvm_vcpu *vcpu)
{
	union ipte_control *ic = &vcpu->kvm->arch.sca->ipte_control;

	if (vcpu->arch.sie_block->eca & 1)
		return ic->kh != 0;
	return ipte_lock_count != 0;
}

/*
 * Non-SIIF lock variant: the first locker spins until the k bit in the
 * SCA's ipte_control is clear and then sets it with a cmpxchg loop;
 * nested lockers merely bump ipte_lock_count under ipte_mutex.
 */
static void ipte_lock_simple(struct kvm_vcpu *vcpu)
{
	union ipte_control old, new, *ic;

	mutex_lock(&ipte_mutex);
	ipte_lock_count++;
	if (ipte_lock_count > 1)
		goto out;
	ic = &vcpu->kvm->arch.sca->ipte_control;
	do {
		old = ACCESS_ONCE(*ic);
		while (old.k) {
			cond_resched();
			old = ACCESS_ONCE(*ic);
		}
		new = old;
		new.k = 1;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
out:
	mutex_unlock(&ipte_mutex);
}

/*
 * Release counterpart of ipte_lock_simple(): the last unlocker clears
 * the k bit and wakes anybody waiting on the ipte wait queue.
 */
static void ipte_unlock_simple(struct kvm_vcpu *vcpu)
{
	union ipte_control old, new, *ic;

	mutex_lock(&ipte_mutex);
	ipte_lock_count--;
	if (ipte_lock_count)
		goto out;
	ic = &vcpu->kvm->arch.sca->ipte_control;
	do {
		new = old = ACCESS_ONCE(*ic);
		new.k = 0;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
	if (!ipte_lock_count)
		wake_up(&vcpu->kvm->arch.ipte_wq);
out:
	mutex_unlock(&ipte_mutex);
}

/*
 * SIIF lock variant: all state lives in the SCA's ipte_control word.
 * Wait while the kg bit is set, then set k and increment the holder
 * count kh in one atomic cmpxchg.
 */
static void ipte_lock_siif(struct kvm_vcpu *vcpu)
{
	union ipte_control old, new, *ic;

	ic = &vcpu->kvm->arch.sca->ipte_control;
	do {
		old = ACCESS_ONCE(*ic);
		while (old.kg) {
			cond_resched();
			old = ACCESS_ONCE(*ic);
		}
		new = old;
		new.k = 1;
		new.kh++;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
}

/*
 * Release counterpart of ipte_lock_siif(): decrement kh and clear k
 * when the last holder drops the lock, then wake waiters.
 */
static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
{
	union ipte_control old, new, *ic;

	ic = &vcpu->kvm->arch.sca->ipte_control;
	do {
		new = old = ACCESS_ONCE(*ic);
		new.kh--;
		if (!new.kh)
			new.k = 0;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
	if (!new.kh)
		wake_up(&vcpu->kvm->arch.ipte_wq);
}

/* Take the IPTE interlock, dispatching on the SIIF eca bit. */
void ipte_lock(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.sie_block->eca & 1)
		ipte_lock_siif(vcpu);
	else
		ipte_lock_simple(vcpu);
}

/* Drop the IPTE interlock, dispatching on the SIIF eca bit. */
void ipte_unlock(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.sie_block->eca & 1)
		ipte_unlock_siif(vcpu);
	else
		ipte_unlock_simple(vcpu);
}

/*
 * Return the ASCE selected by the PSW address-space bits: CR1 for
 * primary, CR7 for secondary and CR13 for home space. Any other mode
 * (e.g. access-register mode, which is not handled here) yields 0.
 */
static unsigned long get_vcpu_asce(struct kvm_vcpu *vcpu)
{
	switch (psw_bits(vcpu->arch.sie_block->gpsw).as) {
	case PSW_AS_PRIMARY:
		return vcpu->arch.sie_block->gcr[1];
	case PSW_AS_SECONDARY:
		return vcpu->arch.sie_block->gcr[7];
	case PSW_AS_HOME:
		return vcpu->arch.sie_block->gcr[13];
	}
	return 0;
}

/* Read one 8-byte table entry from guest absolute memory. */
static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
{
	return kvm_read_guest(kvm, gpa, val, sizeof(*val));
}

/**
 * guest_translate - translate a guest virtual into a guest absolute address
 * @vcpu: virtual cpu
 * @gva: guest virtual address
 * @gpa: points to where guest physical (absolute) address should be stored
 * @write: indicates if access is a write access
 *
 * Translate a guest virtual address into a guest absolute address by means
 * of dynamic address translation as specified by the architecture.
 * If the resulting absolute address is not available in the configuration
 * an addressing exception is indicated and @gpa will not be changed.
 *
 * Returns: - zero on success; @gpa contains the resulting absolute address
 *	    - a negative value if guest access failed due to e.g. broken
 *	      guest mapping
 *	    - a positive value if an access exception happened. In this case
 *	      the returned value is the program interruption code as defined
 *	      by the architecture
 */
static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
				     unsigned long *gpa, int write)
{
	union vaddress vaddr = {.addr = gva};
	union raddress raddr = {.addr = gva};
	union page_table_entry pte;
	int dat_protection = 0;
	union ctlreg0 ctlreg0;
	unsigned long ptr;
	int edat1, edat2;
	union asce asce;

	ctlreg0.val = vcpu->arch.sie_block->gcr[0];
	/* EDAT applies only if enabled in CR0 and the facility is present */
	edat1 = ctlreg0.edat && test_vfacility(8);
	edat2 = edat1 && test_vfacility(78);
	asce.val = get_vcpu_asce(vcpu);
	if (asce.r)
		/* real-space designation: no tables, only prefixing applies */
		goto real_address;
	ptr = asce.origin * 4096;
	/* validate the top-level index against the ASCE and pick the entry */
	switch (asce.dt) {
	case ASCE_TYPE_REGION1:
		if (vaddr.rfx01 > asce.tl)
			return PGM_REGION_FIRST_TRANS;
		ptr += vaddr.rfx * 8;
		break;
	case ASCE_TYPE_REGION2:
		if (vaddr.rfx)
			return PGM_ASCE_TYPE;
		if (vaddr.rsx01 > asce.tl)
			return PGM_REGION_SECOND_TRANS;
		ptr += vaddr.rsx * 8;
		break;
	case ASCE_TYPE_REGION3:
		if (vaddr.rfx || vaddr.rsx)
			return PGM_ASCE_TYPE;
		if (vaddr.rtx01 > asce.tl)
			return PGM_REGION_THIRD_TRANS;
		ptr += vaddr.rtx * 8;
		break;
	case ASCE_TYPE_SEGMENT:
		if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
			return PGM_ASCE_TYPE;
		if (vaddr.sx01 > asce.tl)
			return PGM_SEGMENT_TRANSLATION;
		ptr += vaddr.sx * 8;
		break;
	}
	/*
	 * Walk the tables top-down. The cases deliberately fall through:
	 * each level loads its entry, validates it, and computes the
	 * pointer for the next lower level.
	 */
	switch (asce.dt) {
	case ASCE_TYPE_REGION1: {
		union region1_table_entry rfte;

		if (kvm_is_error_gpa(vcpu->kvm, ptr))
			return PGM_ADDRESSING;
		if (deref_table(vcpu->kvm, ptr, &rfte.val))
			return -EFAULT;
		if (rfte.i)
			return PGM_REGION_FIRST_TRANS;
		if (rfte.tt != TABLE_TYPE_REGION1)
			return PGM_TRANSLATION_SPEC;
		if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
			return PGM_REGION_SECOND_TRANS;
		if (edat1)
			dat_protection |= rfte.p;
		ptr = rfte.rto * 4096 + vaddr.rsx * 8;
	}
		/* fallthrough */
	case ASCE_TYPE_REGION2: {
		union region2_table_entry rste;

		if (kvm_is_error_gpa(vcpu->kvm, ptr))
			return PGM_ADDRESSING;
		if (deref_table(vcpu->kvm, ptr, &rste.val))
			return -EFAULT;
		if (rste.i)
			return PGM_REGION_SECOND_TRANS;
		if (rste.tt != TABLE_TYPE_REGION2)
			return PGM_TRANSLATION_SPEC;
		if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
			return PGM_REGION_THIRD_TRANS;
		if (edat1)
			dat_protection |= rste.p;
		ptr = rste.rto * 4096 + vaddr.rtx * 8;
	}
		/* fallthrough */
	case ASCE_TYPE_REGION3: {
		union region3_table_entry rtte;

		if (kvm_is_error_gpa(vcpu->kvm, ptr))
			return PGM_ADDRESSING;
		if (deref_table(vcpu->kvm, ptr, &rtte.val))
			return -EFAULT;
		if (rtte.i)
			return PGM_REGION_THIRD_TRANS;
		if (rtte.tt != TABLE_TYPE_REGION3)
			return PGM_TRANSLATION_SPEC;
		if (rtte.cr && asce.p && edat2)
			return PGM_TRANSLATION_SPEC;
		if (rtte.fc && edat2) {
			/* EDAT2 large frame: translation ends at this level */
			dat_protection |= rtte.fc1.p;
			raddr.rfaa = rtte.fc1.rfaa;
			goto absolute_address;
		}
		if (vaddr.sx01 < rtte.fc0.tf)
			return PGM_SEGMENT_TRANSLATION;
		if (vaddr.sx01 > rtte.fc0.tl)
			return PGM_SEGMENT_TRANSLATION;
		if (edat1)
			dat_protection |= rtte.fc0.p;
		ptr = rtte.fc0.sto * 4096 + vaddr.sx * 8;
	}
		/* fallthrough */
	case ASCE_TYPE_SEGMENT: {
		union segment_table_entry ste;

		if (kvm_is_error_gpa(vcpu->kvm, ptr))
			return PGM_ADDRESSING;
		if (deref_table(vcpu->kvm, ptr, &ste.val))
			return -EFAULT;
		if (ste.i)
			return PGM_SEGMENT_TRANSLATION;
		if (ste.tt != TABLE_TYPE_SEGMENT)
			return PGM_TRANSLATION_SPEC;
		if (ste.cs && asce.p)
			return PGM_TRANSLATION_SPEC;
		if (ste.fc && edat1) {
			/* EDAT1 segment frame: translation ends at this level */
			dat_protection |= ste.fc1.p;
			raddr.sfaa = ste.fc1.sfaa;
			goto absolute_address;
		}
		dat_protection |= ste.fc0.p;
		ptr = ste.fc0.pto * 2048 + vaddr.px * 8;
	}
	}
	/* final level: the page table */
	if (kvm_is_error_gpa(vcpu->kvm, ptr))
		return PGM_ADDRESSING;
	if (deref_table(vcpu->kvm, ptr, &pte.val))
		return -EFAULT;
	if (pte.i)
		return PGM_PAGE_TRANSLATION;
	if (pte.z)
		return PGM_TRANSLATION_SPEC;
	if (pte.co && !edat1)
		return PGM_TRANSLATION_SPEC;
	dat_protection |= pte.p;
	raddr.pfra = pte.pfra;
real_address:
	raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr);
absolute_address:
	if (write && dat_protection)
		return PGM_PROTECTION;
	if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
		return PGM_ADDRESSING;
	*gpa = raddr.addr;
	return 0;
}

static inline int is_low_address(unsigned long ga)
{
	/* Check for address ranges 0..511 and 4096..4607 */
	return (ga & ~0x11fful) == 0;
}

/*
 * Low-address protection applies if CR0.lap is set, except when DAT is
 * on and the effective ASCE designates a private space (asce.p).
 */
static int low_address_protection_enabled(struct kvm_vcpu *vcpu)
{
	union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
	psw_t *psw = &vcpu->arch.sie_block->gpsw;
	union asce asce;

	if (!ctlreg0.lap)
		return 0;
	asce.val = get_vcpu_asce(vcpu);
	if (psw_bits(*psw).t && asce.p)
		return 0;
	return 1;
}

/*
 * Bit layout of the translation-exception identification that is stored
 * in vcpu->arch.pgm.trans_exc_code when an access exception is injected.
 */
struct trans_exc_code_bits {
	unsigned long addr : 52; /* Translation-exception Address */
	unsigned long fsi  : 2;	 /* Access Exception Fetch/Store Indication */
	unsigned long	   : 7;
	unsigned long b61  : 1;	 /* set for DAT-protection exceptions below */
	unsigned long as   : 2;	 /* ASCE Identifier */
};

enum {
	FSI_UNKNOWN = 0, /* Unknown whether fetch or store */
	FSI_STORE   = 1, /* Exception was due to store operation */
	FSI_FETCH   = 2	 /* Exception was due to fetch operation */
};

/*
 * Translate the @nr_pages long range of guest pages starting at logical
 * address @ga and store the guest absolute address of each page in @pages.
 * vcpu->arch.pgm is set up for exception injection on the fly.
 *
 * Returns 0 on success, a program interruption code (> 0) if an access
 * exception was recognized, or a negative error code.
 */
static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
			    unsigned long *pages, unsigned long nr_pages,
			    int write)
{
	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
	psw_t *psw = &vcpu->arch.sie_block->gpsw;
	struct trans_exc_code_bits *tec_bits;
	int lap_enabled, rc;

	memset(pgm, 0, sizeof(*pgm));
	tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
	tec_bits->fsi = write ? FSI_STORE : FSI_FETCH;
	tec_bits->as = psw_bits(*psw).as;
	lap_enabled = low_address_protection_enabled(vcpu);
	while (nr_pages) {
		ga = kvm_s390_logical_to_effective(vcpu, ga);
		tec_bits->addr = ga >> PAGE_SHIFT;
		if (write && lap_enabled && is_low_address(ga)) {
			pgm->code = PGM_PROTECTION;
			return pgm->code;
		}
		ga &= PAGE_MASK;
		if (psw_bits(*psw).t) {
			rc = guest_translate(vcpu, ga, pages, write);
			if (rc < 0)
				return rc;
			if (rc == PGM_PROTECTION)
				tec_bits->b61 = 1;
			if (rc)
				pgm->code = rc;
		} else {
			/* DAT off: only prefixing applies */
			*pages = kvm_s390_real_to_abs(vcpu, ga);
			if (kvm_is_error_gpa(vcpu->kvm, *pages))
				pgm->code = PGM_ADDRESSING;
		}
		if (pgm->code)
			return pgm->code;
		ga += PAGE_SIZE;
		pages++;
		nr_pages--;
	}
	return 0;
}

/*
 * Copy @len bytes between @data (kernel space) and the guest logical
 * address @ga, honoring DAT, low-address and DAT protection. See the
 * documentation of read_guest()/write_guest() in gaccess.h for the full
 * contract and return value semantics.
 */
int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
		 unsigned long len, int write)
{
	psw_t *psw = &vcpu->arch.sie_block->gpsw;
	unsigned long _len, nr_pages, gpa, idx;
	unsigned long pages_array[2];
	unsigned long *pages;
	int need_ipte_lock;
	union asce asce;
	int rc;

	if (!len)
		return 0;
	/* Access register mode is not supported yet. */
	if (psw_bits(*psw).t && psw_bits(*psw).as == PSW_AS_ACCREG)
		return -EOPNOTSUPP;
	nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
	pages = pages_array;
	if (nr_pages > ARRAY_SIZE(pages_array))
		pages = vmalloc(nr_pages * sizeof(unsigned long));
	if (!pages)
		return -ENOMEM;
	asce.val = get_vcpu_asce(vcpu);
	/* hold off IPTE while we walk guest tables (DAT on, not real-space) */
	need_ipte_lock = psw_bits(*psw).t && !asce.r;
	if (need_ipte_lock)
		ipte_lock(vcpu);
	rc = guest_page_range(vcpu, ga, pages, nr_pages, write);
	for (idx = 0; idx < nr_pages && !rc; idx++) {
		gpa = *(pages + idx) + (ga & ~PAGE_MASK);
		_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
		if (write)
			rc = kvm_write_guest(vcpu->kvm, gpa, data, _len);
		else
			rc = kvm_read_guest(vcpu->kvm, gpa, data, _len);
		len -= _len;
		ga += _len;
		data += _len;
	}
	if (need_ipte_lock)
		ipte_unlock(vcpu);
	if (nr_pages > ARRAY_SIZE(pages_array))
		vfree(pages);
	return rc;
}

/*
 * Copy @len bytes between @data (kernel space) and the guest real
 * address @gra, page by page, applying only prefixing. No protection
 * checks are performed (see write_guest_real()/read_guest_real()).
 */
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
		      void *data, unsigned long len, int write)
{
	unsigned long _len, gpa;
	int rc = 0;

	while (len && !rc) {
		gpa = kvm_s390_real_to_abs(vcpu, gra);
		_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
		if (write)
			rc = write_guest_abs(vcpu, gpa, data, _len);
		else
			rc = read_guest_abs(vcpu, gpa, data, _len);
		len -= _len;
		gra += _len;
		data += _len;
	}
	return rc;
}

/**
 * guest_translate_address - translate guest logical into guest absolute address
 *
 * Parameter semantics are the same as the ones from guest_translate.
 * The memory contents at the guest address are not changed.
 *
 * Note: The IPTE lock is not taken during this function, so the caller
 * has to take care of this.
 */
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
			    unsigned long *gpa, int write)
{
	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
	psw_t *psw = &vcpu->arch.sie_block->gpsw;
	struct trans_exc_code_bits *tec;
	union asce asce;
	int rc;

	/* Access register mode is not supported yet. */
	if (psw_bits(*psw).t && psw_bits(*psw).as == PSW_AS_ACCREG)
		return -EOPNOTSUPP;

	gva = kvm_s390_logical_to_effective(vcpu, gva);
	memset(pgm, 0, sizeof(*pgm));
	tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
	tec->as = psw_bits(*psw).as;
	tec->fsi = write ? FSI_STORE : FSI_FETCH;
	tec->addr = gva >> PAGE_SHIFT;
	if (is_low_address(gva) && low_address_protection_enabled(vcpu)) {
		if (write) {
			rc = pgm->code = PGM_PROTECTION;
			return rc;
		}
	}

	asce.val = get_vcpu_asce(vcpu);
	if (psw_bits(*psw).t && !asce.r) {	/* Use DAT? */
		rc = guest_translate(vcpu, gva, gpa, write);
		if (rc > 0) {
			if (rc == PGM_PROTECTION)
				tec->b61 = 1;
			pgm->code = rc;
		}
	} else {
		rc = 0;
		*gpa = kvm_s390_real_to_abs(vcpu, gva);
		if (kvm_is_error_gpa(vcpu->kvm, *gpa))
			rc = pgm->code = PGM_ADDRESSING;
	}

	return rc;
}

/**
 * kvm_s390_check_low_addr_protection - check for low-address protection
 * @vcpu: virtual cpu
 * @ga: Guest address
 *
 * Checks whether an address is subject to low-address protection and set
 * up vcpu->arch.pgm accordingly if necessary.
 *
 * Return: 0 if no protection exception, or PGM_PROTECTION if protected.
 */
int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga)
{
	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
	psw_t *psw = &vcpu->arch.sie_block->gpsw;
	struct trans_exc_code_bits *tec_bits;

	if (!is_low_address(ga) || !low_address_protection_enabled(vcpu))
		return 0;

	memset(pgm, 0, sizeof(*pgm));
	tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
	tec_bits->fsi = FSI_STORE;
	tec_bits->as = psw_bits(*psw).as;
	tec_bits->addr = ga >> PAGE_SHIFT;
	pgm->code = PGM_PROTECTION;

	return pgm->code;
}
+299 -78
arch/s390/kvm/gaccess.h
··· 1 1 /* 2 2 * access guest memory 3 3 * 4 - * Copyright IBM Corp. 2008, 2009 4 + * Copyright IBM Corp. 2008, 2014 5 5 * 6 6 * This program is free software; you can redistribute it and/or modify 7 7 * it under the terms of the GNU General Public License (version 2 only) ··· 15 15 16 16 #include <linux/compiler.h> 17 17 #include <linux/kvm_host.h> 18 - #include <asm/uaccess.h> 18 + #include <linux/uaccess.h> 19 + #include <linux/ptrace.h> 19 20 #include "kvm-s390.h" 20 21 21 - /* Convert real to absolute address by applying the prefix of the CPU */ 22 + /** 23 + * kvm_s390_real_to_abs - convert guest real address to guest absolute address 24 + * @vcpu - guest virtual cpu 25 + * @gra - guest real address 26 + * 27 + * Returns the guest absolute address that corresponds to the passed guest real 28 + * address @gra of a virtual guest cpu by applying its prefix. 29 + */ 22 30 static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, 23 - unsigned long gaddr) 31 + unsigned long gra) 24 32 { 25 - unsigned long prefix = vcpu->arch.sie_block->prefix; 26 - if (gaddr < 2 * PAGE_SIZE) 27 - gaddr += prefix; 28 - else if (gaddr >= prefix && gaddr < prefix + 2 * PAGE_SIZE) 29 - gaddr -= prefix; 30 - return gaddr; 33 + unsigned long prefix = kvm_s390_get_prefix(vcpu); 34 + 35 + if (gra < 2 * PAGE_SIZE) 36 + gra += prefix; 37 + else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE) 38 + gra -= prefix; 39 + return gra; 31 40 } 32 41 33 - static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu, 34 - void __user *gptr, 35 - int prefixing) 42 + /** 43 + * kvm_s390_logical_to_effective - convert guest logical to effective address 44 + * @vcpu: guest virtual cpu 45 + * @ga: guest logical address 46 + * 47 + * Convert a guest vcpu logical address to a guest vcpu effective address by 48 + * applying the rules of the vcpu's addressing mode defined by PSW bits 31 49 + * and 32 (extendended/basic addressing mode). 
50 + * 51 + * Depending on the vcpu's addressing mode the upper 40 bits (24 bit addressing 52 + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing mode) 53 + * of @ga will be zeroed and the remaining bits will be returned. 54 + */ 55 + static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, 56 + unsigned long ga) 36 57 { 37 - unsigned long gaddr = (unsigned long) gptr; 38 - unsigned long uaddr; 58 + psw_t *psw = &vcpu->arch.sie_block->gpsw; 39 59 40 - if (prefixing) 41 - gaddr = kvm_s390_real_to_abs(vcpu, gaddr); 42 - uaddr = gmap_fault(gaddr, vcpu->arch.gmap); 43 - if (IS_ERR_VALUE(uaddr)) 44 - uaddr = -EFAULT; 45 - return (void __user *)uaddr; 60 + if (psw_bits(*psw).eaba == PSW_AMODE_64BIT) 61 + return ga; 62 + if (psw_bits(*psw).eaba == PSW_AMODE_31BIT) 63 + return ga & ((1UL << 31) - 1); 64 + return ga & ((1UL << 24) - 1); 46 65 } 47 66 48 - #define get_guest(vcpu, x, gptr) \ 67 + /* 68 + * put_guest_lc, read_guest_lc and write_guest_lc are guest access functions 69 + * which shall only be used to access the lowcore of a vcpu. 70 + * These functions should be used for e.g. interrupt handlers where no 71 + * guest memory access protection facilities, like key or low address 72 + * protection, are applicable. 73 + * At a later point guest vcpu lowcore access should happen via pinned 74 + * prefix pages, so that these pages can be accessed directly via the 75 + * kernel mapping. All of these *_lc functions can be removed then. 76 + */ 77 + 78 + /** 79 + * put_guest_lc - write a simple variable to a guest vcpu's lowcore 80 + * @vcpu: virtual cpu 81 + * @x: value to copy to guest 82 + * @gra: vcpu's destination guest real address 83 + * 84 + * Copies a simple value from kernel space to a guest vcpu's lowcore. 85 + * The size of the variable may be 1, 2, 4 or 8 bytes. The destination 86 + * must be located in the vcpu's lowcore. Otherwise the result is undefined. 87 + * 88 + * Returns zero on success or -EFAULT on error. 
89 + * 90 + * Note: an error indicates that either the kernel is out of memory or 91 + * the guest memory mapping is broken. In any case the best solution 92 + * would be to terminate the guest. 93 + * It is wrong to inject a guest exception. 94 + */ 95 + #define put_guest_lc(vcpu, x, gra) \ 49 96 ({ \ 50 - __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ 51 - int __mask = sizeof(__typeof__(*(gptr))) - 1; \ 52 - int __ret; \ 97 + struct kvm_vcpu *__vcpu = (vcpu); \ 98 + __typeof__(*(gra)) __x = (x); \ 99 + unsigned long __gpa; \ 53 100 \ 54 - if (IS_ERR((void __force *)__uptr)) { \ 55 - __ret = PTR_ERR((void __force *)__uptr); \ 56 - } else { \ 57 - BUG_ON((unsigned long)__uptr & __mask); \ 58 - __ret = get_user(x, __uptr); \ 59 - } \ 60 - __ret; \ 101 + __gpa = (unsigned long)(gra); \ 102 + __gpa += kvm_s390_get_prefix(__vcpu); \ 103 + kvm_write_guest(__vcpu->kvm, __gpa, &__x, sizeof(__x)); \ 61 104 }) 62 105 63 - #define put_guest(vcpu, x, gptr) \ 64 - ({ \ 65 - __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ 66 - int __mask = sizeof(__typeof__(*(gptr))) - 1; \ 67 - int __ret; \ 68 - \ 69 - if (IS_ERR((void __force *)__uptr)) { \ 70 - __ret = PTR_ERR((void __force *)__uptr); \ 71 - } else { \ 72 - BUG_ON((unsigned long)__uptr & __mask); \ 73 - __ret = put_user(x, __uptr); \ 74 - } \ 75 - __ret; \ 76 - }) 77 - 78 - static inline int __copy_guest(struct kvm_vcpu *vcpu, unsigned long to, 79 - unsigned long from, unsigned long len, 80 - int to_guest, int prefixing) 106 + /** 107 + * write_guest_lc - copy data from kernel space to guest vcpu's lowcore 108 + * @vcpu: virtual cpu 109 + * @gra: vcpu's source guest real address 110 + * @data: source address in kernel space 111 + * @len: number of bytes to copy 112 + * 113 + * Copy data from kernel space to guest vcpu's lowcore. The entire range must 114 + * be located within the vcpu's lowcore, otherwise the result is undefined. 115 + * 116 + * Returns zero on success or -EFAULT on error. 
117 + * 118 + * Note: an error indicates that either the kernel is out of memory or 119 + * the guest memory mapping is broken. In any case the best solution 120 + * would be to terminate the guest. 121 + * It is wrong to inject a guest exception. 122 + */ 123 + static inline __must_check 124 + int write_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, 125 + unsigned long len) 81 126 { 82 - unsigned long _len, rc; 83 - void __user *uptr; 127 + unsigned long gpa = gra + kvm_s390_get_prefix(vcpu); 84 128 85 - while (len) { 86 - uptr = to_guest ? (void __user *)to : (void __user *)from; 87 - uptr = __gptr_to_uptr(vcpu, uptr, prefixing); 88 - if (IS_ERR((void __force *)uptr)) 89 - return -EFAULT; 90 - _len = PAGE_SIZE - ((unsigned long)uptr & (PAGE_SIZE - 1)); 91 - _len = min(_len, len); 92 - if (to_guest) 93 - rc = copy_to_user((void __user *) uptr, (void *)from, _len); 94 - else 95 - rc = copy_from_user((void *)to, (void __user *)uptr, _len); 96 - if (rc) 97 - return -EFAULT; 98 - len -= _len; 99 - from += _len; 100 - to += _len; 101 - } 102 - return 0; 129 + return kvm_write_guest(vcpu->kvm, gpa, data, len); 103 130 } 104 131 105 - #define copy_to_guest(vcpu, to, from, size) \ 106 - __copy_guest(vcpu, to, (unsigned long)from, size, 1, 1) 107 - #define copy_from_guest(vcpu, to, from, size) \ 108 - __copy_guest(vcpu, (unsigned long)to, from, size, 0, 1) 109 - #define copy_to_guest_absolute(vcpu, to, from, size) \ 110 - __copy_guest(vcpu, to, (unsigned long)from, size, 1, 0) 111 - #define copy_from_guest_absolute(vcpu, to, from, size) \ 112 - __copy_guest(vcpu, (unsigned long)to, from, size, 0, 0) 132 + /** 133 + * read_guest_lc - copy data from guest vcpu's lowcore to kernel space 134 + * @vcpu: virtual cpu 135 + * @gra: vcpu's source guest real address 136 + * @data: destination address in kernel space 137 + * @len: number of bytes to copy 138 + * 139 + * Copy data from guest vcpu's lowcore to kernel space. 
The entire range must 140 + * be located within the vcpu's lowcore, otherwise the result is undefined. 141 + * 142 + * Returns zero on success or -EFAULT on error. 143 + * 144 + * Note: an error indicates that either the kernel is out of memory or 145 + * the guest memory mapping is broken. In any case the best solution 146 + * would be to terminate the guest. 147 + * It is wrong to inject a guest exception. 148 + */ 149 + static inline __must_check 150 + int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data, 151 + unsigned long len) 152 + { 153 + unsigned long gpa = gra + kvm_s390_get_prefix(vcpu); 154 + 155 + return kvm_read_guest(vcpu->kvm, gpa, data, len); 156 + } 157 + 158 + int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, 159 + unsigned long *gpa, int write); 160 + 161 + int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data, 162 + unsigned long len, int write); 163 + 164 + int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, 165 + void *data, unsigned long len, int write); 166 + 167 + /** 168 + * write_guest - copy data from kernel space to guest space 169 + * @vcpu: virtual cpu 170 + * @ga: guest address 171 + * @data: source address in kernel space 172 + * @len: number of bytes to copy 173 + * 174 + * Copy @len bytes from @data (kernel space) to @ga (guest address). 175 + * In order to copy data to guest space the PSW of the vcpu is inspected: 176 + * If DAT is off data will be copied to guest real or absolute memory. 177 + * If DAT is on data will be copied to the address space as specified by 178 + * the address space bits of the PSW: 179 + * Primary, secondory or home space (access register mode is currently not 180 + * implemented). 181 + * The addressing mode of the PSW is also inspected, so that address wrap 182 + * around is taken into account for 24-, 31- and 64-bit addressing mode, 183 + * if the to be copied data crosses page boundaries in guest address space. 
184 + * In addition also low address and DAT protection are inspected before 185 + * copying any data (key protection is currently not implemented). 186 + * 187 + * This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu. 188 + * In case of an access exception (e.g. protection exception) pgm will contain 189 + * all data necessary so that a subsequent call to 'kvm_s390_inject_prog_vcpu()' 190 + * will inject a correct exception into the guest. 191 + * If no access exception happened, the contents of pgm are undefined when 192 + * this function returns. 193 + * 194 + * Returns: - zero on success 195 + * - a negative value if e.g. the guest mapping is broken or in 196 + * case of out-of-memory. In this case the contents of pgm are 197 + * undefined. Also parts of @data may have been copied to guest 198 + * space. 199 + * - a positive value if an access exception happened. In this case 200 + * the returned value is the program interruption code and the 201 + * contents of pgm may be used to inject an exception into the 202 + * guest. No data has been copied to guest space. 203 + * 204 + * Note: in case an access exception is recognized no data has been copied to 205 + * guest space (this is also true, if the to be copied data would cross 206 + * one or more page boundaries in guest space). 207 + * Therefore this function may be used for nullifying and suppressing 208 + * instruction emulation. 209 + * It may also be used for terminating instructions, if it is undefined 210 + * if data has been changed in guest space in case of an exception. 
211 + */ 212 + static inline __must_check 213 + int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data, 214 + unsigned long len) 215 + { 216 + return access_guest(vcpu, ga, data, len, 1); 217 + } 218 + 219 + /** 220 + * read_guest - copy data from guest space to kernel space 221 + * @vcpu: virtual cpu 222 + * @ga: guest address 223 + * @data: destination address in kernel space 224 + * @len: number of bytes to copy 225 + * 226 + * Copy @len bytes from @ga (guest address) to @data (kernel space). 227 + * 228 + * The behaviour of read_guest is identical to write_guest, except that 229 + * data will be copied from guest space to kernel space. 230 + */ 231 + static inline __must_check 232 + int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data, 233 + unsigned long len) 234 + { 235 + return access_guest(vcpu, ga, data, len, 0); 236 + } 237 + 238 + /** 239 + * write_guest_abs - copy data from kernel space to guest space absolute 240 + * @vcpu: virtual cpu 241 + * @gpa: guest physical (absolute) address 242 + * @data: source address in kernel space 243 + * @len: number of bytes to copy 244 + * 245 + * Copy @len bytes from @data (kernel space) to @gpa (guest absolute address). 246 + * It is up to the caller to ensure that the entire guest memory range is 247 + * valid memory before calling this function. 248 + * Guest low address and key protection are not checked. 249 + * 250 + * Returns zero on success or -EFAULT on error. 251 + * 252 + * If an error occurs data may have been copied partially to guest memory. 
253 + */ 254 + static inline __must_check 255 + int write_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data, 256 + unsigned long len) 257 + { 258 + return kvm_write_guest(vcpu->kvm, gpa, data, len); 259 + } 260 + 261 + /** 262 + * read_guest_abs - copy data from guest space absolute to kernel space 263 + * @vcpu: virtual cpu 264 + * @gpa: guest physical (absolute) address 265 + * @data: destination address in kernel space 266 + * @len: number of bytes to copy 267 + * 268 + * Copy @len bytes from @gpa (guest absolute address) to @data (kernel space). 269 + * It is up to the caller to ensure that the entire guest memory range is 270 + * valid memory before calling this function. 271 + * Guest key protection is not checked. 272 + * 273 + * Returns zero on success or -EFAULT on error. 274 + * 275 + * If an error occurs data may have been copied partially to kernel space. 276 + */ 277 + static inline __must_check 278 + int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data, 279 + unsigned long len) 280 + { 281 + return kvm_read_guest(vcpu->kvm, gpa, data, len); 282 + } 283 + 284 + /** 285 + * write_guest_real - copy data from kernel space to guest space real 286 + * @vcpu: virtual cpu 287 + * @gra: guest real address 288 + * @data: source address in kernel space 289 + * @len: number of bytes to copy 290 + * 291 + * Copy @len bytes from @data (kernel space) to @gra (guest real address). 292 + * It is up to the caller to ensure that the entire guest memory range is 293 + * valid memory before calling this function. 294 + * Guest low address and key protection are not checked. 295 + * 296 + * Returns zero on success or -EFAULT on error. 297 + * 298 + * If an error occurs data may have been copied partially to guest memory. 
299 + */ 300 + static inline __must_check 301 + int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, 302 + unsigned long len) 303 + { 304 + return access_guest_real(vcpu, gra, data, len, 1); 305 + } 306 + 307 + /** 308 + * read_guest_real - copy data from guest space real to kernel space 309 + * @vcpu: virtual cpu 310 + * @gra: guest real address 311 + * @data: destination address in kernel space 312 + * @len: number of bytes to copy 313 + * 314 + * Copy @len bytes from @gra (guest real address) to @data (kernel space). 315 + * It is up to the caller to ensure that the entire guest memory range is 316 + * valid memory before calling this function. 317 + * Guest key protection is not checked. 318 + * 319 + * Returns zero on success or -EFAULT on error. 320 + * 321 + * If an error occurs data may have been copied partially to kernel space. 322 + */ 323 + static inline __must_check 324 + int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, 325 + unsigned long len) 326 + { 327 + return access_guest_real(vcpu, gra, data, len, 0); 328 + } 329 + 330 + void ipte_lock(struct kvm_vcpu *vcpu); 331 + void ipte_unlock(struct kvm_vcpu *vcpu); 332 + int ipte_lock_held(struct kvm_vcpu *vcpu); 333 + int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga); 113 334 114 335 #endif /* __KVM_S390_GACCESS_H */
+482
arch/s390/kvm/guestdbg.c
··· 1 + /* 2 + * kvm guest debug support 3 + * 4 + * Copyright IBM Corp. 2014 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License (version 2 only) 8 + * as published by the Free Software Foundation. 9 + * 10 + * Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com> 11 + */ 12 + #include <linux/kvm_host.h> 13 + #include <linux/errno.h> 14 + #include "kvm-s390.h" 15 + #include "gaccess.h" 16 + 17 + /* 18 + * Extends the address range given by *start and *stop to include the address 19 + * range starting with estart and the length len. Takes care of overflowing 20 + * intervals and tries to minimize the overall intervall size. 21 + */ 22 + static void extend_address_range(u64 *start, u64 *stop, u64 estart, int len) 23 + { 24 + u64 estop; 25 + 26 + if (len > 0) 27 + len--; 28 + else 29 + len = 0; 30 + 31 + estop = estart + len; 32 + 33 + /* 0-0 range represents "not set" */ 34 + if ((*start == 0) && (*stop == 0)) { 35 + *start = estart; 36 + *stop = estop; 37 + } else if (*start <= *stop) { 38 + /* increase the existing range */ 39 + if (estart < *start) 40 + *start = estart; 41 + if (estop > *stop) 42 + *stop = estop; 43 + } else { 44 + /* "overflowing" interval, whereby *stop > *start */ 45 + if (estart <= *stop) { 46 + if (estop > *stop) 47 + *stop = estop; 48 + } else if (estop > *start) { 49 + if (estart < *start) 50 + *start = estart; 51 + } 52 + /* minimize the range */ 53 + else if ((estop - *stop) < (*start - estart)) 54 + *stop = estop; 55 + else 56 + *start = estart; 57 + } 58 + } 59 + 60 + #define MAX_INST_SIZE 6 61 + 62 + static void enable_all_hw_bp(struct kvm_vcpu *vcpu) 63 + { 64 + unsigned long start, len; 65 + u64 *cr9 = &vcpu->arch.sie_block->gcr[9]; 66 + u64 *cr10 = &vcpu->arch.sie_block->gcr[10]; 67 + u64 *cr11 = &vcpu->arch.sie_block->gcr[11]; 68 + int i; 69 + 70 + if (vcpu->arch.guestdbg.nr_hw_bp <= 0 || 71 + vcpu->arch.guestdbg.hw_bp_info == NULL) 72 + 
return; 73 + 74 + /* 75 + * If the guest is not interrested in branching events, we can savely 76 + * limit them to the PER address range. 77 + */ 78 + if (!(*cr9 & PER_EVENT_BRANCH)) 79 + *cr9 |= PER_CONTROL_BRANCH_ADDRESS; 80 + *cr9 |= PER_EVENT_IFETCH | PER_EVENT_BRANCH; 81 + 82 + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_bp; i++) { 83 + start = vcpu->arch.guestdbg.hw_bp_info[i].addr; 84 + len = vcpu->arch.guestdbg.hw_bp_info[i].len; 85 + 86 + /* 87 + * The instruction in front of the desired bp has to 88 + * report instruction-fetching events 89 + */ 90 + if (start < MAX_INST_SIZE) { 91 + len += start; 92 + start = 0; 93 + } else { 94 + start -= MAX_INST_SIZE; 95 + len += MAX_INST_SIZE; 96 + } 97 + 98 + extend_address_range(cr10, cr11, start, len); 99 + } 100 + } 101 + 102 + static void enable_all_hw_wp(struct kvm_vcpu *vcpu) 103 + { 104 + unsigned long start, len; 105 + u64 *cr9 = &vcpu->arch.sie_block->gcr[9]; 106 + u64 *cr10 = &vcpu->arch.sie_block->gcr[10]; 107 + u64 *cr11 = &vcpu->arch.sie_block->gcr[11]; 108 + int i; 109 + 110 + if (vcpu->arch.guestdbg.nr_hw_wp <= 0 || 111 + vcpu->arch.guestdbg.hw_wp_info == NULL) 112 + return; 113 + 114 + /* if host uses storage alternation for special address 115 + * spaces, enable all events and give all to the guest */ 116 + if (*cr9 & PER_EVENT_STORE && *cr9 & PER_CONTROL_ALTERATION) { 117 + *cr9 &= ~PER_CONTROL_ALTERATION; 118 + *cr10 = 0; 119 + *cr11 = PSW_ADDR_INSN; 120 + } else { 121 + *cr9 &= ~PER_CONTROL_ALTERATION; 122 + *cr9 |= PER_EVENT_STORE; 123 + 124 + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_wp; i++) { 125 + start = vcpu->arch.guestdbg.hw_wp_info[i].addr; 126 + len = vcpu->arch.guestdbg.hw_wp_info[i].len; 127 + 128 + extend_address_range(cr10, cr11, start, len); 129 + } 130 + } 131 + } 132 + 133 + void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu) 134 + { 135 + vcpu->arch.guestdbg.cr0 = vcpu->arch.sie_block->gcr[0]; 136 + vcpu->arch.guestdbg.cr9 = vcpu->arch.sie_block->gcr[9]; 137 + 
vcpu->arch.guestdbg.cr10 = vcpu->arch.sie_block->gcr[10]; 138 + vcpu->arch.guestdbg.cr11 = vcpu->arch.sie_block->gcr[11]; 139 + } 140 + 141 + void kvm_s390_restore_guest_per_regs(struct kvm_vcpu *vcpu) 142 + { 143 + vcpu->arch.sie_block->gcr[0] = vcpu->arch.guestdbg.cr0; 144 + vcpu->arch.sie_block->gcr[9] = vcpu->arch.guestdbg.cr9; 145 + vcpu->arch.sie_block->gcr[10] = vcpu->arch.guestdbg.cr10; 146 + vcpu->arch.sie_block->gcr[11] = vcpu->arch.guestdbg.cr11; 147 + } 148 + 149 + void kvm_s390_patch_guest_per_regs(struct kvm_vcpu *vcpu) 150 + { 151 + /* 152 + * TODO: if guest psw has per enabled, otherwise 0s! 153 + * This reduces the amount of reported events. 154 + * Need to intercept all psw changes! 155 + */ 156 + 157 + if (guestdbg_sstep_enabled(vcpu)) { 158 + /* disable timer (clock-comparator) interrupts */ 159 + vcpu->arch.sie_block->gcr[0] &= ~0x800ul; 160 + vcpu->arch.sie_block->gcr[9] |= PER_EVENT_IFETCH; 161 + vcpu->arch.sie_block->gcr[10] = 0; 162 + vcpu->arch.sie_block->gcr[11] = PSW_ADDR_INSN; 163 + } 164 + 165 + if (guestdbg_hw_bp_enabled(vcpu)) { 166 + enable_all_hw_bp(vcpu); 167 + enable_all_hw_wp(vcpu); 168 + } 169 + 170 + /* TODO: Instruction-fetching-nullification not allowed for now */ 171 + if (vcpu->arch.sie_block->gcr[9] & PER_EVENT_NULLIFICATION) 172 + vcpu->arch.sie_block->gcr[9] &= ~PER_EVENT_NULLIFICATION; 173 + } 174 + 175 + #define MAX_WP_SIZE 100 176 + 177 + static int __import_wp_info(struct kvm_vcpu *vcpu, 178 + struct kvm_hw_breakpoint *bp_data, 179 + struct kvm_hw_wp_info_arch *wp_info) 180 + { 181 + int ret = 0; 182 + wp_info->len = bp_data->len; 183 + wp_info->addr = bp_data->addr; 184 + wp_info->phys_addr = bp_data->phys_addr; 185 + wp_info->old_data = NULL; 186 + 187 + if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE) 188 + return -EINVAL; 189 + 190 + wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL); 191 + if (!wp_info->old_data) 192 + return -ENOMEM; 193 + /* try to backup the original value */ 194 + ret = 
read_guest(vcpu, wp_info->phys_addr, wp_info->old_data, 195 + wp_info->len); 196 + if (ret) { 197 + kfree(wp_info->old_data); 198 + wp_info->old_data = NULL; 199 + } 200 + 201 + return ret; 202 + } 203 + 204 + #define MAX_BP_COUNT 50 205 + 206 + int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, 207 + struct kvm_guest_debug *dbg) 208 + { 209 + int ret = 0, nr_wp = 0, nr_bp = 0, i, size; 210 + struct kvm_hw_breakpoint *bp_data = NULL; 211 + struct kvm_hw_wp_info_arch *wp_info = NULL; 212 + struct kvm_hw_bp_info_arch *bp_info = NULL; 213 + 214 + if (dbg->arch.nr_hw_bp <= 0 || !dbg->arch.hw_bp) 215 + return 0; 216 + else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) 217 + return -EINVAL; 218 + 219 + size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint); 220 + bp_data = kmalloc(size, GFP_KERNEL); 221 + if (!bp_data) { 222 + ret = -ENOMEM; 223 + goto error; 224 + } 225 + 226 + if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) { 227 + ret = -EFAULT; 228 + goto error; 229 + } 230 + 231 + for (i = 0; i < dbg->arch.nr_hw_bp; i++) { 232 + switch (bp_data[i].type) { 233 + case KVM_HW_WP_WRITE: 234 + nr_wp++; 235 + break; 236 + case KVM_HW_BP: 237 + nr_bp++; 238 + break; 239 + default: 240 + break; 241 + } 242 + } 243 + 244 + size = nr_wp * sizeof(struct kvm_hw_wp_info_arch); 245 + if (size > 0) { 246 + wp_info = kmalloc(size, GFP_KERNEL); 247 + if (!wp_info) { 248 + ret = -ENOMEM; 249 + goto error; 250 + } 251 + } 252 + size = nr_bp * sizeof(struct kvm_hw_bp_info_arch); 253 + if (size > 0) { 254 + bp_info = kmalloc(size, GFP_KERNEL); 255 + if (!bp_info) { 256 + ret = -ENOMEM; 257 + goto error; 258 + } 259 + } 260 + 261 + for (nr_wp = 0, nr_bp = 0, i = 0; i < dbg->arch.nr_hw_bp; i++) { 262 + switch (bp_data[i].type) { 263 + case KVM_HW_WP_WRITE: 264 + ret = __import_wp_info(vcpu, &bp_data[i], 265 + &wp_info[nr_wp]); 266 + if (ret) 267 + goto error; 268 + nr_wp++; 269 + break; 270 + case KVM_HW_BP: 271 + bp_info[nr_bp].len = bp_data[i].len; 272 + bp_info[nr_bp].addr = 
bp_data[i].addr; 273 + nr_bp++; 274 + break; 275 + } 276 + } 277 + 278 + vcpu->arch.guestdbg.nr_hw_bp = nr_bp; 279 + vcpu->arch.guestdbg.hw_bp_info = bp_info; 280 + vcpu->arch.guestdbg.nr_hw_wp = nr_wp; 281 + vcpu->arch.guestdbg.hw_wp_info = wp_info; 282 + return 0; 283 + error: 284 + kfree(bp_data); 285 + kfree(wp_info); 286 + kfree(bp_info); 287 + return ret; 288 + } 289 + 290 + void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu) 291 + { 292 + int i; 293 + struct kvm_hw_wp_info_arch *hw_wp_info = NULL; 294 + 295 + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_wp; i++) { 296 + hw_wp_info = &vcpu->arch.guestdbg.hw_wp_info[i]; 297 + kfree(hw_wp_info->old_data); 298 + hw_wp_info->old_data = NULL; 299 + } 300 + kfree(vcpu->arch.guestdbg.hw_wp_info); 301 + vcpu->arch.guestdbg.hw_wp_info = NULL; 302 + 303 + kfree(vcpu->arch.guestdbg.hw_bp_info); 304 + vcpu->arch.guestdbg.hw_bp_info = NULL; 305 + 306 + vcpu->arch.guestdbg.nr_hw_wp = 0; 307 + vcpu->arch.guestdbg.nr_hw_bp = 0; 308 + } 309 + 310 + static inline int in_addr_range(u64 addr, u64 a, u64 b) 311 + { 312 + if (a <= b) 313 + return (addr >= a) && (addr <= b); 314 + else 315 + /* "overflowing" interval */ 316 + return (addr <= a) && (addr >= b); 317 + } 318 + 319 + #define end_of_range(bp_info) (bp_info->addr + bp_info->len - 1) 320 + 321 + static struct kvm_hw_bp_info_arch *find_hw_bp(struct kvm_vcpu *vcpu, 322 + unsigned long addr) 323 + { 324 + struct kvm_hw_bp_info_arch *bp_info = vcpu->arch.guestdbg.hw_bp_info; 325 + int i; 326 + 327 + if (vcpu->arch.guestdbg.nr_hw_bp == 0) 328 + return NULL; 329 + 330 + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_bp; i++) { 331 + /* addr is directly the start or in the range of a bp */ 332 + if (addr == bp_info->addr) 333 + goto found; 334 + if (bp_info->len > 0 && 335 + in_addr_range(addr, bp_info->addr, end_of_range(bp_info))) 336 + goto found; 337 + 338 + bp_info++; 339 + } 340 + 341 + return NULL; 342 + found: 343 + return bp_info; 344 + } 345 + 346 + static struct 
kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu) 347 + { 348 + int i; 349 + struct kvm_hw_wp_info_arch *wp_info = NULL; 350 + void *temp = NULL; 351 + 352 + if (vcpu->arch.guestdbg.nr_hw_wp == 0) 353 + return NULL; 354 + 355 + for (i = 0; i < vcpu->arch.guestdbg.nr_hw_wp; i++) { 356 + wp_info = &vcpu->arch.guestdbg.hw_wp_info[i]; 357 + if (!wp_info || !wp_info->old_data || wp_info->len <= 0) 358 + continue; 359 + 360 + temp = kmalloc(wp_info->len, GFP_KERNEL); 361 + if (!temp) 362 + continue; 363 + 364 + /* refetch the wp data and compare it to the old value */ 365 + if (!read_guest(vcpu, wp_info->phys_addr, temp, 366 + wp_info->len)) { 367 + if (memcmp(temp, wp_info->old_data, wp_info->len)) { 368 + kfree(temp); 369 + return wp_info; 370 + } 371 + } 372 + kfree(temp); 373 + temp = NULL; 374 + } 375 + 376 + return NULL; 377 + } 378 + 379 + void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) 380 + { 381 + vcpu->run->exit_reason = KVM_EXIT_DEBUG; 382 + vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; 383 + } 384 + 385 + #define per_bp_event(code) \ 386 + (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH)) 387 + #define per_write_wp_event(code) \ 388 + (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL)) 389 + 390 + static int debug_exit_required(struct kvm_vcpu *vcpu) 391 + { 392 + u32 perc = (vcpu->arch.sie_block->perc << 24); 393 + struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; 394 + struct kvm_hw_wp_info_arch *wp_info = NULL; 395 + struct kvm_hw_bp_info_arch *bp_info = NULL; 396 + unsigned long addr = vcpu->arch.sie_block->gpsw.addr; 397 + unsigned long peraddr = vcpu->arch.sie_block->peraddr; 398 + 399 + if (guestdbg_hw_bp_enabled(vcpu)) { 400 + if (per_write_wp_event(perc) && 401 + vcpu->arch.guestdbg.nr_hw_wp > 0) { 402 + wp_info = any_wp_changed(vcpu); 403 + if (wp_info) { 404 + debug_exit->addr = wp_info->addr; 405 + debug_exit->type = KVM_HW_WP_WRITE; 406 + goto exit_required; 407 + } 408 + } 409 + if (per_bp_event(perc) && 410 + 
vcpu->arch.guestdbg.nr_hw_bp > 0) { 411 + bp_info = find_hw_bp(vcpu, addr); 412 + /* remove duplicate events if PC==PER address */ 413 + if (bp_info && (addr != peraddr)) { 414 + debug_exit->addr = addr; 415 + debug_exit->type = KVM_HW_BP; 416 + vcpu->arch.guestdbg.last_bp = addr; 417 + goto exit_required; 418 + } 419 + /* breakpoint missed */ 420 + bp_info = find_hw_bp(vcpu, peraddr); 421 + if (bp_info && vcpu->arch.guestdbg.last_bp != peraddr) { 422 + debug_exit->addr = peraddr; 423 + debug_exit->type = KVM_HW_BP; 424 + goto exit_required; 425 + } 426 + } 427 + } 428 + if (guestdbg_sstep_enabled(vcpu) && per_bp_event(perc)) { 429 + debug_exit->addr = addr; 430 + debug_exit->type = KVM_SINGLESTEP; 431 + goto exit_required; 432 + } 433 + 434 + return 0; 435 + exit_required: 436 + return 1; 437 + } 438 + 439 + #define guest_per_enabled(vcpu) \ 440 + (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) 441 + 442 + static void filter_guest_per_event(struct kvm_vcpu *vcpu) 443 + { 444 + u32 perc = vcpu->arch.sie_block->perc << 24; 445 + u64 peraddr = vcpu->arch.sie_block->peraddr; 446 + u64 addr = vcpu->arch.sie_block->gpsw.addr; 447 + u64 cr9 = vcpu->arch.sie_block->gcr[9]; 448 + u64 cr10 = vcpu->arch.sie_block->gcr[10]; 449 + u64 cr11 = vcpu->arch.sie_block->gcr[11]; 450 + /* filter all events, demanded by the guest */ 451 + u32 guest_perc = perc & cr9 & PER_EVENT_MASK; 452 + 453 + if (!guest_per_enabled(vcpu)) 454 + guest_perc = 0; 455 + 456 + /* filter "successful-branching" events */ 457 + if (guest_perc & PER_EVENT_BRANCH && 458 + cr9 & PER_CONTROL_BRANCH_ADDRESS && 459 + !in_addr_range(addr, cr10, cr11)) 460 + guest_perc &= ~PER_EVENT_BRANCH; 461 + 462 + /* filter "instruction-fetching" events */ 463 + if (guest_perc & PER_EVENT_IFETCH && 464 + !in_addr_range(peraddr, cr10, cr11)) 465 + guest_perc &= ~PER_EVENT_IFETCH; 466 + 467 + /* All other PER events will be given to the guest */ 468 + /* TODO: Check alterated address/address space */ 469 + 470 + 
vcpu->arch.sie_block->perc = guest_perc >> 24; 471 + 472 + if (!guest_perc) 473 + vcpu->arch.sie_block->iprcc &= ~PGM_PER; 474 + } 475 + 476 + void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) 477 + { 478 + if (debug_exit_required(vcpu)) 479 + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; 480 + 481 + filter_guest_per_event(vcpu); 482 + }
+206 -16
arch/s390/kvm/intercept.c
··· 1 1 /* 2 2 * in-kernel handling for sie intercepts 3 3 * 4 - * Copyright IBM Corp. 2008, 2009 4 + * Copyright IBM Corp. 2008, 2014 5 5 * 6 6 * This program is free software; you can redistribute it and/or modify 7 7 * it under the terms of the GNU General Public License (version 2 only) ··· 16 16 #include <linux/pagemap.h> 17 17 18 18 #include <asm/kvm_host.h> 19 + #include <asm/asm-offsets.h> 20 + #include <asm/irq.h> 19 21 20 22 #include "kvm-s390.h" 21 23 #include "gaccess.h" ··· 31 29 [0x83] = kvm_s390_handle_diag, 32 30 [0xae] = kvm_s390_handle_sigp, 33 31 [0xb2] = kvm_s390_handle_b2, 32 + [0xb6] = kvm_s390_handle_stctl, 34 33 [0xb7] = kvm_s390_handle_lctl, 35 34 [0xb9] = kvm_s390_handle_b9, 36 35 [0xe5] = kvm_s390_handle_e5, ··· 46 43 break; 47 44 case 0x10: 48 45 vcpu->stat.exit_external_request++; 49 - break; 50 - case 0x14: 51 - vcpu->stat.exit_external_interrupt++; 52 46 break; 53 47 default: 54 48 break; /* nothing */ ··· 63 63 trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits); 64 64 65 65 if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { 66 - atomic_set_mask(CPUSTAT_STOPPED, 67 - &vcpu->arch.sie_block->cpuflags); 66 + kvm_s390_vcpu_stop(vcpu); 68 67 vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP; 69 68 VCPU_EVENT(vcpu, 3, "%s", "cpu stopped"); 70 69 rc = -EOPNOTSUPP; ··· 108 109 return -EOPNOTSUPP; 109 110 } 110 111 112 + static void __extract_prog_irq(struct kvm_vcpu *vcpu, 113 + struct kvm_s390_pgm_info *pgm_info) 114 + { 115 + memset(pgm_info, 0, sizeof(struct kvm_s390_pgm_info)); 116 + pgm_info->code = vcpu->arch.sie_block->iprcc; 117 + 118 + switch (vcpu->arch.sie_block->iprcc & ~PGM_PER) { 119 + case PGM_AFX_TRANSLATION: 120 + case PGM_ASX_TRANSLATION: 121 + case PGM_EX_TRANSLATION: 122 + case PGM_LFX_TRANSLATION: 123 + case PGM_LSTE_SEQUENCE: 124 + case PGM_LSX_TRANSLATION: 125 + case PGM_LX_TRANSLATION: 126 + case PGM_PRIMARY_AUTHORITY: 127 + case PGM_SECONDARY_AUTHORITY: 128 + case PGM_SPACE_SWITCH: 129 + 
pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc; 130 + break; 131 + case PGM_ALEN_TRANSLATION: 132 + case PGM_ALE_SEQUENCE: 133 + case PGM_ASTE_INSTANCE: 134 + case PGM_ASTE_SEQUENCE: 135 + case PGM_ASTE_VALIDITY: 136 + case PGM_EXTENDED_AUTHORITY: 137 + pgm_info->exc_access_id = vcpu->arch.sie_block->eai; 138 + break; 139 + case PGM_ASCE_TYPE: 140 + case PGM_PAGE_TRANSLATION: 141 + case PGM_REGION_FIRST_TRANS: 142 + case PGM_REGION_SECOND_TRANS: 143 + case PGM_REGION_THIRD_TRANS: 144 + case PGM_SEGMENT_TRANSLATION: 145 + pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc; 146 + pgm_info->exc_access_id = vcpu->arch.sie_block->eai; 147 + pgm_info->op_access_id = vcpu->arch.sie_block->oai; 148 + break; 149 + case PGM_MONITOR: 150 + pgm_info->mon_class_nr = vcpu->arch.sie_block->mcn; 151 + pgm_info->mon_code = vcpu->arch.sie_block->tecmc; 152 + break; 153 + case PGM_DATA: 154 + pgm_info->data_exc_code = vcpu->arch.sie_block->dxc; 155 + break; 156 + case PGM_PROTECTION: 157 + pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc; 158 + pgm_info->exc_access_id = vcpu->arch.sie_block->eai; 159 + break; 160 + default: 161 + break; 162 + } 163 + 164 + if (vcpu->arch.sie_block->iprcc & PGM_PER) { 165 + pgm_info->per_code = vcpu->arch.sie_block->perc; 166 + pgm_info->per_atmid = vcpu->arch.sie_block->peratmid; 167 + pgm_info->per_address = vcpu->arch.sie_block->peraddr; 168 + pgm_info->per_access_id = vcpu->arch.sie_block->peraid; 169 + } 170 + } 171 + 172 + /* 173 + * restore ITDB to program-interruption TDB in guest lowcore 174 + * and set TX abort indication if required 175 + */ 176 + static int handle_itdb(struct kvm_vcpu *vcpu) 177 + { 178 + struct kvm_s390_itdb *itdb; 179 + int rc; 180 + 181 + if (!IS_TE_ENABLED(vcpu) || !IS_ITDB_VALID(vcpu)) 182 + return 0; 183 + if (current->thread.per_flags & PER_FLAG_NO_TE) 184 + return 0; 185 + itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba; 186 + rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, 
sizeof(*itdb)); 187 + if (rc) 188 + return rc; 189 + memset(itdb, 0, sizeof(*itdb)); 190 + 191 + return 0; 192 + } 193 + 194 + #define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER) 195 + 111 196 static int handle_prog(struct kvm_vcpu *vcpu) 112 197 { 198 + struct kvm_s390_pgm_info pgm_info; 199 + psw_t psw; 200 + int rc; 201 + 113 202 vcpu->stat.exit_program_interruption++; 114 203 115 - /* Restore ITDB to Program-Interruption TDB in guest memory */ 116 - if (IS_TE_ENABLED(vcpu) && 117 - !(current->thread.per_flags & PER_FLAG_NO_TE) && 118 - IS_ITDB_VALID(vcpu)) { 119 - copy_to_guest(vcpu, TDB_ADDR, vcpu->arch.sie_block->itdba, 120 - sizeof(struct kvm_s390_itdb)); 121 - memset((void *) vcpu->arch.sie_block->itdba, 0, 122 - sizeof(struct kvm_s390_itdb)); 204 + if (guestdbg_enabled(vcpu) && per_event(vcpu)) { 205 + kvm_s390_handle_per_event(vcpu); 206 + /* the interrupt might have been filtered out completely */ 207 + if (vcpu->arch.sie_block->iprcc == 0) 208 + return 0; 123 209 } 124 210 125 211 trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc); 126 - return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc); 212 + if (vcpu->arch.sie_block->iprcc == PGM_SPECIFICATION) { 213 + rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &psw, sizeof(psw_t)); 214 + if (rc) 215 + return rc; 216 + /* Avoid endless loops of specification exceptions */ 217 + if (!is_valid_psw(&psw)) 218 + return -EOPNOTSUPP; 219 + } 220 + rc = handle_itdb(vcpu); 221 + if (rc) 222 + return rc; 223 + 224 + __extract_prog_irq(vcpu, &pgm_info); 225 + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); 127 226 } 128 227 129 228 static int handle_instruction_and_prog(struct kvm_vcpu *vcpu) ··· 239 142 return rc2; 240 143 } 241 144 145 + /** 146 + * handle_external_interrupt - used for external interruption interceptions 147 + * 148 + * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if 149 + * the new PSW does not have external interrupts disabled. 
In the first case, 150 + * we've got to deliver the interrupt manually, and in the second case, we 151 + * drop to userspace to handle the situation there. 152 + */ 153 + static int handle_external_interrupt(struct kvm_vcpu *vcpu) 154 + { 155 + u16 eic = vcpu->arch.sie_block->eic; 156 + struct kvm_s390_interrupt irq; 157 + psw_t newpsw; 158 + int rc; 159 + 160 + vcpu->stat.exit_external_interrupt++; 161 + 162 + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); 163 + if (rc) 164 + return rc; 165 + /* We can not handle clock comparator or timer interrupt with bad PSW */ 166 + if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && 167 + (newpsw.mask & PSW_MASK_EXT)) 168 + return -EOPNOTSUPP; 169 + 170 + switch (eic) { 171 + case EXT_IRQ_CLK_COMP: 172 + irq.type = KVM_S390_INT_CLOCK_COMP; 173 + break; 174 + case EXT_IRQ_CPU_TIMER: 175 + irq.type = KVM_S390_INT_CPU_TIMER; 176 + break; 177 + case EXT_IRQ_EXTERNAL_CALL: 178 + if (kvm_s390_si_ext_call_pending(vcpu)) 179 + return 0; 180 + irq.type = KVM_S390_INT_EXTERNAL_CALL; 181 + irq.parm = vcpu->arch.sie_block->extcpuaddr; 182 + break; 183 + default: 184 + return -EOPNOTSUPP; 185 + } 186 + 187 + return kvm_s390_inject_vcpu(vcpu, &irq); 188 + } 189 + 190 + /** 191 + * Handle MOVE PAGE partial execution interception. 192 + * 193 + * This interception can only happen for guests with DAT disabled and 194 + * addresses that are currently not mapped in the host. Thus we try to 195 + * set up the mappings for the corresponding user pages here (or throw 196 + * addressing exceptions in case of illegal guest addresses). 
197 + */ 198 + static int handle_mvpg_pei(struct kvm_vcpu *vcpu) 199 + { 200 + psw_t *psw = &vcpu->arch.sie_block->gpsw; 201 + unsigned long srcaddr, dstaddr; 202 + int reg1, reg2, rc; 203 + 204 + kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); 205 + 206 + /* Make sure that the source is paged-in */ 207 + srcaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg2]); 208 + if (kvm_is_error_gpa(vcpu->kvm, srcaddr)) 209 + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 210 + rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); 211 + if (rc != 0) 212 + return rc; 213 + 214 + /* Make sure that the destination is paged-in */ 215 + dstaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg1]); 216 + if (kvm_is_error_gpa(vcpu->kvm, dstaddr)) 217 + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 218 + rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); 219 + if (rc != 0) 220 + return rc; 221 + 222 + psw->addr = __rewind_psw(*psw, 4); 223 + 224 + return 0; 225 + } 226 + 227 + static int handle_partial_execution(struct kvm_vcpu *vcpu) 228 + { 229 + if (vcpu->arch.sie_block->ipa == 0xb254) /* MVPG */ 230 + return handle_mvpg_pei(vcpu); 231 + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) /* SIGP */ 232 + return kvm_s390_handle_sigp_pei(vcpu); 233 + 234 + return -EOPNOTSUPP; 235 + } 236 + 242 237 static const intercept_handler_t intercept_funcs[] = { 243 238 [0x00 >> 2] = handle_noop, 244 239 [0x04 >> 2] = handle_instruction, 245 240 [0x08 >> 2] = handle_prog, 246 241 [0x0C >> 2] = handle_instruction_and_prog, 247 242 [0x10 >> 2] = handle_noop, 248 - [0x14 >> 2] = handle_noop, 243 + [0x14 >> 2] = handle_external_interrupt, 249 244 [0x18 >> 2] = handle_noop, 250 245 [0x1C >> 2] = kvm_s390_handle_wait, 251 246 [0x20 >> 2] = handle_validity, 252 247 [0x28 >> 2] = handle_stop, 248 + [0x38 >> 2] = handle_partial_execution, 253 249 }; 254 250 255 251 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
+299 -101
arch/s390/kvm/interrupt.c
··· 27 27 #define IOINT_CSSID_MASK 0x03fc0000 28 28 #define IOINT_AI_MASK 0x04000000 29 29 30 + static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu); 31 + 30 32 static int is_ioint(u64 type) 31 33 { 32 34 return ((type & 0xfffe0000u) != 0xfffe0000u); ··· 58 56 return 1; 59 57 } 60 58 59 + static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) 60 + { 61 + if (psw_extint_disabled(vcpu) || 62 + !(vcpu->arch.sie_block->gcr[0] & 0x800ul)) 63 + return 0; 64 + if (guestdbg_enabled(vcpu) && guestdbg_sstep_enabled(vcpu)) 65 + /* No timer interrupts when single stepping */ 66 + return 0; 67 + return 1; 68 + } 69 + 61 70 static u64 int_word_to_isc_bits(u32 int_word) 62 71 { 63 72 u8 isc = (int_word & 0x38000000) >> 27; ··· 89 76 if (psw_extint_disabled(vcpu)) 90 77 return 0; 91 78 if (vcpu->arch.sie_block->gcr[0] & 0x4000ul) 79 + return 1; 80 + return 0; 81 + case KVM_S390_INT_CLOCK_COMP: 82 + return ckc_interrupts_enabled(vcpu); 83 + case KVM_S390_INT_CPU_TIMER: 84 + if (psw_extint_disabled(vcpu)) 85 + return 0; 86 + if (vcpu->arch.sie_block->gcr[0] & 0x400ul) 92 87 return 1; 93 88 return 0; 94 89 case KVM_S390_INT_SERVICE: ··· 148 127 149 128 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) 150 129 { 151 - atomic_clear_mask(CPUSTAT_ECALL_PEND | 152 - CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT, 153 - &vcpu->arch.sie_block->cpuflags); 130 + atomic_clear_mask(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT, 131 + &vcpu->arch.sie_block->cpuflags); 154 132 vcpu->arch.sie_block->lctl = 0x0000; 155 - vcpu->arch.sie_block->ictl &= ~ICTL_LPSW; 133 + vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT); 134 + 135 + if (guestdbg_enabled(vcpu)) { 136 + vcpu->arch.sie_block->lctl |= (LCTL_CR0 | LCTL_CR9 | 137 + LCTL_CR10 | LCTL_CR11); 138 + vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT); 139 + } 156 140 } 157 141 158 142 static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) ··· 175 149 case KVM_S390_INT_PFAULT_INIT: 176 150 
case KVM_S390_INT_PFAULT_DONE: 177 151 case KVM_S390_INT_VIRTIO: 152 + case KVM_S390_INT_CLOCK_COMP: 153 + case KVM_S390_INT_CPU_TIMER: 178 154 if (psw_extint_disabled(vcpu)) 179 155 __set_cpuflag(vcpu, CPUSTAT_EXT_INT); 180 156 else ··· 202 174 } 203 175 } 204 176 177 + static int __deliver_prog_irq(struct kvm_vcpu *vcpu, 178 + struct kvm_s390_pgm_info *pgm_info) 179 + { 180 + const unsigned short table[] = { 2, 4, 4, 6 }; 181 + int rc = 0; 182 + 183 + switch (pgm_info->code & ~PGM_PER) { 184 + case PGM_AFX_TRANSLATION: 185 + case PGM_ASX_TRANSLATION: 186 + case PGM_EX_TRANSLATION: 187 + case PGM_LFX_TRANSLATION: 188 + case PGM_LSTE_SEQUENCE: 189 + case PGM_LSX_TRANSLATION: 190 + case PGM_LX_TRANSLATION: 191 + case PGM_PRIMARY_AUTHORITY: 192 + case PGM_SECONDARY_AUTHORITY: 193 + case PGM_SPACE_SWITCH: 194 + rc = put_guest_lc(vcpu, pgm_info->trans_exc_code, 195 + (u64 *)__LC_TRANS_EXC_CODE); 196 + break; 197 + case PGM_ALEN_TRANSLATION: 198 + case PGM_ALE_SEQUENCE: 199 + case PGM_ASTE_INSTANCE: 200 + case PGM_ASTE_SEQUENCE: 201 + case PGM_ASTE_VALIDITY: 202 + case PGM_EXTENDED_AUTHORITY: 203 + rc = put_guest_lc(vcpu, pgm_info->exc_access_id, 204 + (u8 *)__LC_EXC_ACCESS_ID); 205 + break; 206 + case PGM_ASCE_TYPE: 207 + case PGM_PAGE_TRANSLATION: 208 + case PGM_REGION_FIRST_TRANS: 209 + case PGM_REGION_SECOND_TRANS: 210 + case PGM_REGION_THIRD_TRANS: 211 + case PGM_SEGMENT_TRANSLATION: 212 + rc = put_guest_lc(vcpu, pgm_info->trans_exc_code, 213 + (u64 *)__LC_TRANS_EXC_CODE); 214 + rc |= put_guest_lc(vcpu, pgm_info->exc_access_id, 215 + (u8 *)__LC_EXC_ACCESS_ID); 216 + rc |= put_guest_lc(vcpu, pgm_info->op_access_id, 217 + (u8 *)__LC_OP_ACCESS_ID); 218 + break; 219 + case PGM_MONITOR: 220 + rc = put_guest_lc(vcpu, pgm_info->mon_class_nr, 221 + (u64 *)__LC_MON_CLASS_NR); 222 + rc |= put_guest_lc(vcpu, pgm_info->mon_code, 223 + (u64 *)__LC_MON_CODE); 224 + break; 225 + case PGM_DATA: 226 + rc = put_guest_lc(vcpu, pgm_info->data_exc_code, 227 + (u32 
*)__LC_DATA_EXC_CODE); 228 + break; 229 + case PGM_PROTECTION: 230 + rc = put_guest_lc(vcpu, pgm_info->trans_exc_code, 231 + (u64 *)__LC_TRANS_EXC_CODE); 232 + rc |= put_guest_lc(vcpu, pgm_info->exc_access_id, 233 + (u8 *)__LC_EXC_ACCESS_ID); 234 + break; 235 + } 236 + 237 + if (pgm_info->code & PGM_PER) { 238 + rc |= put_guest_lc(vcpu, pgm_info->per_code, 239 + (u8 *) __LC_PER_CODE); 240 + rc |= put_guest_lc(vcpu, pgm_info->per_atmid, 241 + (u8 *)__LC_PER_ATMID); 242 + rc |= put_guest_lc(vcpu, pgm_info->per_address, 243 + (u64 *) __LC_PER_ADDRESS); 244 + rc |= put_guest_lc(vcpu, pgm_info->per_access_id, 245 + (u8 *) __LC_PER_ACCESS_ID); 246 + } 247 + 248 + switch (vcpu->arch.sie_block->icptcode) { 249 + case ICPT_INST: 250 + case ICPT_INSTPROGI: 251 + case ICPT_OPEREXC: 252 + case ICPT_PARTEXEC: 253 + case ICPT_IOINST: 254 + /* last instruction only stored for these icptcodes */ 255 + rc |= put_guest_lc(vcpu, table[vcpu->arch.sie_block->ipa >> 14], 256 + (u16 *) __LC_PGM_ILC); 257 + break; 258 + case ICPT_PROGI: 259 + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->pgmilc, 260 + (u16 *) __LC_PGM_ILC); 261 + break; 262 + default: 263 + rc |= put_guest_lc(vcpu, 0, 264 + (u16 *) __LC_PGM_ILC); 265 + } 266 + 267 + rc |= put_guest_lc(vcpu, pgm_info->code, 268 + (u16 *)__LC_PGM_INT_CODE); 269 + rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, 270 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 271 + rc |= read_guest_lc(vcpu, __LC_PGM_NEW_PSW, 272 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 273 + 274 + return rc; 275 + } 276 + 205 277 static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, 206 278 struct kvm_s390_interrupt_info *inti) 207 279 { ··· 314 186 vcpu->stat.deliver_emergency_signal++; 315 187 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 316 188 inti->emerg.code, 0); 317 - rc = put_guest(vcpu, 0x1201, (u16 __user *)__LC_EXT_INT_CODE); 318 - rc |= put_guest(vcpu, inti->emerg.code, 319 - (u16 __user *)__LC_EXT_CPU_ADDR); 320 - rc |= 
copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 189 + rc = put_guest_lc(vcpu, 0x1201, (u16 *)__LC_EXT_INT_CODE); 190 + rc |= put_guest_lc(vcpu, inti->emerg.code, 191 + (u16 *)__LC_EXT_CPU_ADDR); 192 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 193 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 194 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 321 195 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 322 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 323 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 324 196 break; 325 197 case KVM_S390_INT_EXTERNAL_CALL: 326 198 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); 327 199 vcpu->stat.deliver_external_call++; 328 200 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 329 201 inti->extcall.code, 0); 330 - rc = put_guest(vcpu, 0x1202, (u16 __user *)__LC_EXT_INT_CODE); 331 - rc |= put_guest(vcpu, inti->extcall.code, 332 - (u16 __user *)__LC_EXT_CPU_ADDR); 333 - rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 202 + rc = put_guest_lc(vcpu, 0x1202, (u16 *)__LC_EXT_INT_CODE); 203 + rc |= put_guest_lc(vcpu, inti->extcall.code, 204 + (u16 *)__LC_EXT_CPU_ADDR); 205 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 206 + &vcpu->arch.sie_block->gpsw, 207 + sizeof(psw_t)); 208 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 209 + &vcpu->arch.sie_block->gpsw, 210 + sizeof(psw_t)); 211 + break; 212 + case KVM_S390_INT_CLOCK_COMP: 213 + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 214 + inti->ext.ext_params, 0); 215 + deliver_ckc_interrupt(vcpu); 216 + break; 217 + case KVM_S390_INT_CPU_TIMER: 218 + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 219 + inti->ext.ext_params, 0); 220 + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, 221 + (u16 *)__LC_EXT_INT_CODE); 222 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 223 + &vcpu->arch.sie_block->gpsw, 224 + sizeof(psw_t)); 225 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 334 226 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 335 - rc |= copy_from_guest(vcpu, 
&vcpu->arch.sie_block->gpsw, 336 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 227 + rc |= put_guest_lc(vcpu, inti->ext.ext_params, 228 + (u32 *)__LC_EXT_PARAMS); 337 229 break; 338 230 case KVM_S390_INT_SERVICE: 339 231 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", ··· 361 213 vcpu->stat.deliver_service_signal++; 362 214 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 363 215 inti->ext.ext_params, 0); 364 - rc = put_guest(vcpu, 0x2401, (u16 __user *)__LC_EXT_INT_CODE); 365 - rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 216 + rc = put_guest_lc(vcpu, 0x2401, (u16 *)__LC_EXT_INT_CODE); 217 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 218 + &vcpu->arch.sie_block->gpsw, 219 + sizeof(psw_t)); 220 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 366 221 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 367 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 368 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 369 - rc |= put_guest(vcpu, inti->ext.ext_params, 370 - (u32 __user *)__LC_EXT_PARAMS); 222 + rc |= put_guest_lc(vcpu, inti->ext.ext_params, 223 + (u32 *)__LC_EXT_PARAMS); 371 224 break; 372 225 case KVM_S390_INT_PFAULT_INIT: 373 226 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, 374 227 inti->ext.ext_params2); 375 - rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE); 376 - rc |= put_guest(vcpu, 0x0600, (u16 __user *) __LC_EXT_CPU_ADDR); 377 - rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 228 + rc = put_guest_lc(vcpu, 0x2603, (u16 *) __LC_EXT_INT_CODE); 229 + rc |= put_guest_lc(vcpu, 0x0600, (u16 *) __LC_EXT_CPU_ADDR); 230 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 231 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 232 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 378 233 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 379 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 380 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 381 - rc |= put_guest(vcpu, inti->ext.ext_params2, 382 - (u64 __user *) __LC_EXT_PARAMS2); 234 + rc |= put_guest_lc(vcpu, 
inti->ext.ext_params2, 235 + (u64 *) __LC_EXT_PARAMS2); 383 236 break; 384 237 case KVM_S390_INT_PFAULT_DONE: 385 238 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, 386 239 inti->ext.ext_params2); 387 - rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE); 388 - rc |= put_guest(vcpu, 0x0680, (u16 __user *) __LC_EXT_CPU_ADDR); 389 - rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 240 + rc = put_guest_lc(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE); 241 + rc |= put_guest_lc(vcpu, 0x0680, (u16 *)__LC_EXT_CPU_ADDR); 242 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 243 + &vcpu->arch.sie_block->gpsw, 244 + sizeof(psw_t)); 245 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 390 246 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 391 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 392 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 393 - rc |= put_guest(vcpu, inti->ext.ext_params2, 394 - (u64 __user *) __LC_EXT_PARAMS2); 247 + rc |= put_guest_lc(vcpu, inti->ext.ext_params2, 248 + (u64 *)__LC_EXT_PARAMS2); 395 249 break; 396 250 case KVM_S390_INT_VIRTIO: 397 251 VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", ··· 402 252 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 403 253 inti->ext.ext_params, 404 254 inti->ext.ext_params2); 405 - rc = put_guest(vcpu, 0x2603, (u16 __user *)__LC_EXT_INT_CODE); 406 - rc |= put_guest(vcpu, 0x0d00, (u16 __user *)__LC_EXT_CPU_ADDR); 407 - rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 255 + rc = put_guest_lc(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE); 256 + rc |= put_guest_lc(vcpu, 0x0d00, (u16 *)__LC_EXT_CPU_ADDR); 257 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 258 + &vcpu->arch.sie_block->gpsw, 259 + sizeof(psw_t)); 260 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 408 261 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 409 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 410 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 411 - rc |= put_guest(vcpu, inti->ext.ext_params, 412 - (u32 __user *)__LC_EXT_PARAMS); 
413 - rc |= put_guest(vcpu, inti->ext.ext_params2, 414 - (u64 __user *)__LC_EXT_PARAMS2); 262 + rc |= put_guest_lc(vcpu, inti->ext.ext_params, 263 + (u32 *)__LC_EXT_PARAMS); 264 + rc |= put_guest_lc(vcpu, inti->ext.ext_params2, 265 + (u64 *)__LC_EXT_PARAMS2); 415 266 break; 416 267 case KVM_S390_SIGP_STOP: 417 268 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); ··· 436 285 vcpu->stat.deliver_restart_signal++; 437 286 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 438 287 0, 0); 439 - rc = copy_to_guest(vcpu, 440 - offsetof(struct _lowcore, restart_old_psw), 441 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 442 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 443 - offsetof(struct _lowcore, restart_psw), 444 - sizeof(psw_t)); 445 - atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 288 + rc = write_guest_lc(vcpu, 289 + offsetof(struct _lowcore, restart_old_psw), 290 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 291 + rc |= read_guest_lc(vcpu, offsetof(struct _lowcore, restart_psw), 292 + &vcpu->arch.sie_block->gpsw, 293 + sizeof(psw_t)); 446 294 break; 447 295 case KVM_S390_PROGRAM_INT: 448 296 VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x", ··· 450 300 vcpu->stat.deliver_program_int++; 451 301 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 452 302 inti->pgm.code, 0); 453 - rc = put_guest(vcpu, inti->pgm.code, (u16 __user *)__LC_PGM_INT_CODE); 454 - rc |= put_guest(vcpu, table[vcpu->arch.sie_block->ipa >> 14], 455 - (u16 __user *)__LC_PGM_ILC); 456 - rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW, 457 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 458 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 459 - __LC_PGM_NEW_PSW, sizeof(psw_t)); 303 + rc = __deliver_prog_irq(vcpu, &inti->pgm); 460 304 break; 461 305 462 306 case KVM_S390_MCHK: ··· 461 317 inti->mchk.mcic); 462 318 rc = kvm_s390_vcpu_store_status(vcpu, 463 319 KVM_S390_STORE_STATUS_PREFIXED); 464 - rc |= put_guest(vcpu, 
inti->mchk.mcic, (u64 __user *) __LC_MCCK_CODE); 465 - rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW, 320 + rc |= put_guest_lc(vcpu, inti->mchk.mcic, (u64 *)__LC_MCCK_CODE); 321 + rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, 322 + &vcpu->arch.sie_block->gpsw, 323 + sizeof(psw_t)); 324 + rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, 466 325 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 467 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 468 - __LC_MCK_NEW_PSW, sizeof(psw_t)); 469 326 break; 470 327 471 328 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: ··· 479 334 vcpu->stat.deliver_io_int++; 480 335 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 481 336 param0, param1); 482 - rc = put_guest(vcpu, inti->io.subchannel_id, 483 - (u16 __user *) __LC_SUBCHANNEL_ID); 484 - rc |= put_guest(vcpu, inti->io.subchannel_nr, 485 - (u16 __user *) __LC_SUBCHANNEL_NR); 486 - rc |= put_guest(vcpu, inti->io.io_int_parm, 487 - (u32 __user *) __LC_IO_INT_PARM); 488 - rc |= put_guest(vcpu, inti->io.io_int_word, 489 - (u32 __user *) __LC_IO_INT_WORD); 490 - rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW, 491 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 492 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 493 - __LC_IO_NEW_PSW, sizeof(psw_t)); 337 + rc = put_guest_lc(vcpu, inti->io.subchannel_id, 338 + (u16 *)__LC_SUBCHANNEL_ID); 339 + rc |= put_guest_lc(vcpu, inti->io.subchannel_nr, 340 + (u16 *)__LC_SUBCHANNEL_NR); 341 + rc |= put_guest_lc(vcpu, inti->io.io_int_parm, 342 + (u32 *)__LC_IO_INT_PARM); 343 + rc |= put_guest_lc(vcpu, inti->io.io_int_word, 344 + (u32 *)__LC_IO_INT_WORD); 345 + rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW, 346 + &vcpu->arch.sie_block->gpsw, 347 + sizeof(psw_t)); 348 + rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW, 349 + &vcpu->arch.sie_block->gpsw, 350 + sizeof(psw_t)); 494 351 break; 495 352 } 496 353 default: ··· 505 358 } 506 359 } 507 360 508 - static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) 361 + static void 
deliver_ckc_interrupt(struct kvm_vcpu *vcpu) 509 362 { 510 363 int rc; 511 364 512 - if (psw_extint_disabled(vcpu)) 513 - return 0; 514 - if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul)) 515 - return 0; 516 - rc = put_guest(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE); 517 - rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 518 - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 519 - rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 520 - __LC_EXT_NEW_PSW, sizeof(psw_t)); 365 + rc = put_guest_lc(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE); 366 + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, 367 + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 368 + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, 369 + &vcpu->arch.sie_block->gpsw, 370 + sizeof(psw_t)); 521 371 if (rc) { 522 372 printk("kvm: The guest lowcore is not mapped during interrupt " 523 373 "delivery, killing userspace\n"); 524 374 do_exit(SIGKILL); 525 375 } 526 - return 1; 376 + } 377 + 378 + /* Check whether SIGP interpretation facility has an external call pending */ 379 + int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu) 380 + { 381 + atomic_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl; 382 + 383 + if (!psw_extint_disabled(vcpu) && 384 + (vcpu->arch.sie_block->gcr[0] & 0x2000ul) && 385 + (atomic_read(sigp_ctrl) & SIGP_CTRL_C) && 386 + (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND)) 387 + return 1; 388 + 389 + return 0; 527 390 } 528 391 529 392 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) ··· 563 406 spin_unlock(&fi->lock); 564 407 } 565 408 566 - if ((!rc) && (vcpu->arch.sie_block->ckc < 567 - get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) { 568 - if ((!psw_extint_disabled(vcpu)) && 569 - (vcpu->arch.sie_block->gcr[0] & 0x800ul)) 570 - rc = 1; 571 - } 409 + if (!rc && kvm_cpu_has_pending_timer(vcpu)) 410 + rc = 1; 411 + 412 + if (!rc && kvm_s390_si_ext_call_pending(vcpu)) 413 + rc = 1; 572 414 573 415 return rc; 574 416 } 575 417 576 418 int 
kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 577 419 { 578 - return 0; 420 + if (!(vcpu->arch.sie_block->ckc < 421 + get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) 422 + return 0; 423 + if (!ckc_interrupts_enabled(vcpu)) 424 + return 0; 425 + return 1; 579 426 } 580 427 581 428 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) ··· 602 441 return -EOPNOTSUPP; /* disabled wait */ 603 442 } 604 443 605 - if (psw_extint_disabled(vcpu) || 606 - (!(vcpu->arch.sie_block->gcr[0] & 0x800ul))) { 444 + if (!ckc_interrupts_enabled(vcpu)) { 607 445 VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); 608 446 goto no_timer; 609 447 } ··· 625 465 while (list_empty(&vcpu->arch.local_int.list) && 626 466 list_empty(&vcpu->arch.local_int.float_int->list) && 627 467 (!vcpu->arch.local_int.timer_due) && 628 - !signal_pending(current)) { 468 + !signal_pending(current) && 469 + !kvm_s390_si_ext_call_pending(vcpu)) { 629 470 set_current_state(TASK_INTERRUPTIBLE); 630 471 spin_unlock_bh(&vcpu->arch.local_int.lock); 631 472 spin_unlock(&vcpu->arch.local_int.float_int->lock); ··· 683 522 } 684 523 atomic_set(&li->active, 0); 685 524 spin_unlock_bh(&li->lock); 525 + 526 + /* clear pending external calls set by sigp interpretation facility */ 527 + atomic_clear_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags); 528 + atomic_clear_mask(SIGP_CTRL_C, 529 + &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl); 686 530 } 687 531 688 532 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) ··· 720 554 } while (deliver); 721 555 } 722 556 723 - if ((vcpu->arch.sie_block->ckc < 724 - get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) 725 - __try_deliver_ckc_interrupt(vcpu); 557 + if (kvm_cpu_has_pending_timer(vcpu)) 558 + deliver_ckc_interrupt(vcpu); 726 559 727 560 if (atomic_read(&fi->active)) { 728 561 do { ··· 817 652 818 653 VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); 819 654 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1); 655 + 
spin_lock_bh(&li->lock); 656 + list_add(&inti->list, &li->list); 657 + atomic_set(&li->active, 1); 658 + BUG_ON(waitqueue_active(li->wq)); 659 + spin_unlock_bh(&li->lock); 660 + return 0; 661 + } 662 + 663 + int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, 664 + struct kvm_s390_pgm_info *pgm_info) 665 + { 666 + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 667 + struct kvm_s390_interrupt_info *inti; 668 + 669 + inti = kzalloc(sizeof(*inti), GFP_KERNEL); 670 + if (!inti) 671 + return -ENOMEM; 672 + 673 + VCPU_EVENT(vcpu, 3, "inject: prog irq %d (from kernel)", 674 + pgm_info->code); 675 + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, 676 + pgm_info->code, 0, 1); 677 + 678 + inti->type = KVM_S390_PROGRAM_INT; 679 + memcpy(&inti->pgm, pgm_info, sizeof(inti->pgm)); 820 680 spin_lock_bh(&li->lock); 821 681 list_add(&inti->list, &li->list); 822 682 atomic_set(&li->active, 1); ··· 1000 810 return __inject_vm(kvm, inti); 1001 811 } 1002 812 813 + void kvm_s390_reinject_io_int(struct kvm *kvm, 814 + struct kvm_s390_interrupt_info *inti) 815 + { 816 + __inject_vm(kvm, inti); 817 + } 818 + 1003 819 int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 1004 820 struct kvm_s390_interrupt *s390int) 1005 821 { ··· 1035 839 break; 1036 840 case KVM_S390_SIGP_STOP: 1037 841 case KVM_S390_RESTART: 842 + case KVM_S390_INT_CLOCK_COMP: 843 + case KVM_S390_INT_CPU_TIMER: 1038 844 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); 1039 845 inti->type = s390int->type; 1040 846 break; ··· 1098 900 return 0; 1099 901 } 1100 902 1101 - static void clear_floating_interrupts(struct kvm *kvm) 903 + void kvm_s390_clear_float_irqs(struct kvm *kvm) 1102 904 { 1103 905 struct kvm_s390_float_interrupt *fi; 1104 906 struct kvm_s390_interrupt_info *n, *inti = NULL; ··· 1444 1246 break; 1445 1247 case KVM_DEV_FLIC_CLEAR_IRQS: 1446 1248 r = 0; 1447 - clear_floating_interrupts(dev->kvm); 1249 + kvm_s390_clear_float_irqs(dev->kvm); 1448 1250 break; 1449 1251 case 
KVM_DEV_FLIC_APF_ENABLE: 1450 1252 dev->kvm->arch.gmap->pfault_enabled = 1;
+447 -107
arch/s390/kvm/kvm-s390.c
··· 11 11 * Christian Borntraeger <borntraeger@de.ibm.com> 12 12 * Heiko Carstens <heiko.carstens@de.ibm.com> 13 13 * Christian Ehrhardt <ehrhardt@de.ibm.com> 14 + * Jason J. Herne <jjherne@us.ibm.com> 14 15 */ 15 16 16 17 #include <linux/compiler.h> ··· 52 51 { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, 53 52 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, 54 53 { "instruction_lctl", VCPU_STAT(instruction_lctl) }, 54 + { "instruction_stctl", VCPU_STAT(instruction_stctl) }, 55 + { "instruction_stctg", VCPU_STAT(instruction_stctg) }, 55 56 { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, 56 57 { "deliver_external_call", VCPU_STAT(deliver_external_call) }, 57 58 { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, ··· 69 66 { "instruction_stpx", VCPU_STAT(instruction_stpx) }, 70 67 { "instruction_stap", VCPU_STAT(instruction_stap) }, 71 68 { "instruction_storage_key", VCPU_STAT(instruction_storage_key) }, 69 + { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) }, 72 70 { "instruction_stsch", VCPU_STAT(instruction_stsch) }, 73 71 { "instruction_chsc", VCPU_STAT(instruction_chsc) }, 74 72 { "instruction_essa", VCPU_STAT(instruction_essa) }, ··· 94 90 static struct gmap_notifier gmap_notifier; 95 91 96 92 /* test availability of vfacility */ 97 - static inline int test_vfacility(unsigned long nr) 93 + int test_vfacility(unsigned long nr) 98 94 { 99 95 return __test_facility(nr, (void *) vfacilities); 100 96 } ··· 166 162 case KVM_CAP_IOEVENTFD: 167 163 case KVM_CAP_DEVICE_CTRL: 168 164 case KVM_CAP_ENABLE_CAP_VM: 165 + case KVM_CAP_VM_ATTRIBUTES: 169 166 r = 1; 170 167 break; 171 168 case KVM_CAP_NR_VCPUS: ··· 185 180 return r; 186 181 } 187 182 183 + static void kvm_s390_sync_dirty_log(struct kvm *kvm, 184 + struct kvm_memory_slot *memslot) 185 + { 186 + gfn_t cur_gfn, last_gfn; 187 + unsigned long address; 188 + struct gmap *gmap = kvm->arch.gmap; 189 + 190 + 
down_read(&gmap->mm->mmap_sem); 191 + /* Loop over all guest pages */ 192 + last_gfn = memslot->base_gfn + memslot->npages; 193 + for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) { 194 + address = gfn_to_hva_memslot(memslot, cur_gfn); 195 + 196 + if (gmap_test_and_clear_dirty(address, gmap)) 197 + mark_page_dirty(kvm, cur_gfn); 198 + } 199 + up_read(&gmap->mm->mmap_sem); 200 + } 201 + 188 202 /* Section: vm related */ 189 203 /* 190 204 * Get (and clear) the dirty memory log for a memory slot. ··· 211 187 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 212 188 struct kvm_dirty_log *log) 213 189 { 214 - return 0; 190 + int r; 191 + unsigned long n; 192 + struct kvm_memory_slot *memslot; 193 + int is_dirty = 0; 194 + 195 + mutex_lock(&kvm->slots_lock); 196 + 197 + r = -EINVAL; 198 + if (log->slot >= KVM_USER_MEM_SLOTS) 199 + goto out; 200 + 201 + memslot = id_to_memslot(kvm->memslots, log->slot); 202 + r = -ENOENT; 203 + if (!memslot->dirty_bitmap) 204 + goto out; 205 + 206 + kvm_s390_sync_dirty_log(kvm, memslot); 207 + r = kvm_get_dirty_log(kvm, log, &is_dirty); 208 + if (r) 209 + goto out; 210 + 211 + /* Clear the dirty log */ 212 + if (is_dirty) { 213 + n = kvm_dirty_bitmap_bytes(memslot); 214 + memset(memslot->dirty_bitmap, 0, n); 215 + } 216 + r = 0; 217 + out: 218 + mutex_unlock(&kvm->slots_lock); 219 + return r; 215 220 } 216 221 217 222 static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) ··· 262 209 return r; 263 210 } 264 211 212 + static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) 213 + { 214 + int ret; 215 + unsigned int idx; 216 + switch (attr->attr) { 217 + case KVM_S390_VM_MEM_ENABLE_CMMA: 218 + ret = -EBUSY; 219 + mutex_lock(&kvm->lock); 220 + if (atomic_read(&kvm->online_vcpus) == 0) { 221 + kvm->arch.use_cmma = 1; 222 + ret = 0; 223 + } 224 + mutex_unlock(&kvm->lock); 225 + break; 226 + case KVM_S390_VM_MEM_CLR_CMMA: 227 + mutex_lock(&kvm->lock); 228 + idx = 
srcu_read_lock(&kvm->srcu); 229 + page_table_reset_pgste(kvm->arch.gmap->mm, 0, TASK_SIZE, false); 230 + srcu_read_unlock(&kvm->srcu, idx); 231 + mutex_unlock(&kvm->lock); 232 + ret = 0; 233 + break; 234 + default: 235 + ret = -ENXIO; 236 + break; 237 + } 238 + return ret; 239 + } 240 + 241 + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) 242 + { 243 + int ret; 244 + 245 + switch (attr->group) { 246 + case KVM_S390_VM_MEM_CTRL: 247 + ret = kvm_s390_mem_control(kvm, attr); 248 + break; 249 + default: 250 + ret = -ENXIO; 251 + break; 252 + } 253 + 254 + return ret; 255 + } 256 + 257 + static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) 258 + { 259 + return -ENXIO; 260 + } 261 + 262 + static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) 263 + { 264 + int ret; 265 + 266 + switch (attr->group) { 267 + case KVM_S390_VM_MEM_CTRL: 268 + switch (attr->attr) { 269 + case KVM_S390_VM_MEM_ENABLE_CMMA: 270 + case KVM_S390_VM_MEM_CLR_CMMA: 271 + ret = 0; 272 + break; 273 + default: 274 + ret = -ENXIO; 275 + break; 276 + } 277 + break; 278 + default: 279 + ret = -ENXIO; 280 + break; 281 + } 282 + 283 + return ret; 284 + } 285 + 265 286 long kvm_arch_vm_ioctl(struct file *filp, 266 287 unsigned int ioctl, unsigned long arg) 267 288 { 268 289 struct kvm *kvm = filp->private_data; 269 290 void __user *argp = (void __user *)arg; 291 + struct kvm_device_attr attr; 270 292 int r; 271 293 272 294 switch (ioctl) { ··· 372 244 kvm_set_irq_routing(kvm, &routing, 0, 0); 373 245 r = 0; 374 246 } 247 + break; 248 + } 249 + case KVM_SET_DEVICE_ATTR: { 250 + r = -EFAULT; 251 + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 252 + break; 253 + r = kvm_s390_vm_set_attr(kvm, &attr); 254 + break; 255 + } 256 + case KVM_GET_DEVICE_ATTR: { 257 + r = -EFAULT; 258 + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 259 + break; 260 + r = kvm_s390_vm_get_attr(kvm, &attr); 261 + break; 262 + } 263 
+ case KVM_HAS_DEVICE_ATTR: { 264 + r = -EFAULT; 265 + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 266 + break; 267 + r = kvm_s390_vm_has_attr(kvm, &attr); 375 268 break; 376 269 } 377 270 default: ··· 441 292 442 293 spin_lock_init(&kvm->arch.float_int.lock); 443 294 INIT_LIST_HEAD(&kvm->arch.float_int.list); 295 + init_waitqueue_head(&kvm->arch.ipte_wq); 444 296 445 297 debug_register_view(kvm->arch.dbf, &debug_sprintf_view); 446 298 VM_EVENT(kvm, 3, "%s", "vm created"); ··· 459 309 kvm->arch.css_support = 0; 460 310 kvm->arch.use_irqchip = 0; 461 311 312 + spin_lock_init(&kvm->arch.start_stop_lock); 313 + 462 314 return 0; 463 315 out_nogmap: 464 316 debug_unregister(kvm->arch.dbf); ··· 474 322 { 475 323 VCPU_EVENT(vcpu, 3, "%s", "free cpu"); 476 324 trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); 325 + kvm_s390_clear_local_irqs(vcpu); 477 326 kvm_clear_async_pf_completion_queue(vcpu); 478 327 if (!kvm_is_ucontrol(vcpu->kvm)) { 479 328 clear_bit(63 - vcpu->vcpu_id, ··· 488 335 if (kvm_is_ucontrol(vcpu->kvm)) 489 336 gmap_free(vcpu->arch.gmap); 490 337 491 - if (vcpu->arch.sie_block->cbrlo) 492 - __free_page(__pfn_to_page( 493 - vcpu->arch.sie_block->cbrlo >> PAGE_SHIFT)); 338 + if (kvm_s390_cmma_enabled(vcpu->kvm)) 339 + kvm_s390_vcpu_unsetup_cmma(vcpu); 494 340 free_page((unsigned long)(vcpu->arch.sie_block)); 495 341 496 342 kvm_vcpu_uninit(vcpu); ··· 524 372 if (!kvm_is_ucontrol(kvm)) 525 373 gmap_free(kvm->arch.gmap); 526 374 kvm_s390_destroy_adapters(kvm); 375 + kvm_s390_clear_float_irqs(kvm); 527 376 } 528 377 529 378 /* Section: vcpu related */ ··· 595 442 vcpu->arch.sie_block->pp = 0; 596 443 vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; 597 444 kvm_clear_async_pf_completion_queue(vcpu); 598 - atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 445 + kvm_s390_vcpu_stop(vcpu); 599 446 kvm_s390_clear_local_irqs(vcpu); 600 447 } 601 448 ··· 604 451 return 0; 605 452 } 606 453 454 + void 
kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) 455 + { 456 + free_page(vcpu->arch.sie_block->cbrlo); 457 + vcpu->arch.sie_block->cbrlo = 0; 458 + } 459 + 460 + int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) 461 + { 462 + vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL); 463 + if (!vcpu->arch.sie_block->cbrlo) 464 + return -ENOMEM; 465 + 466 + vcpu->arch.sie_block->ecb2 |= 0x80; 467 + vcpu->arch.sie_block->ecb2 &= ~0x08; 468 + return 0; 469 + } 470 + 607 471 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 608 472 { 609 - struct page *cbrl; 473 + int rc = 0; 610 474 611 475 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | 612 476 CPUSTAT_SM | ··· 634 464 vcpu->arch.sie_block->ecb |= 0x10; 635 465 636 466 vcpu->arch.sie_block->ecb2 = 8; 637 - vcpu->arch.sie_block->eca = 0xC1002001U; 467 + vcpu->arch.sie_block->eca = 0xD1002000U; 468 + if (sclp_has_siif()) 469 + vcpu->arch.sie_block->eca |= 1; 638 470 vcpu->arch.sie_block->fac = (int) (long) vfacilities; 639 - if (kvm_enabled_cmma()) { 640 - cbrl = alloc_page(GFP_KERNEL | __GFP_ZERO); 641 - if (cbrl) { 642 - vcpu->arch.sie_block->ecb2 |= 0x80; 643 - vcpu->arch.sie_block->ecb2 &= ~0x08; 644 - vcpu->arch.sie_block->cbrlo = page_to_phys(cbrl); 645 - } 471 + vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE | 472 + ICTL_TPROT; 473 + 474 + if (kvm_s390_cmma_enabled(vcpu->kvm)) { 475 + rc = kvm_s390_vcpu_setup_cmma(vcpu); 476 + if (rc) 477 + return rc; 646 478 } 647 479 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 648 480 tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet, ··· 652 480 vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; 653 481 get_cpu_id(&vcpu->arch.cpu_id); 654 482 vcpu->arch.cpu_id.version = 0xff; 655 - return 0; 483 + return rc; 656 484 } 657 485 658 486 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, ··· 756 584 757 585 kvm_for_each_vcpu(i, vcpu, kvm) { 758 586 /* match against both prefix pages */ 759 - if 
(vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) { 587 + if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { 760 588 VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); 761 589 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); 762 590 exit_sie_sync(vcpu); ··· 941 769 return -EINVAL; /* not implemented yet */ 942 770 } 943 771 772 + #define VALID_GUESTDBG_FLAGS (KVM_GUESTDBG_SINGLESTEP | \ 773 + KVM_GUESTDBG_USE_HW_BP | \ 774 + KVM_GUESTDBG_ENABLE) 775 + 944 776 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 945 777 struct kvm_guest_debug *dbg) 946 778 { 947 - return -EINVAL; /* not implemented yet */ 779 + int rc = 0; 780 + 781 + vcpu->guest_debug = 0; 782 + kvm_s390_clear_bp_data(vcpu); 783 + 784 + if (dbg->control & ~VALID_GUESTDBG_FLAGS) 785 + return -EINVAL; 786 + 787 + if (dbg->control & KVM_GUESTDBG_ENABLE) { 788 + vcpu->guest_debug = dbg->control; 789 + /* enforce guest PER */ 790 + atomic_set_mask(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); 791 + 792 + if (dbg->control & KVM_GUESTDBG_USE_HW_BP) 793 + rc = kvm_s390_import_bp_data(vcpu, dbg); 794 + } else { 795 + atomic_clear_mask(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); 796 + vcpu->arch.guestdbg.last_bp = 0; 797 + } 798 + 799 + if (rc) { 800 + vcpu->guest_debug = 0; 801 + kvm_s390_clear_bp_data(vcpu); 802 + atomic_clear_mask(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); 803 + } 804 + 805 + return rc; 948 806 } 949 807 950 808 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, ··· 989 787 return -EINVAL; /* not implemented yet */ 990 788 } 991 789 790 + bool kvm_s390_cmma_enabled(struct kvm *kvm) 791 + { 792 + if (!MACHINE_IS_LPAR) 793 + return false; 794 + /* only enable for z10 and later */ 795 + if (!MACHINE_HAS_EDAT1) 796 + return false; 797 + if (!kvm->arch.use_cmma) 798 + return false; 799 + return true; 800 + } 801 + 802 + static bool ibs_enabled(struct kvm_vcpu *vcpu) 803 + { 804 + return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS; 805 + } 806 + 
992 807 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) 993 808 { 809 + retry: 810 + s390_vcpu_unblock(vcpu); 994 811 /* 995 812 * We use MMU_RELOAD just to re-arm the ipte notifier for the 996 813 * guest prefix page. gmap_ipte_notify will wait on the ptl lock. ··· 1017 796 * already finished. We might race against a second unmapper that 1018 797 * wants to set the blocking bit. Lets just retry the request loop. 1019 798 */ 1020 - while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { 799 + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { 1021 800 int rc; 1022 801 rc = gmap_ipte_notify(vcpu->arch.gmap, 1023 - vcpu->arch.sie_block->prefix, 802 + kvm_s390_get_prefix(vcpu), 1024 803 PAGE_SIZE * 2); 1025 804 if (rc) 1026 805 return rc; 1027 - s390_vcpu_unblock(vcpu); 806 + goto retry; 1028 807 } 808 + 809 + if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) { 810 + if (!ibs_enabled(vcpu)) { 811 + trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1); 812 + atomic_set_mask(CPUSTAT_IBS, 813 + &vcpu->arch.sie_block->cpuflags); 814 + } 815 + goto retry; 816 + } 817 + 818 + if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) { 819 + if (ibs_enabled(vcpu)) { 820 + trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0); 821 + atomic_clear_mask(CPUSTAT_IBS, 822 + &vcpu->arch.sie_block->cpuflags); 823 + } 824 + goto retry; 825 + } 826 + 1029 827 return 0; 1030 828 } 1031 829 1032 - static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu) 830 + /** 831 + * kvm_arch_fault_in_page - fault-in guest page if necessary 832 + * @vcpu: The corresponding virtual cpu 833 + * @gpa: Guest physical address 834 + * @writable: Whether the page should be writable or not 835 + * 836 + * Make sure that a guest page has been faulted-in on the host. 837 + * 838 + * Return: Zero on success, negative error code otherwise. 
839 + */ 840 + long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable) 1033 841 { 1034 - long rc; 1035 - hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap); 1036 842 struct mm_struct *mm = current->mm; 843 + hva_t hva; 844 + long rc; 845 + 846 + hva = gmap_fault(gpa, vcpu->arch.gmap); 847 + if (IS_ERR_VALUE(hva)) 848 + return (long)hva; 1037 849 down_read(&mm->mmap_sem); 1038 - rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL); 850 + rc = get_user_pages(current, mm, hva, 1, writable, 0, NULL, NULL); 1039 851 up_read(&mm->mmap_sem); 1040 - return rc; 852 + 853 + return rc < 0 ? rc : 0; 1041 854 } 1042 855 1043 856 static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, ··· 1138 883 if (!vcpu->arch.gmap->pfault_enabled) 1139 884 return 0; 1140 885 1141 - hva = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap); 1142 - if (copy_from_guest(vcpu, &arch.pfault_token, vcpu->arch.pfault_token, 8)) 886 + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); 887 + hva += current->thread.gmap_addr & ~PAGE_MASK; 888 + if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) 1143 889 return 0; 1144 890 1145 891 rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); ··· 1173 917 if (rc) 1174 918 return rc; 1175 919 920 + if (guestdbg_enabled(vcpu)) { 921 + kvm_s390_backup_guest_per_regs(vcpu); 922 + kvm_s390_patch_guest_per_regs(vcpu); 923 + } 924 + 1176 925 vcpu->arch.sie_block->icptcode = 0; 1177 926 cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); 1178 927 VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); ··· 1194 933 vcpu->arch.sie_block->icptcode); 1195 934 trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); 1196 935 936 + if (guestdbg_enabled(vcpu)) 937 + kvm_s390_restore_guest_per_regs(vcpu); 938 + 1197 939 if (exit_reason >= 0) { 1198 940 rc = 0; 1199 941 } else if (kvm_is_ucontrol(vcpu->kvm)) { ··· 1209 945 } else if 
(current->thread.gmap_pfault) { 1210 946 trace_kvm_s390_major_guest_pfault(vcpu); 1211 947 current->thread.gmap_pfault = 0; 1212 - if (kvm_arch_setup_async_pf(vcpu) || 1213 - (kvm_arch_fault_in_sync(vcpu) >= 0)) 948 + if (kvm_arch_setup_async_pf(vcpu)) { 1214 949 rc = 0; 950 + } else { 951 + gpa_t gpa = current->thread.gmap_addr; 952 + rc = kvm_arch_fault_in_page(vcpu, gpa, 1); 953 + } 1215 954 } 1216 955 1217 956 if (rc == -1) { ··· 1234 967 } 1235 968 1236 969 return rc; 1237 - } 1238 - 1239 - bool kvm_enabled_cmma(void) 1240 - { 1241 - if (!MACHINE_IS_LPAR) 1242 - return false; 1243 - /* only enable for z10 and later */ 1244 - if (!MACHINE_HAS_EDAT1) 1245 - return false; 1246 - return true; 1247 970 } 1248 971 1249 972 static int __vcpu_run(struct kvm_vcpu *vcpu) ··· 1265 1008 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 1266 1009 1267 1010 rc = vcpu_post_run(vcpu, exit_reason); 1268 - } while (!signal_pending(current) && !rc); 1011 + } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); 1269 1012 1270 1013 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 1271 1014 return rc; ··· 1276 1019 int rc; 1277 1020 sigset_t sigsaved; 1278 1021 1022 + if (guestdbg_exit_pending(vcpu)) { 1023 + kvm_s390_prepare_debug_exit(vcpu); 1024 + return 0; 1025 + } 1026 + 1279 1027 if (vcpu->sigset_active) 1280 1028 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 1281 1029 1282 - atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 1030 + kvm_s390_vcpu_start(vcpu); 1283 1031 1284 1032 switch (kvm_run->exit_reason) { 1285 1033 case KVM_EXIT_S390_SIEIC: ··· 1293 1031 case KVM_EXIT_S390_RESET: 1294 1032 case KVM_EXIT_S390_UCONTROL: 1295 1033 case KVM_EXIT_S390_TSCH: 1034 + case KVM_EXIT_DEBUG: 1296 1035 break; 1297 1036 default: 1298 1037 BUG(); ··· 1319 1056 rc = -EINTR; 1320 1057 } 1321 1058 1059 + if (guestdbg_exit_pending(vcpu) && !rc) { 1060 + kvm_s390_prepare_debug_exit(vcpu); 1061 + rc = 0; 1062 + } 1063 + 1322 1064 if (rc == 
-EOPNOTSUPP) { 1323 1065 /* intercept cannot be handled in-kernel, prepare kvm-run */ 1324 1066 kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; ··· 1341 1073 1342 1074 kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; 1343 1075 kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; 1344 - kvm_run->s.regs.prefix = vcpu->arch.sie_block->prefix; 1076 + kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); 1345 1077 memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); 1346 1078 1347 1079 if (vcpu->sigset_active) ··· 1351 1083 return rc; 1352 1084 } 1353 1085 1354 - static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, void *from, 1355 - unsigned long n, int prefix) 1356 - { 1357 - if (prefix) 1358 - return copy_to_guest(vcpu, guestdest, from, n); 1359 - else 1360 - return copy_to_guest_absolute(vcpu, guestdest, from, n); 1361 - } 1362 - 1363 1086 /* 1364 1087 * store status at address 1365 1088 * we use have two special cases: 1366 1089 * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit 1367 1090 * KVM_S390_STORE_STATUS_PREFIXED: -> prefix 1368 1091 */ 1369 - int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr) 1092 + int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) 1370 1093 { 1371 1094 unsigned char archmode = 1; 1372 - int prefix; 1095 + unsigned int px; 1373 1096 u64 clkcomp; 1097 + int rc; 1374 1098 1375 - if (addr == KVM_S390_STORE_STATUS_NOADDR) { 1376 - if (copy_to_guest_absolute(vcpu, 163ul, &archmode, 1)) 1099 + if (gpa == KVM_S390_STORE_STATUS_NOADDR) { 1100 + if (write_guest_abs(vcpu, 163, &archmode, 1)) 1377 1101 return -EFAULT; 1378 - addr = SAVE_AREA_BASE; 1379 - prefix = 0; 1380 - } else if (addr == KVM_S390_STORE_STATUS_PREFIXED) { 1381 - if (copy_to_guest(vcpu, 163ul, &archmode, 1)) 1102 + gpa = SAVE_AREA_BASE; 1103 + } else if (gpa == KVM_S390_STORE_STATUS_PREFIXED) { 1104 + if (write_guest_real(vcpu, 163, &archmode, 1)) 1382 1105 return -EFAULT; 1383 - addr = SAVE_AREA_BASE; 1384 - 
prefix = 1; 1385 - } else 1386 - prefix = 0; 1387 - 1388 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), 1389 - vcpu->arch.guest_fpregs.fprs, 128, prefix)) 1390 - return -EFAULT; 1391 - 1392 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, gp_regs), 1393 - vcpu->run->s.regs.gprs, 128, prefix)) 1394 - return -EFAULT; 1395 - 1396 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, psw), 1397 - &vcpu->arch.sie_block->gpsw, 16, prefix)) 1398 - return -EFAULT; 1399 - 1400 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, pref_reg), 1401 - &vcpu->arch.sie_block->prefix, 4, prefix)) 1402 - return -EFAULT; 1403 - 1404 - if (__guestcopy(vcpu, 1405 - addr + offsetof(struct save_area, fp_ctrl_reg), 1406 - &vcpu->arch.guest_fpregs.fpc, 4, prefix)) 1407 - return -EFAULT; 1408 - 1409 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, tod_reg), 1410 - &vcpu->arch.sie_block->todpr, 4, prefix)) 1411 - return -EFAULT; 1412 - 1413 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, timer), 1414 - &vcpu->arch.sie_block->cputm, 8, prefix)) 1415 - return -EFAULT; 1416 - 1106 + gpa = kvm_s390_real_to_abs(vcpu, SAVE_AREA_BASE); 1107 + } 1108 + rc = write_guest_abs(vcpu, gpa + offsetof(struct save_area, fp_regs), 1109 + vcpu->arch.guest_fpregs.fprs, 128); 1110 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, gp_regs), 1111 + vcpu->run->s.regs.gprs, 128); 1112 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, psw), 1113 + &vcpu->arch.sie_block->gpsw, 16); 1114 + px = kvm_s390_get_prefix(vcpu); 1115 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, pref_reg), 1116 + &px, 4); 1117 + rc |= write_guest_abs(vcpu, 1118 + gpa + offsetof(struct save_area, fp_ctrl_reg), 1119 + &vcpu->arch.guest_fpregs.fpc, 4); 1120 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, tod_reg), 1121 + &vcpu->arch.sie_block->todpr, 4); 1122 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, timer), 
1123 + &vcpu->arch.sie_block->cputm, 8); 1417 1124 clkcomp = vcpu->arch.sie_block->ckc >> 8; 1418 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, clk_cmp), 1419 - &clkcomp, 8, prefix)) 1420 - return -EFAULT; 1421 - 1422 - if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs), 1423 - &vcpu->run->s.regs.acrs, 64, prefix)) 1424 - return -EFAULT; 1425 - 1426 - if (__guestcopy(vcpu, 1427 - addr + offsetof(struct save_area, ctrl_regs), 1428 - &vcpu->arch.sie_block->gcr, 128, prefix)) 1429 - return -EFAULT; 1430 - return 0; 1125 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, clk_cmp), 1126 + &clkcomp, 8); 1127 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, acc_regs), 1128 + &vcpu->run->s.regs.acrs, 64); 1129 + rc |= write_guest_abs(vcpu, gpa + offsetof(struct save_area, ctrl_regs), 1130 + &vcpu->arch.sie_block->gcr, 128); 1131 + return rc ? -EFAULT : 0; 1431 1132 } 1432 1133 1433 1134 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) ··· 1411 1174 save_access_regs(vcpu->run->s.regs.acrs); 1412 1175 1413 1176 return kvm_s390_store_status_unloaded(vcpu, addr); 1177 + } 1178 + 1179 + static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) 1180 + { 1181 + return atomic_read(&(vcpu)->arch.sie_block->cpuflags) & CPUSTAT_STOPPED; 1182 + } 1183 + 1184 + static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) 1185 + { 1186 + kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); 1187 + kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu); 1188 + exit_sie_sync(vcpu); 1189 + } 1190 + 1191 + static void __disable_ibs_on_all_vcpus(struct kvm *kvm) 1192 + { 1193 + unsigned int i; 1194 + struct kvm_vcpu *vcpu; 1195 + 1196 + kvm_for_each_vcpu(i, vcpu, kvm) { 1197 + __disable_ibs_on_vcpu(vcpu); 1198 + } 1199 + } 1200 + 1201 + static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) 1202 + { 1203 + kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); 1204 + kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu); 1205 + exit_sie_sync(vcpu); 1206 + } 
1207 + 1208 + void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) 1209 + { 1210 + int i, online_vcpus, started_vcpus = 0; 1211 + 1212 + if (!is_vcpu_stopped(vcpu)) 1213 + return; 1214 + 1215 + trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); 1216 + /* Only one cpu at a time may enter/leave the STOPPED state. */ 1217 + spin_lock_bh(&vcpu->kvm->arch.start_stop_lock); 1218 + online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); 1219 + 1220 + for (i = 0; i < online_vcpus; i++) { 1221 + if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) 1222 + started_vcpus++; 1223 + } 1224 + 1225 + if (started_vcpus == 0) { 1226 + /* we're the only active VCPU -> speed it up */ 1227 + __enable_ibs_on_vcpu(vcpu); 1228 + } else if (started_vcpus == 1) { 1229 + /* 1230 + * As we are starting a second VCPU, we have to disable 1231 + * the IBS facility on all VCPUs to remove potentially 1232 + * oustanding ENABLE requests. 1233 + */ 1234 + __disable_ibs_on_all_vcpus(vcpu->kvm); 1235 + } 1236 + 1237 + atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 1238 + /* 1239 + * Another VCPU might have used IBS while we were offline. 1240 + * Let's play safe and flush the VCPU at startup. 1241 + */ 1242 + vcpu->arch.sie_block->ihcpu = 0xffff; 1243 + spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock); 1244 + return; 1245 + } 1246 + 1247 + void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) 1248 + { 1249 + int i, online_vcpus, started_vcpus = 0; 1250 + struct kvm_vcpu *started_vcpu = NULL; 1251 + 1252 + if (is_vcpu_stopped(vcpu)) 1253 + return; 1254 + 1255 + trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); 1256 + /* Only one cpu at a time may enter/leave the STOPPED state. 
*/ 1257 + spin_lock_bh(&vcpu->kvm->arch.start_stop_lock); 1258 + online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); 1259 + 1260 + atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 1261 + __disable_ibs_on_vcpu(vcpu); 1262 + 1263 + for (i = 0; i < online_vcpus; i++) { 1264 + if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) { 1265 + started_vcpus++; 1266 + started_vcpu = vcpu->kvm->vcpus[i]; 1267 + } 1268 + } 1269 + 1270 + if (started_vcpus == 1) { 1271 + /* 1272 + * As we only have one VCPU left, we want to enable the 1273 + * IBS facility for that VCPU to speed it up. 1274 + */ 1275 + __enable_ibs_on_vcpu(started_vcpu); 1276 + } 1277 + 1278 + spin_unlock_bh(&vcpu->kvm->arch.start_stop_lock); 1279 + return; 1414 1280 } 1415 1281 1416 1282 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+69 -4
arch/s390/kvm/kvm-s390.h
··· 28 28 29 29 /* Transactional Memory Execution related macros */ 30 30 #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & 0x10)) 31 - #define TDB_ADDR 0x1800UL 32 31 #define TDB_FORMAT1 1 33 32 #define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) 34 33 ··· 61 62 #endif 62 63 } 63 64 65 + #define GUEST_PREFIX_SHIFT 13 66 + static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) 67 + { 68 + return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT; 69 + } 70 + 64 71 static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) 65 72 { 66 - vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u; 73 + vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; 67 74 vcpu->arch.sie_block->ihcpu = 0xffff; 68 75 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); 69 76 } ··· 135 130 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); 136 131 void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu); 137 132 void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu); 133 + void kvm_s390_clear_float_irqs(struct kvm *kvm); 138 134 int __must_check kvm_s390_inject_vm(struct kvm *kvm, 139 135 struct kvm_s390_interrupt *s390int); 140 136 int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, ··· 143 137 int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 144 138 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, 145 139 u64 cr6, u64 schid); 140 + void kvm_s390_reinject_io_int(struct kvm *kvm, 141 + struct kvm_s390_interrupt_info *inti); 146 142 int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked); 147 143 148 144 /* implemented in priv.c */ 145 + int is_valid_psw(psw_t *psw); 149 146 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); 150 147 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); 151 148 int kvm_s390_handle_01(struct kvm_vcpu *vcpu); 152 149 int kvm_s390_handle_b9(struct kvm_vcpu *vcpu); 153 150 int kvm_s390_handle_lpsw(struct kvm_vcpu 
*vcpu); 151 + int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); 154 152 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); 155 153 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); 156 154 157 155 /* implemented in sigp.c */ 158 156 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 157 + int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); 159 158 160 159 /* implemented in kvm-s390.c */ 160 + long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); 161 161 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); 162 162 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); 163 + void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); 164 + void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); 163 165 void s390_vcpu_block(struct kvm_vcpu *vcpu); 164 166 void s390_vcpu_unblock(struct kvm_vcpu *vcpu); 165 167 void exit_sie(struct kvm_vcpu *vcpu); 166 168 void exit_sie_sync(struct kvm_vcpu *vcpu); 167 - /* are we going to support cmma? */ 168 - bool kvm_enabled_cmma(void); 169 + int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); 170 + void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); 171 + /* is cmma enabled */ 172 + bool kvm_s390_cmma_enabled(struct kvm *kvm); 173 + int test_vfacility(unsigned long nr); 174 + 169 175 /* implemented in diag.c */ 170 176 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); 177 + /* implemented in interrupt.c */ 178 + int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, 179 + struct kvm_s390_pgm_info *pgm_info); 180 + 181 + /** 182 + * kvm_s390_inject_prog_cond - conditionally inject a program check 183 + * @vcpu: virtual cpu 184 + * @rc: original return/error code 185 + * 186 + * This function is supposed to be used after regular guest access functions 187 + * failed, to conditionally inject a program check to a vcpu. 
The typical 188 + * pattern would look like 189 + * 190 + * rc = write_guest(vcpu, addr, data, len); 191 + * if (rc) 192 + * return kvm_s390_inject_prog_cond(vcpu, rc); 193 + * 194 + * A negative return code from guest access functions implies an internal error 195 + * like e.g. out of memory. In these cases no program check should be injected 196 + * to the guest. 197 + * A positive value implies that an exception happened while accessing a guest's 198 + * memory. In this case all data belonging to the corresponding program check 199 + * has been stored in vcpu->arch.pgm and can be injected with 200 + * kvm_s390_inject_prog_irq(). 201 + * 202 + * Returns: - the original @rc value if @rc was negative (internal error) 203 + * - zero if @rc was already zero 204 + * - zero or error code from injecting if @rc was positive 205 + * (program check injected to @vcpu) 206 + */ 207 + static inline int kvm_s390_inject_prog_cond(struct kvm_vcpu *vcpu, int rc) 208 + { 209 + if (rc <= 0) 210 + return rc; 211 + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); 212 + } 171 213 172 214 /* implemented in interrupt.c */ 173 215 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 174 216 int psw_extint_disabled(struct kvm_vcpu *vcpu); 175 217 void kvm_s390_destroy_adapters(struct kvm *kvm); 218 + int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu); 219 + 220 + /* implemented in guestdbg.c */ 221 + void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); 222 + void kvm_s390_restore_guest_per_regs(struct kvm_vcpu *vcpu); 223 + void kvm_s390_patch_guest_per_regs(struct kvm_vcpu *vcpu); 224 + int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, 225 + struct kvm_guest_debug *dbg); 226 + void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); 227 + void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); 228 + void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); 176 229 177 230 #endif
+257 -98
arch/s390/kvm/priv.c
··· 35 35 { 36 36 struct kvm_vcpu *cpup; 37 37 s64 hostclk, val; 38 + int i, rc; 38 39 u64 op2; 39 - int i; 40 40 41 41 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 42 42 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); ··· 44 44 op2 = kvm_s390_get_base_disp_s(vcpu); 45 45 if (op2 & 7) /* Operand must be on a doubleword boundary */ 46 46 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 47 - if (get_guest(vcpu, val, (u64 __user *) op2)) 48 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 47 + rc = read_guest(vcpu, op2, &val, sizeof(val)); 48 + if (rc) 49 + return kvm_s390_inject_prog_cond(vcpu, rc); 49 50 50 51 if (store_tod_clock(&hostclk)) { 51 52 kvm_s390_set_psw_cc(vcpu, 3); ··· 66 65 static int handle_set_prefix(struct kvm_vcpu *vcpu) 67 66 { 68 67 u64 operand2; 69 - u32 address = 0; 70 - u8 tmp; 68 + u32 address; 69 + int rc; 71 70 72 71 vcpu->stat.instruction_spx++; 73 72 ··· 81 80 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 82 81 83 82 /* get the value */ 84 - if (get_guest(vcpu, address, (u32 __user *) operand2)) 85 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 83 + rc = read_guest(vcpu, operand2, &address, sizeof(address)); 84 + if (rc) 85 + return kvm_s390_inject_prog_cond(vcpu, rc); 86 86 87 - address = address & 0x7fffe000u; 87 + address &= 0x7fffe000u; 88 88 89 - /* make sure that the new value is valid memory */ 90 - if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || 91 - (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) 89 + /* 90 + * Make sure the new value is valid memory. We only need to check the 91 + * first page, since address is 8k aligned and memory pieces are always 92 + * at least 1MB aligned and have at least a size of 1MB. 
93 + */ 94 + if (kvm_is_error_gpa(vcpu->kvm, address)) 92 95 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 93 96 94 97 kvm_s390_set_prefix(vcpu, address); ··· 106 101 { 107 102 u64 operand2; 108 103 u32 address; 104 + int rc; 109 105 110 106 vcpu->stat.instruction_stpx++; 111 107 ··· 119 113 if (operand2 & 3) 120 114 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 121 115 122 - address = vcpu->arch.sie_block->prefix; 123 - address = address & 0x7fffe000u; 116 + address = kvm_s390_get_prefix(vcpu); 124 117 125 118 /* get the value */ 126 - if (put_guest(vcpu, address, (u32 __user *)operand2)) 127 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 119 + rc = write_guest(vcpu, operand2, &address, sizeof(address)); 120 + if (rc) 121 + return kvm_s390_inject_prog_cond(vcpu, rc); 128 122 129 123 VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); 130 124 trace_kvm_s390_handle_prefix(vcpu, 0, address); ··· 133 127 134 128 static int handle_store_cpu_address(struct kvm_vcpu *vcpu) 135 129 { 136 - u64 useraddr; 130 + u16 vcpu_id = vcpu->vcpu_id; 131 + u64 ga; 132 + int rc; 137 133 138 134 vcpu->stat.instruction_stap++; 139 135 140 136 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 141 137 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 142 138 143 - useraddr = kvm_s390_get_base_disp_s(vcpu); 139 + ga = kvm_s390_get_base_disp_s(vcpu); 144 140 145 - if (useraddr & 1) 141 + if (ga & 1) 146 142 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 147 143 148 - if (put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr)) 149 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 144 + rc = write_guest(vcpu, ga, &vcpu_id, sizeof(vcpu_id)); 145 + if (rc) 146 + return kvm_s390_inject_prog_cond(vcpu, rc); 150 147 151 - VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); 152 - trace_kvm_s390_handle_stap(vcpu, useraddr); 148 + VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", ga); 149 + 
trace_kvm_s390_handle_stap(vcpu, ga); 153 150 return 0; 154 151 } 155 152 153 + static void __skey_check_enable(struct kvm_vcpu *vcpu) 154 + { 155 + if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE))) 156 + return; 157 + 158 + s390_enable_skey(); 159 + trace_kvm_s390_skey_related_inst(vcpu); 160 + vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); 161 + } 162 + 163 + 156 164 static int handle_skey(struct kvm_vcpu *vcpu) 157 165 { 166 + __skey_check_enable(vcpu); 167 + 158 168 vcpu->stat.instruction_storage_key++; 159 169 160 170 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) ··· 182 160 return 0; 183 161 } 184 162 163 + static int handle_ipte_interlock(struct kvm_vcpu *vcpu) 164 + { 165 + psw_t *psw = &vcpu->arch.sie_block->gpsw; 166 + 167 + vcpu->stat.instruction_ipte_interlock++; 168 + if (psw_bits(*psw).p) 169 + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 170 + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); 171 + psw->addr = __rewind_psw(*psw, 4); 172 + VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); 173 + return 0; 174 + } 175 + 185 176 static int handle_test_block(struct kvm_vcpu *vcpu) 186 177 { 187 - unsigned long hva; 188 178 gpa_t addr; 189 179 int reg2; 190 180 ··· 205 171 206 172 kvm_s390_get_regs_rre(vcpu, NULL, &reg2); 207 173 addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; 174 + addr = kvm_s390_logical_to_effective(vcpu, addr); 175 + if (kvm_s390_check_low_addr_protection(vcpu, addr)) 176 + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); 208 177 addr = kvm_s390_real_to_abs(vcpu, addr); 209 178 210 - hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); 211 - if (kvm_is_error_hva(hva)) 179 + if (kvm_is_error_gpa(vcpu->kvm, addr)) 212 180 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 213 181 /* 214 182 * We don't expect errors on modern systems, and do not care 215 183 * about storage keys (yet), so let's just clear the page. 
216 184 */ 217 - if (clear_user((void __user *)hva, PAGE_SIZE) != 0) 185 + if (kvm_clear_guest(vcpu->kvm, addr, PAGE_SIZE)) 218 186 return -EFAULT; 219 187 kvm_s390_set_psw_cc(vcpu, 0); 220 188 vcpu->run->s.regs.gprs[0] = 0; ··· 226 190 static int handle_tpi(struct kvm_vcpu *vcpu) 227 191 { 228 192 struct kvm_s390_interrupt_info *inti; 193 + unsigned long len; 194 + u32 tpi_data[3]; 195 + int cc, rc; 229 196 u64 addr; 230 - int cc; 231 197 198 + rc = 0; 232 199 addr = kvm_s390_get_base_disp_s(vcpu); 233 200 if (addr & 3) 234 201 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); ··· 240 201 if (!inti) 241 202 goto no_interrupt; 242 203 cc = 1; 204 + tpi_data[0] = inti->io.subchannel_id << 16 | inti->io.subchannel_nr; 205 + tpi_data[1] = inti->io.io_int_parm; 206 + tpi_data[2] = inti->io.io_int_word; 243 207 if (addr) { 244 208 /* 245 209 * Store the two-word I/O interruption code into the 246 210 * provided area. 247 211 */ 248 - if (put_guest(vcpu, inti->io.subchannel_id, (u16 __user *)addr) 249 - || put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *)(addr + 2)) 250 - || put_guest(vcpu, inti->io.io_int_parm, (u32 __user *)(addr + 4))) 251 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 212 + len = sizeof(tpi_data) - 4; 213 + rc = write_guest(vcpu, addr, &tpi_data, len); 214 + if (rc) 215 + return kvm_s390_inject_prog_cond(vcpu, rc); 252 216 } else { 253 217 /* 254 218 * Store the three-word I/O interruption code into 255 219 * the appropriate lowcore area. 
256 220 */ 257 - put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) __LC_SUBCHANNEL_ID); 258 - put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) __LC_SUBCHANNEL_NR); 259 - put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) __LC_IO_INT_PARM); 260 - put_guest(vcpu, inti->io.io_int_word, (u32 __user *) __LC_IO_INT_WORD); 221 + len = sizeof(tpi_data); 222 + if (write_guest_lc(vcpu, __LC_SUBCHANNEL_ID, &tpi_data, len)) 223 + rc = -EFAULT; 261 224 } 262 - kfree(inti); 225 + /* 226 + * If we encounter a problem storing the interruption code, the 227 + * instruction is suppressed from the guest's view: reinject the 228 + * interrupt. 229 + */ 230 + if (!rc) 231 + kfree(inti); 232 + else 233 + kvm_s390_reinject_io_int(vcpu->kvm, inti); 263 234 no_interrupt: 264 235 /* Set condition code and we're done. */ 265 - kvm_s390_set_psw_cc(vcpu, cc); 266 - return 0; 236 + if (!rc) 237 + kvm_s390_set_psw_cc(vcpu, cc); 238 + return rc ? -EFAULT : 0; 267 239 } 268 240 269 241 static int handle_tsch(struct kvm_vcpu *vcpu) ··· 342 292 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 343 293 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 344 294 345 - rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), 346 - vfacilities, 4); 295 + rc = write_guest_lc(vcpu, offsetof(struct _lowcore, stfl_fac_list), 296 + vfacilities, 4); 347 297 if (rc) 348 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 298 + return rc; 349 299 VCPU_EVENT(vcpu, 5, "store facility list value %x", 350 300 *(unsigned int *) vfacilities); 351 301 trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities); ··· 364 314 #define PSW_ADDR_24 0x0000000000ffffffUL 365 315 #define PSW_ADDR_31 0x000000007fffffffUL 366 316 367 - static int is_valid_psw(psw_t *psw) { 317 + int is_valid_psw(psw_t *psw) 318 + { 368 319 if (psw->mask & PSW_MASK_UNASSIGNED) 369 320 return 0; 370 321 if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) { ··· 376 325 return 0; 377 326 if 
((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA) 378 327 return 0; 328 + if (psw->addr & 1) 329 + return 0; 379 330 return 1; 380 331 } 381 332 ··· 386 333 psw_t *gpsw = &vcpu->arch.sie_block->gpsw; 387 334 psw_compat_t new_psw; 388 335 u64 addr; 336 + int rc; 389 337 390 338 if (gpsw->mask & PSW_MASK_PSTATE) 391 339 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); ··· 394 340 addr = kvm_s390_get_base_disp_s(vcpu); 395 341 if (addr & 7) 396 342 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 397 - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) 398 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 343 + 344 + rc = read_guest(vcpu, addr, &new_psw, sizeof(new_psw)); 345 + if (rc) 346 + return kvm_s390_inject_prog_cond(vcpu, rc); 399 347 if (!(new_psw.mask & PSW32_MASK_BASE)) 400 348 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 401 349 gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; ··· 413 357 { 414 358 psw_t new_psw; 415 359 u64 addr; 360 + int rc; 416 361 417 362 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 418 363 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); ··· 421 364 addr = kvm_s390_get_base_disp_s(vcpu); 422 365 if (addr & 7) 423 366 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 424 - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) 425 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 367 + rc = read_guest(vcpu, addr, &new_psw, sizeof(new_psw)); 368 + if (rc) 369 + return kvm_s390_inject_prog_cond(vcpu, rc); 426 370 vcpu->arch.sie_block->gpsw = new_psw; 427 371 if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) 428 372 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); ··· 433 375 434 376 static int handle_stidp(struct kvm_vcpu *vcpu) 435 377 { 378 + u64 stidp_data = vcpu->arch.stidp_data; 436 379 u64 operand2; 380 + int rc; 437 381 438 382 vcpu->stat.instruction_stidp++; 439 383 ··· 447 387 if (operand2 & 7) 448 388 
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 449 389 450 - if (put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2)) 451 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 390 + rc = write_guest(vcpu, operand2, &stidp_data, sizeof(stidp_data)); 391 + if (rc) 392 + return kvm_s390_inject_prog_cond(vcpu, rc); 452 393 453 394 VCPU_EVENT(vcpu, 5, "%s", "store cpu id"); 454 395 return 0; ··· 535 474 break; 536 475 } 537 476 538 - if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { 539 - rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 540 - goto out_exception; 477 + rc = write_guest(vcpu, operand2, (void *)mem, PAGE_SIZE); 478 + if (rc) { 479 + rc = kvm_s390_inject_prog_cond(vcpu, rc); 480 + goto out; 541 481 } 542 482 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); 543 483 free_page(mem); ··· 547 485 return 0; 548 486 out_no_data: 549 487 kvm_s390_set_psw_cc(vcpu, 3); 550 - out_exception: 488 + out: 551 489 free_page(mem); 552 490 return rc; 553 491 } ··· 558 496 [0x10] = handle_set_prefix, 559 497 [0x11] = handle_store_prefix, 560 498 [0x12] = handle_store_cpu_address, 499 + [0x21] = handle_ipte_interlock, 561 500 [0x29] = handle_skey, 562 501 [0x2a] = handle_skey, 563 502 [0x2b] = handle_skey, ··· 576 513 [0x3a] = handle_io_inst, 577 514 [0x3b] = handle_io_inst, 578 515 [0x3c] = handle_io_inst, 516 + [0x50] = handle_ipte_interlock, 579 517 [0x5f] = handle_io_inst, 580 518 [0x74] = handle_io_inst, 581 519 [0x76] = handle_io_inst, ··· 655 591 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 656 592 657 593 start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; 594 + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { 595 + if (kvm_s390_check_low_addr_protection(vcpu, start)) 596 + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); 597 + } 598 + 658 599 switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { 659 600 case 0x00000000: 660 601 end = (start + (1UL << 12)) & ~((1UL << 12) - 1); ··· 675 
606 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 676 607 } 677 608 while (start < end) { 678 - unsigned long useraddr; 609 + unsigned long useraddr, abs_addr; 679 610 680 - useraddr = gmap_translate(start, vcpu->arch.gmap); 681 - if (IS_ERR((void *)useraddr)) 611 + /* Translate guest address to host address */ 612 + if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0) 613 + abs_addr = kvm_s390_real_to_abs(vcpu, start); 614 + else 615 + abs_addr = start; 616 + useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr)); 617 + if (kvm_is_error_hva(useraddr)) 682 618 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 683 619 684 620 if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { ··· 692 618 } 693 619 694 620 if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) { 621 + __skey_check_enable(vcpu); 695 622 if (set_guest_storage_key(current->mm, useraddr, 696 623 vcpu->run->s.regs.gprs[reg1] & PFMF_KEY, 697 624 vcpu->run->s.regs.gprs[reg1] & PFMF_NQ)) ··· 717 642 VCPU_EVENT(vcpu, 5, "cmma release %d pages", entries); 718 643 gmap = vcpu->arch.gmap; 719 644 vcpu->stat.instruction_essa++; 720 - if (!kvm_enabled_cmma() || !vcpu->arch.sie_block->cbrlo) 645 + if (!kvm_s390_cmma_enabled(vcpu->kvm)) 721 646 return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); 722 647 723 648 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) ··· 747 672 } 748 673 749 674 static const intercept_handler_t b9_handlers[256] = { 675 + [0x8a] = handle_ipte_interlock, 750 676 [0x8d] = handle_epsw, 677 + [0x8e] = handle_ipte_interlock, 678 + [0x8f] = handle_ipte_interlock, 751 679 [0xab] = handle_essa, 752 680 [0xaf] = handle_pfmf, 753 681 }; ··· 771 693 { 772 694 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 773 695 int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 774 - u64 useraddr; 775 696 u32 val = 0; 776 697 int reg, rc; 698 + u64 ga; 777 699 778 700 vcpu->stat.instruction_lctl++; 779 701 780 702 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 781 703 return 
kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 782 704 783 - useraddr = kvm_s390_get_base_disp_rs(vcpu); 705 + ga = kvm_s390_get_base_disp_rs(vcpu); 784 706 785 - if (useraddr & 3) 707 + if (ga & 3) 786 708 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 787 709 788 - VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3, 789 - useraddr); 790 - trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr); 710 + VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); 711 + trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, ga); 791 712 792 713 reg = reg1; 793 714 do { 794 - rc = get_guest(vcpu, val, (u32 __user *) useraddr); 715 + rc = read_guest(vcpu, ga, &val, sizeof(val)); 795 716 if (rc) 796 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 717 + return kvm_s390_inject_prog_cond(vcpu, rc); 797 718 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; 798 719 vcpu->arch.sie_block->gcr[reg] |= val; 799 - useraddr += 4; 720 + ga += 4; 721 + if (reg == reg3) 722 + break; 723 + reg = (reg + 1) % 16; 724 + } while (1); 725 + 726 + return 0; 727 + } 728 + 729 + int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu) 730 + { 731 + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 732 + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 733 + u64 ga; 734 + u32 val; 735 + int reg, rc; 736 + 737 + vcpu->stat.instruction_stctl++; 738 + 739 + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 740 + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 741 + 742 + ga = kvm_s390_get_base_disp_rs(vcpu); 743 + 744 + if (ga & 3) 745 + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 746 + 747 + VCPU_EVENT(vcpu, 5, "stctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); 748 + trace_kvm_s390_handle_stctl(vcpu, 0, reg1, reg3, ga); 749 + 750 + reg = reg1; 751 + do { 752 + val = vcpu->arch.sie_block->gcr[reg] & 0x00000000fffffffful; 753 + rc = write_guest(vcpu, ga, &val, sizeof(val)); 754 + if (rc) 755 + return 
kvm_s390_inject_prog_cond(vcpu, rc); 756 + ga += 4; 800 757 if (reg == reg3) 801 758 break; 802 759 reg = (reg + 1) % 16; ··· 844 731 { 845 732 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 846 733 int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 847 - u64 useraddr; 734 + u64 ga, val; 848 735 int reg, rc; 849 736 850 737 vcpu->stat.instruction_lctlg++; ··· 852 739 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 853 740 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 854 741 855 - useraddr = kvm_s390_get_base_disp_rsy(vcpu); 742 + ga = kvm_s390_get_base_disp_rsy(vcpu); 856 743 857 - if (useraddr & 7) 744 + if (ga & 7) 858 745 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 859 746 860 747 reg = reg1; 861 748 862 - VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3, 863 - useraddr); 864 - trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); 749 + VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); 750 + trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, ga); 865 751 866 752 do { 867 - rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], 868 - (u64 __user *) useraddr); 753 + rc = read_guest(vcpu, ga, &val, sizeof(val)); 869 754 if (rc) 870 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 871 - useraddr += 8; 755 + return kvm_s390_inject_prog_cond(vcpu, rc); 756 + vcpu->arch.sie_block->gcr[reg] = val; 757 + ga += 8; 758 + if (reg == reg3) 759 + break; 760 + reg = (reg + 1) % 16; 761 + } while (1); 762 + 763 + return 0; 764 + } 765 + 766 + static int handle_stctg(struct kvm_vcpu *vcpu) 767 + { 768 + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 769 + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 770 + u64 ga, val; 771 + int reg, rc; 772 + 773 + vcpu->stat.instruction_stctg++; 774 + 775 + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 776 + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 777 + 778 + ga = kvm_s390_get_base_disp_rsy(vcpu); 779 + 780 + if (ga & 
7) 781 + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 782 + 783 + reg = reg1; 784 + 785 + VCPU_EVENT(vcpu, 5, "stctg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); 786 + trace_kvm_s390_handle_stctl(vcpu, 1, reg1, reg3, ga); 787 + 788 + do { 789 + val = vcpu->arch.sie_block->gcr[reg]; 790 + rc = write_guest(vcpu, ga, &val, sizeof(val)); 791 + if (rc) 792 + return kvm_s390_inject_prog_cond(vcpu, rc); 793 + ga += 8; 872 794 if (reg == reg3) 873 795 break; 874 796 reg = (reg + 1) % 16; ··· 914 766 915 767 static const intercept_handler_t eb_handlers[256] = { 916 768 [0x2f] = handle_lctlg, 769 + [0x25] = handle_stctg, 917 770 }; 918 771 919 772 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) ··· 930 781 static int handle_tprot(struct kvm_vcpu *vcpu) 931 782 { 932 783 u64 address1, address2; 933 - struct vm_area_struct *vma; 934 - unsigned long user_address; 784 + unsigned long hva, gpa; 785 + int ret = 0, cc = 0; 786 + bool writable; 935 787 936 788 vcpu->stat.instruction_tprot++; 937 789 ··· 943 793 944 794 /* we only handle the Linux memory detection case: 945 795 * access key == 0 946 - * guest DAT == off 947 796 * everything else goes to userspace. */ 948 797 if (address2 & 0xf0) 949 798 return -EOPNOTSUPP; 950 799 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) 951 - return -EOPNOTSUPP; 800 + ipte_lock(vcpu); 801 + ret = guest_translate_address(vcpu, address1, &gpa, 1); 802 + if (ret == PGM_PROTECTION) { 803 + /* Write protected? Try again with read-only... 
*/ 804 + cc = 1; 805 + ret = guest_translate_address(vcpu, address1, &gpa, 0); 806 + } 807 + if (ret) { 808 + if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) { 809 + ret = kvm_s390_inject_program_int(vcpu, ret); 810 + } else if (ret > 0) { 811 + /* Translation not available */ 812 + kvm_s390_set_psw_cc(vcpu, 3); 813 + ret = 0; 814 + } 815 + goto out_unlock; 816 + } 952 817 953 - down_read(&current->mm->mmap_sem); 954 - user_address = __gmap_translate(address1, vcpu->arch.gmap); 955 - if (IS_ERR_VALUE(user_address)) 956 - goto out_inject; 957 - vma = find_vma(current->mm, user_address); 958 - if (!vma) 959 - goto out_inject; 960 - vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 961 - if (!(vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_READ)) 962 - vcpu->arch.sie_block->gpsw.mask |= (1ul << 44); 963 - if (!(vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_READ)) 964 - vcpu->arch.sie_block->gpsw.mask |= (2ul << 44); 965 - 966 - up_read(&current->mm->mmap_sem); 967 - return 0; 968 - 969 - out_inject: 970 - up_read(&current->mm->mmap_sem); 971 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 818 + hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); 819 + if (kvm_is_error_hva(hva)) { 820 + ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 821 + } else { 822 + if (!writable) 823 + cc = 1; /* Write not permitted ==> read-only */ 824 + kvm_s390_set_psw_cc(vcpu, cc); 825 + /* Note: CC2 only occurs for storage keys (not supported yet) */ 826 + } 827 + out_unlock: 828 + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) 829 + ipte_unlock(vcpu); 830 + return ret; 972 831 } 973 832 974 833 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu)
+60 -43
arch/s390/kvm/sigp.c
··· 54 54 55 55 static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) 56 56 { 57 - struct kvm_s390_local_interrupt *li; 58 - struct kvm_s390_interrupt_info *inti; 57 + struct kvm_s390_interrupt s390int = { 58 + .type = KVM_S390_INT_EMERGENCY, 59 + .parm = vcpu->vcpu_id, 60 + }; 59 61 struct kvm_vcpu *dst_vcpu = NULL; 62 + int rc = 0; 60 63 61 64 if (cpu_addr < KVM_MAX_VCPUS) 62 65 dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); 63 66 if (!dst_vcpu) 64 67 return SIGP_CC_NOT_OPERATIONAL; 65 68 66 - inti = kzalloc(sizeof(*inti), GFP_KERNEL); 67 - if (!inti) 68 - return -ENOMEM; 69 + rc = kvm_s390_inject_vcpu(dst_vcpu, &s390int); 70 + if (!rc) 71 + VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); 69 72 70 - inti->type = KVM_S390_INT_EMERGENCY; 71 - inti->emerg.code = vcpu->vcpu_id; 72 - 73 - li = &dst_vcpu->arch.local_int; 74 - spin_lock_bh(&li->lock); 75 - list_add_tail(&inti->list, &li->list); 76 - atomic_set(&li->active, 1); 77 - atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 78 - if (waitqueue_active(li->wq)) 79 - wake_up_interruptible(li->wq); 80 - spin_unlock_bh(&li->lock); 81 - VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); 82 - 83 - return SIGP_CC_ORDER_CODE_ACCEPTED; 73 + return rc ? 
rc : SIGP_CC_ORDER_CODE_ACCEPTED; 84 74 } 85 75 86 76 static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr, ··· 106 116 107 117 static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) 108 118 { 109 - struct kvm_s390_local_interrupt *li; 110 - struct kvm_s390_interrupt_info *inti; 119 + struct kvm_s390_interrupt s390int = { 120 + .type = KVM_S390_INT_EXTERNAL_CALL, 121 + .parm = vcpu->vcpu_id, 122 + }; 111 123 struct kvm_vcpu *dst_vcpu = NULL; 124 + int rc; 112 125 113 126 if (cpu_addr < KVM_MAX_VCPUS) 114 127 dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); 115 128 if (!dst_vcpu) 116 129 return SIGP_CC_NOT_OPERATIONAL; 117 130 118 - inti = kzalloc(sizeof(*inti), GFP_KERNEL); 119 - if (!inti) 120 - return -ENOMEM; 131 + rc = kvm_s390_inject_vcpu(dst_vcpu, &s390int); 132 + if (!rc) 133 + VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); 121 134 122 - inti->type = KVM_S390_INT_EXTERNAL_CALL; 123 - inti->extcall.code = vcpu->vcpu_id; 124 - 125 - li = &dst_vcpu->arch.local_int; 126 - spin_lock_bh(&li->lock); 127 - list_add_tail(&inti->list, &li->list); 128 - atomic_set(&li->active, 1); 129 - atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 130 - if (waitqueue_active(li->wq)) 131 - wake_up_interruptible(li->wq); 132 - spin_unlock_bh(&li->lock); 133 - VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); 134 - 135 - return SIGP_CC_ORDER_CODE_ACCEPTED; 135 + return rc ? 
rc : SIGP_CC_ORDER_CODE_ACCEPTED; 136 136 } 137 137 138 138 static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) ··· 215 235 struct kvm_vcpu *dst_vcpu = NULL; 216 236 struct kvm_s390_interrupt_info *inti; 217 237 int rc; 218 - u8 tmp; 219 238 220 239 if (cpu_addr < KVM_MAX_VCPUS) 221 240 dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); ··· 222 243 return SIGP_CC_NOT_OPERATIONAL; 223 244 li = &dst_vcpu->arch.local_int; 224 245 225 - /* make sure that the new value is valid memory */ 226 - address = address & 0x7fffe000u; 227 - if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || 228 - copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)) { 246 + /* 247 + * Make sure the new value is valid memory. We only need to check the 248 + * first page, since address is 8k aligned and memory pieces are always 249 + * at least 1MB aligned and have at least a size of 1MB. 250 + */ 251 + address &= 0x7fffe000u; 252 + if (kvm_is_error_gpa(vcpu->kvm, address)) { 229 253 *reg &= 0xffffffff00000000UL; 230 254 *reg |= SIGP_STATUS_INVALID_PARAMETER; 231 255 return SIGP_CC_STATUS_STORED; ··· 437 455 438 456 kvm_s390_set_psw_cc(vcpu, rc); 439 457 return 0; 458 + } 459 + 460 + /* 461 + * Handle SIGP partial execution interception. 462 + * 463 + * This interception will occur at the source cpu when a source cpu sends an 464 + * external call to a target cpu and the target cpu has the WAIT bit set in 465 + * its cpuflags. Interception will occurr after the interrupt indicator bits at 466 + * the target cpu have been set. All error cases will lead to instruction 467 + * interception, therefore nothing is to be checked or prepared. 
468 + */ 469 + int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) 470 + { 471 + int r3 = vcpu->arch.sie_block->ipa & 0x000f; 472 + u16 cpu_addr = vcpu->run->s.regs.gprs[r3]; 473 + struct kvm_vcpu *dest_vcpu; 474 + u8 order_code = kvm_s390_get_base_disp_rs(vcpu); 475 + 476 + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); 477 + 478 + if (order_code == SIGP_EXTERNAL_CALL) { 479 + dest_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); 480 + BUG_ON(dest_vcpu == NULL); 481 + 482 + spin_lock_bh(&dest_vcpu->arch.local_int.lock); 483 + if (waitqueue_active(&dest_vcpu->wq)) 484 + wake_up_interruptible(&dest_vcpu->wq); 485 + dest_vcpu->preempted = true; 486 + spin_unlock_bh(&dest_vcpu->arch.local_int.lock); 487 + 488 + kvm_s390_set_psw_cc(vcpu, SIGP_CC_ORDER_CODE_ACCEPTED); 489 + return 0; 490 + } 491 + 492 + return -EOPNOTSUPP; 440 493 }
+43
arch/s390/kvm/trace-s390.h
··· 68 68 ); 69 69 70 70 /* 71 + * Trace point for start and stop of vpcus. 72 + */ 73 + TRACE_EVENT(kvm_s390_vcpu_start_stop, 74 + TP_PROTO(unsigned int id, int state), 75 + TP_ARGS(id, state), 76 + 77 + TP_STRUCT__entry( 78 + __field(unsigned int, id) 79 + __field(int, state) 80 + ), 81 + 82 + TP_fast_assign( 83 + __entry->id = id; 84 + __entry->state = state; 85 + ), 86 + 87 + TP_printk("%s cpu %d", __entry->state ? "starting" : "stopping", 88 + __entry->id) 89 + ); 90 + 91 + /* 71 92 * Trace points for injection of interrupts, either per machine or 72 93 * per vcpu. 73 94 */ ··· 242 221 243 222 TP_printk("enabling channel I/O support (kvm @ %p)\n", 244 223 __entry->kvm) 224 + ); 225 + 226 + /* 227 + * Trace point for enabling and disabling interlocking-and-broadcasting 228 + * suppression. 229 + */ 230 + TRACE_EVENT(kvm_s390_enable_disable_ibs, 231 + TP_PROTO(unsigned int id, int state), 232 + TP_ARGS(id, state), 233 + 234 + TP_STRUCT__entry( 235 + __field(unsigned int, id) 236 + __field(int, state) 237 + ), 238 + 239 + TP_fast_assign( 240 + __entry->id = id; 241 + __entry->state = state; 242 + ), 243 + 244 + TP_printk("%s ibs on cpu %d", 245 + __entry->state ? "enabling" : "disabling", __entry->id) 245 246 ); 246 247 247 248
+64 -35
arch/s390/kvm/trace.h
··· 2 2 #define _TRACE_KVM_H 3 3 4 4 #include <linux/tracepoint.h> 5 - #include <asm/sigp.h> 5 + #include <asm/sie.h> 6 6 #include <asm/debug.h> 7 7 #include <asm/dis.h> 8 8 ··· 29 29 #define VCPU_TP_PRINTK(p_str, p_args...) \ 30 30 TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \ 31 31 __entry->pswmask, __entry->pswaddr, p_args) 32 + 33 + TRACE_EVENT(kvm_s390_skey_related_inst, 34 + TP_PROTO(VCPU_PROTO_COMMON), 35 + TP_ARGS(VCPU_ARGS_COMMON), 36 + 37 + TP_STRUCT__entry( 38 + VCPU_FIELD_COMMON 39 + ), 40 + 41 + TP_fast_assign( 42 + VCPU_ASSIGN_COMMON 43 + ), 44 + VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu") 45 + ); 32 46 33 47 TRACE_EVENT(kvm_s390_major_guest_pfault, 34 48 TP_PROTO(VCPU_PROTO_COMMON), ··· 125 111 VCPU_TP_PRINTK("%s", "fault in sie instruction") 126 112 ); 127 113 128 - #define sie_intercept_code \ 129 - {0x04, "Instruction"}, \ 130 - {0x08, "Program interruption"}, \ 131 - {0x0C, "Instruction and program interruption"}, \ 132 - {0x10, "External request"}, \ 133 - {0x14, "External interruption"}, \ 134 - {0x18, "I/O request"}, \ 135 - {0x1C, "Wait state"}, \ 136 - {0x20, "Validity"}, \ 137 - {0x28, "Stop request"} 138 - 139 114 TRACE_EVENT(kvm_s390_sie_exit, 140 115 TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode), 141 116 TP_ARGS(VCPU_ARGS_COMMON, icptcode), ··· 154 151 TP_STRUCT__entry( 155 152 VCPU_FIELD_COMMON 156 153 __field(__u64, instruction) 157 - __field(char, insn[8]) 158 154 ), 159 155 160 156 TP_fast_assign( ··· 164 162 165 163 VCPU_TP_PRINTK("intercepted instruction %016llx (%s)", 166 164 __entry->instruction, 167 - insn_to_mnemonic((unsigned char *) 168 - &__entry->instruction, 169 - __entry->insn, sizeof(__entry->insn)) ? 170 - "unknown" : __entry->insn) 165 + __print_symbolic(icpt_insn_decoder(__entry->instruction), 166 + icpt_insn_codes)) 171 167 ); 172 168 173 169 /* ··· 213 213 * Trace points for instructions that are of special interest. 
214 214 */ 215 215 216 - #define sigp_order_codes \ 217 - {SIGP_SENSE, "sense"}, \ 218 - {SIGP_EXTERNAL_CALL, "external call"}, \ 219 - {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \ 220 - {SIGP_STOP, "stop"}, \ 221 - {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \ 222 - {SIGP_SET_ARCHITECTURE, "set architecture"}, \ 223 - {SIGP_SET_PREFIX, "set prefix"}, \ 224 - {SIGP_STORE_STATUS_AT_ADDRESS, "store status at addr"}, \ 225 - {SIGP_SENSE_RUNNING, "sense running"}, \ 226 - {SIGP_RESTART, "restart"} 227 - 228 216 TRACE_EVENT(kvm_s390_handle_sigp, 229 217 TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \ 230 218 __u32 parameter), ··· 239 251 __entry->cpu_addr, __entry->parameter) 240 252 ); 241 253 242 - #define diagnose_codes \ 243 - {0x10, "release pages"}, \ 244 - {0x44, "time slice end"}, \ 245 - {0x308, "ipl functions"}, \ 246 - {0x500, "kvm hypercall"}, \ 247 - {0x501, "kvm breakpoint"} 254 + TRACE_EVENT(kvm_s390_handle_sigp_pei, 255 + TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr), 256 + TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr), 257 + 258 + TP_STRUCT__entry( 259 + VCPU_FIELD_COMMON 260 + __field(__u8, order_code) 261 + __field(__u16, cpu_addr) 262 + ), 263 + 264 + TP_fast_assign( 265 + VCPU_ASSIGN_COMMON 266 + __entry->order_code = order_code; 267 + __entry->cpu_addr = cpu_addr; 268 + ), 269 + 270 + VCPU_TP_PRINTK("handle sigp pei order %02x (%s), cpu address %04x", 271 + __entry->order_code, 272 + __print_symbolic(__entry->order_code, 273 + sigp_order_codes), 274 + __entry->cpu_addr) 275 + ); 248 276 249 277 TRACE_EVENT(kvm_s390_handle_diag, 250 278 TP_PROTO(VCPU_PROTO_COMMON, __u16 code), ··· 302 298 303 299 VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx", 304 300 __entry->g ? 
"lctlg" : "lctl", 301 + __entry->reg1, __entry->reg3, __entry->addr) 302 + ); 303 + 304 + TRACE_EVENT(kvm_s390_handle_stctl, 305 + TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr), 306 + TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr), 307 + 308 + TP_STRUCT__entry( 309 + VCPU_FIELD_COMMON 310 + __field(int, g) 311 + __field(int, reg1) 312 + __field(int, reg3) 313 + __field(u64, addr) 314 + ), 315 + 316 + TP_fast_assign( 317 + VCPU_ASSIGN_COMMON 318 + __entry->g = g; 319 + __entry->reg1 = reg1; 320 + __entry->reg3 = reg3; 321 + __entry->addr = addr; 322 + ), 323 + 324 + VCPU_TP_PRINTK("%s: storing cr %x-%x to %016llx", 325 + __entry->g ? "stctg" : "stctl", 305 326 __entry->reg1, __entry->reg3, __entry->addr) 306 327 ); 307 328
+73 -16
arch/s390/mm/pgtable.c
··· 834 834 } 835 835 spin_unlock(&gmap_notifier_lock); 836 836 } 837 + EXPORT_SYMBOL_GPL(gmap_do_ipte_notify); 837 838 838 839 static inline int page_table_with_pgste(struct page *page) 839 840 { ··· 867 866 atomic_set(&page->_mapcount, 0); 868 867 table = (unsigned long *) page_to_phys(page); 869 868 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); 870 - clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT, 871 - PAGE_SIZE/2); 869 + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); 872 870 return table; 873 871 } 874 872 ··· 885 885 __free_page(page); 886 886 } 887 887 888 - static inline unsigned long page_table_reset_pte(struct mm_struct *mm, 889 - pmd_t *pmd, unsigned long addr, unsigned long end) 888 + static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd, 889 + unsigned long addr, unsigned long end, bool init_skey) 890 890 { 891 891 pte_t *start_pte, *pte; 892 892 spinlock_t *ptl; ··· 897 897 do { 898 898 pgste = pgste_get_lock(pte); 899 899 pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; 900 + if (init_skey) { 901 + unsigned long address; 902 + 903 + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT | 904 + PGSTE_GR_BIT | PGSTE_GC_BIT); 905 + 906 + /* skip invalid and not writable pages */ 907 + if (pte_val(*pte) & _PAGE_INVALID || 908 + !(pte_val(*pte) & _PAGE_WRITE)) { 909 + pgste_set_unlock(pte, pgste); 910 + continue; 911 + } 912 + 913 + address = pte_val(*pte) & PAGE_MASK; 914 + page_set_storage_key(address, PAGE_DEFAULT_KEY, 1); 915 + } 900 916 pgste_set_unlock(pte, pgste); 901 917 } while (pte++, addr += PAGE_SIZE, addr != end); 902 918 pte_unmap_unlock(start_pte, ptl); ··· 920 904 return addr; 921 905 } 922 906 923 - static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, 924 - pud_t *pud, unsigned long addr, unsigned long end) 907 + static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud, 908 + unsigned long addr, unsigned long end, bool init_skey) 925 909 { 926 910 
unsigned long next; 927 911 pmd_t *pmd; ··· 931 915 next = pmd_addr_end(addr, end); 932 916 if (pmd_none_or_clear_bad(pmd)) 933 917 continue; 934 - next = page_table_reset_pte(mm, pmd, addr, next); 918 + next = page_table_reset_pte(mm, pmd, addr, next, init_skey); 935 919 } while (pmd++, addr = next, addr != end); 936 920 937 921 return addr; 938 922 } 939 923 940 - static inline unsigned long page_table_reset_pud(struct mm_struct *mm, 941 - pgd_t *pgd, unsigned long addr, unsigned long end) 924 + static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd, 925 + unsigned long addr, unsigned long end, bool init_skey) 942 926 { 943 927 unsigned long next; 944 928 pud_t *pud; ··· 948 932 next = pud_addr_end(addr, end); 949 933 if (pud_none_or_clear_bad(pud)) 950 934 continue; 951 - next = page_table_reset_pmd(mm, pud, addr, next); 935 + next = page_table_reset_pmd(mm, pud, addr, next, init_skey); 952 936 } while (pud++, addr = next, addr != end); 953 937 954 938 return addr; 955 939 } 956 940 957 - void page_table_reset_pgste(struct mm_struct *mm, 958 - unsigned long start, unsigned long end) 941 + void page_table_reset_pgste(struct mm_struct *mm, unsigned long start, 942 + unsigned long end, bool init_skey) 959 943 { 960 944 unsigned long addr, next; 961 945 pgd_t *pgd; 962 946 947 + down_write(&mm->mmap_sem); 948 + if (init_skey && mm_use_skey(mm)) 949 + goto out_up; 963 950 addr = start; 964 - down_read(&mm->mmap_sem); 965 951 pgd = pgd_offset(mm, addr); 966 952 do { 967 953 next = pgd_addr_end(addr, end); 968 954 if (pgd_none_or_clear_bad(pgd)) 969 955 continue; 970 - next = page_table_reset_pud(mm, pgd, addr, next); 956 + next = page_table_reset_pud(mm, pgd, addr, next, init_skey); 971 957 } while (pgd++, addr = next, addr != end); 972 - up_read(&mm->mmap_sem); 958 + if (init_skey) 959 + current->mm->context.use_skey = 1; 960 + out_up: 961 + up_write(&mm->mmap_sem); 973 962 } 974 963 EXPORT_SYMBOL(page_table_reset_pgste); 975 964 ··· 1012 
991 /* changing the guest storage key is considered a change of the page */ 1013 992 if ((pgste_val(new) ^ pgste_val(old)) & 1014 993 (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) 1015 - pgste_val(new) |= PGSTE_HC_BIT; 994 + pgste_val(new) |= PGSTE_UC_BIT; 1016 995 1017 996 pgste_set_unlock(ptep, new); 1018 997 pte_unmap_unlock(*ptep, ptl); ··· 1032 1011 unsigned long vmaddr) 1033 1012 { 1034 1013 return NULL; 1014 + } 1015 + 1016 + void page_table_reset_pgste(struct mm_struct *mm, unsigned long start, 1017 + unsigned long end, bool init_skey) 1018 + { 1035 1019 } 1036 1020 1037 1021 static inline void page_table_free_pgste(unsigned long *table) ··· 1384 1358 return mm->context.has_pgste ? 0 : -ENOMEM; 1385 1359 } 1386 1360 EXPORT_SYMBOL_GPL(s390_enable_sie); 1361 + 1362 + /* 1363 + * Enable storage key handling from now on and initialize the storage 1364 + * keys with the default key. 1365 + */ 1366 + void s390_enable_skey(void) 1367 + { 1368 + page_table_reset_pgste(current->mm, 0, TASK_SIZE, true); 1369 + } 1370 + EXPORT_SYMBOL_GPL(s390_enable_skey); 1371 + 1372 + /* 1373 + * Test and reset if a guest page is dirty 1374 + */ 1375 + bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap) 1376 + { 1377 + pte_t *pte; 1378 + spinlock_t *ptl; 1379 + bool dirty = false; 1380 + 1381 + pte = get_locked_pte(gmap->mm, address, &ptl); 1382 + if (unlikely(!pte)) 1383 + return false; 1384 + 1385 + if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte)) 1386 + dirty = true; 1387 + 1388 + spin_unlock(ptl); 1389 + return dirty; 1390 + } 1391 + EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty); 1387 1392 1388 1393 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1389 1394 int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
-1
arch/x86/include/asm/kvm_emulate.h
··· 189 189 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); 190 190 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); 191 191 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); 192 - void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); 193 192 int (*cpl)(struct x86_emulate_ctxt *ctxt); 194 193 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); 195 194 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+1 -6
arch/x86/include/asm/kvm_host.h
··· 50 50 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 51 51 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 52 52 53 - #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 54 - #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 55 - #define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL 56 - #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 57 - 0xFFFFFF0000000000ULL) 53 + #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL 58 54 #define CR4_RESERVED_BITS \ 59 55 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 60 56 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ ··· 130 134 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 131 135 VCPU_EXREG_CR3, 132 136 VCPU_EXREG_RFLAGS, 133 - VCPU_EXREG_CPL, 134 137 VCPU_EXREG_SEGMENTS, 135 138 }; 136 139
+5
arch/x86/include/asm/traps.h
··· 74 74 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); 75 75 #ifdef CONFIG_TRACING 76 76 dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); 77 + #else 78 + static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) 79 + { 80 + do_page_fault(regs, error); 81 + } 77 82 #endif 78 83 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); 79 84 dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
+1 -1
arch/x86/kernel/kvm.c
··· 259 259 260 260 switch (kvm_read_and_reset_pf_reason()) { 261 261 default: 262 - do_page_fault(regs, error_code); 262 + trace_do_page_fault(regs, error_code); 263 263 break; 264 264 case KVM_PV_REASON_PAGE_NOT_PRESENT: 265 265 /* page is swapped out by the host. */
+10 -1
arch/x86/kvm/cpuid.c
··· 283 283 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 284 284 /* cpuid 1.ecx */ 285 285 const u32 kvm_supported_word4_x86_features = 286 + /* NOTE: MONITOR (and MWAIT) are emulated as NOP, 287 + * but *not* advertised to guests via CPUID ! */ 286 288 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 287 289 0 /* DS-CPL, VMX, SMX, EST */ | 288 290 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | ··· 497 495 entry->ecx &= kvm_supported_word6_x86_features; 498 496 cpuid_mask(&entry->ecx, 6); 499 497 break; 498 + case 0x80000007: /* Advanced power management */ 499 + /* invariant TSC is CPUID.80000007H:EDX[8] */ 500 + entry->edx &= (1 << 8); 501 + /* mask against host */ 502 + entry->edx &= boot_cpu_data.x86_power; 503 + entry->eax = entry->ebx = entry->ecx = 0; 504 + break; 500 505 case 0x80000008: { 501 506 unsigned g_phys_as = (entry->eax >> 16) & 0xff; 502 507 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); ··· 534 525 case 3: /* Processor serial number */ 535 526 case 5: /* MONITOR/MWAIT */ 536 527 case 6: /* Thermal management */ 537 - case 0x80000007: /* Advanced power management */ 538 528 case 0xC0000002: 539 529 case 0xC0000003: 540 530 case 0xC0000004: ··· 734 726 not_found: 735 727 return 36; 736 728 } 729 + EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); 737 730 738 731 /* 739 732 * If no match is found, check whether we exceed the vCPU's limit
+7
arch/x86/kvm/cpuid.h
··· 88 88 return best && (best->ecx & bit(X86_FEATURE_X2APIC)); 89 89 } 90 90 91 + static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) 92 + { 93 + struct kvm_cpuid_entry2 *best; 94 + 95 + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 96 + return best && (best->edx & bit(X86_FEATURE_GBPAGES)); 97 + } 91 98 #endif
+52 -41
arch/x86/kvm/emulate.c
··· 161 161 #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ 162 162 #define NoWrite ((u64)1 << 45) /* No writeback */ 163 163 #define SrcWrite ((u64)1 << 46) /* Write back src operand */ 164 + #define NoMod ((u64)1 << 47) /* Mod field is ignored */ 164 165 165 166 #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) 166 167 ··· 1078 1077 ctxt->modrm_rm |= (ctxt->modrm & 0x07); 1079 1078 ctxt->modrm_seg = VCPU_SREG_DS; 1080 1079 1081 - if (ctxt->modrm_mod == 3) { 1080 + if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { 1082 1081 op->type = OP_REG; 1083 1082 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 1084 1083 op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ··· 1325 1324 rc->end = n * size; 1326 1325 } 1327 1326 1328 - if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) { 1327 + if (ctxt->rep_prefix && (ctxt->d & String) && 1328 + !(ctxt->eflags & EFLG_DF)) { 1329 1329 ctxt->dst.data = rc->data + rc->pos; 1330 1330 ctxt->dst.type = OP_MEM_STR; 1331 1331 ctxt->dst.count = (rc->end - rc->pos) / size; ··· 1411 1409 } 1412 1410 1413 1411 /* Does not support long mode */ 1414 - static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1415 - u16 selector, int seg) 1412 + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1413 + u16 selector, int seg, u8 cpl, bool in_task_switch) 1416 1414 { 1417 1415 struct desc_struct seg_desc, old_desc; 1418 - u8 dpl, rpl, cpl; 1416 + u8 dpl, rpl; 1419 1417 unsigned err_vec = GP_VECTOR; 1420 1418 u32 err_code = 0; 1421 1419 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ ··· 1443 1441 } 1444 1442 1445 1443 rpl = selector & 3; 1446 - cpl = ctxt->ops->cpl(ctxt); 1447 1444 1448 1445 /* NULL selector is not valid for TR, CS and SS (except for long mode) */ 1449 1446 if ((seg == VCPU_SREG_CS ··· 1487 1486 goto exception; 1488 1487 break; 1489 1488 case VCPU_SREG_CS: 1489 + if (in_task_switch && rpl != dpl) 1490 + goto exception; 1491 + 1490 1492 if (!(seg_desc.type & 8)) 1491 1493 
goto exception; 1492 1494 ··· 1545 1541 exception: 1546 1542 emulate_exception(ctxt, err_vec, err_code, true); 1547 1543 return X86EMUL_PROPAGATE_FAULT; 1544 + } 1545 + 1546 + static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1547 + u16 selector, int seg) 1548 + { 1549 + u8 cpl = ctxt->ops->cpl(ctxt); 1550 + return __load_segment_descriptor(ctxt, selector, seg, cpl, false); 1548 1551 } 1549 1552 1550 1553 static void write_register_operand(struct operand *op) ··· 2415 2404 struct tss_segment_16 *tss) 2416 2405 { 2417 2406 int ret; 2407 + u8 cpl; 2418 2408 2419 2409 ctxt->_eip = tss->ip; 2420 2410 ctxt->eflags = tss->flag | 2; ··· 2438 2426 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); 2439 2427 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); 2440 2428 2429 + cpl = tss->cs & 3; 2430 + 2441 2431 /* 2442 2432 * Now load segment descriptors. If fault happens at this stage 2443 2433 * it is handled in a context of new task 2444 2434 */ 2445 - ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); 2435 + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); 2446 2436 if (ret != X86EMUL_CONTINUE) 2447 2437 return ret; 2448 - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); 2438 + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); 2449 2439 if (ret != X86EMUL_CONTINUE) 2450 2440 return ret; 2451 - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); 2441 + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); 2452 2442 if (ret != X86EMUL_CONTINUE) 2453 2443 return ret; 2454 - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); 2444 + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); 2455 2445 if (ret != X86EMUL_CONTINUE) 2456 2446 return ret; 2457 - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); 2447 + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); 2458 2448 if (ret != X86EMUL_CONTINUE) 2459 
2449 return ret; 2460 2450 ··· 2510 2496 static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, 2511 2497 struct tss_segment_32 *tss) 2512 2498 { 2513 - tss->cr3 = ctxt->ops->get_cr(ctxt, 3); 2499 + /* CR3 and ldt selector are not saved intentionally */ 2514 2500 tss->eip = ctxt->_eip; 2515 2501 tss->eflags = ctxt->eflags; 2516 2502 tss->eax = reg_read(ctxt, VCPU_REGS_RAX); ··· 2528 2514 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); 2529 2515 tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); 2530 2516 tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); 2531 - tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); 2532 2517 } 2533 2518 2534 2519 static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2535 2520 struct tss_segment_32 *tss) 2536 2521 { 2537 2522 int ret; 2523 + u8 cpl; 2538 2524 2539 2525 if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) 2540 2526 return emulate_gp(ctxt, 0); ··· 2553 2539 2554 2540 /* 2555 2541 * SDM says that segment selectors are loaded before segment 2556 - * descriptors 2542 + * descriptors. This is important because CPL checks will 2543 + * use CS.RPL. 2557 2544 */ 2558 2545 set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); 2559 2546 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); ··· 2568 2553 * If we're switching between Protected Mode and VM86, we need to make 2569 2554 * sure to update the mode before loading the segment descriptors so 2570 2555 * that the selectors are interpreted correctly. 2571 - * 2572 - * Need to get rflags to the vcpu struct immediately because it 2573 - * influences the CPL which is checked at least when loading the segment 2574 - * descriptors and when pushing an error code to the new kernel stack. 
2575 - * 2576 - * TODO Introduce a separate ctxt->ops->set_cpl callback 2577 2556 */ 2578 - if (ctxt->eflags & X86_EFLAGS_VM) 2557 + if (ctxt->eflags & X86_EFLAGS_VM) { 2579 2558 ctxt->mode = X86EMUL_MODE_VM86; 2580 - else 2559 + cpl = 3; 2560 + } else { 2581 2561 ctxt->mode = X86EMUL_MODE_PROT32; 2582 - 2583 - ctxt->ops->set_rflags(ctxt, ctxt->eflags); 2562 + cpl = tss->cs & 3; 2563 + } 2584 2564 2585 2565 /* 2586 2566 * Now load segment descriptors. If fault happenes at this stage 2587 2567 * it is handled in a context of new task 2588 2568 */ 2589 - ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); 2569 + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); 2590 2570 if (ret != X86EMUL_CONTINUE) 2591 2571 return ret; 2592 - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); 2572 + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); 2593 2573 if (ret != X86EMUL_CONTINUE) 2594 2574 return ret; 2595 - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); 2575 + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); 2596 2576 if (ret != X86EMUL_CONTINUE) 2597 2577 return ret; 2598 - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); 2578 + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); 2599 2579 if (ret != X86EMUL_CONTINUE) 2600 2580 return ret; 2601 - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); 2581 + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); 2602 2582 if (ret != X86EMUL_CONTINUE) 2603 2583 return ret; 2604 - ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); 2584 + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); 2605 2585 if (ret != X86EMUL_CONTINUE) 2606 2586 return ret; 2607 - ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); 2587 + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); 2608 2588 if (ret != 
X86EMUL_CONTINUE) 2609 2589 return ret; 2610 2590 ··· 2614 2604 struct tss_segment_32 tss_seg; 2615 2605 int ret; 2616 2606 u32 new_tss_base = get_desc_base(new_desc); 2607 + u32 eip_offset = offsetof(struct tss_segment_32, eip); 2608 + u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector); 2617 2609 2618 2610 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2619 2611 &ctxt->exception); ··· 2625 2613 2626 2614 save_state_to_tss32(ctxt, &tss_seg); 2627 2615 2628 - ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, 2629 - &ctxt->exception); 2616 + /* Only GP registers and segment selectors are saved */ 2617 + ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, 2618 + ldt_sel_offset - eip_offset, &ctxt->exception); 2630 2619 if (ret != X86EMUL_CONTINUE) 2631 2620 /* FIXME: need to provide precise fault address */ 2632 2621 return ret; ··· 3399 3386 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); 3400 3387 if (efer & EFER_LMA) 3401 3388 rsvd = CR3_L_MODE_RESERVED_BITS; 3402 - else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) 3403 - rsvd = CR3_PAE_RESERVED_BITS; 3404 - else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) 3405 - rsvd = CR3_NONPAE_RESERVED_BITS; 3406 3389 3407 3390 if (new_val & rsvd) 3408 3391 return emulate_gp(ctxt, 0); ··· 3878 3869 N, N, N, N, N, N, N, N, 3879 3870 D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), 3880 3871 /* 0x20 - 0x2F */ 3881 - DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), 3882 - DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), 3883 - IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), 3884 - IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), 3872 + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), 3873 + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), 3874 + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, 3875 + 
check_cr_write), 3876 + IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, 3877 + check_dr_write), 3885 3878 N, N, N, N, 3886 3879 GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), 3887 3880 GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
+1
arch/x86/kvm/irq.c
··· 113 113 114 114 return kvm_get_apic_interrupt(v); /* APIC */ 115 115 } 116 + EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); 116 117 117 118 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 118 119 {
+43 -19
arch/x86/kvm/lapic.c
··· 360 360 361 361 static inline void apic_set_isr(int vec, struct kvm_lapic *apic) 362 362 { 363 + /* Note that we never get here with APIC virtualization enabled. */ 364 + 363 365 if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) 364 366 ++apic->isr_count; 365 367 BUG_ON(apic->isr_count > MAX_APIC_VECTOR); ··· 373 371 apic->highest_isr_cache = vec; 374 372 } 375 373 374 + static inline int apic_find_highest_isr(struct kvm_lapic *apic) 375 + { 376 + int result; 377 + 378 + /* 379 + * Note that isr_count is always 1, and highest_isr_cache 380 + * is always -1, with APIC virtualization enabled. 381 + */ 382 + if (!apic->isr_count) 383 + return -1; 384 + if (likely(apic->highest_isr_cache != -1)) 385 + return apic->highest_isr_cache; 386 + 387 + result = find_highest_vector(apic->regs + APIC_ISR); 388 + ASSERT(result == -1 || result >= 16); 389 + 390 + return result; 391 + } 392 + 376 393 static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) 377 394 { 378 - if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) 395 + struct kvm_vcpu *vcpu; 396 + if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) 397 + return; 398 + 399 + vcpu = apic->vcpu; 400 + 401 + /* 402 + * We do get here for APIC virtualization enabled if the guest 403 + * uses the Hyper-V APIC enlightenment. In this case we may need 404 + * to trigger a new interrupt delivery by writing the SVI field; 405 + * on the other hand isr_count and highest_isr_cache are unused 406 + * and must be left alone. 
407 + */ 408 + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) 409 + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, 410 + apic_find_highest_isr(apic)); 411 + else { 379 412 --apic->isr_count; 380 - BUG_ON(apic->isr_count < 0); 381 - apic->highest_isr_cache = -1; 413 + BUG_ON(apic->isr_count < 0); 414 + apic->highest_isr_cache = -1; 415 + } 382 416 } 383 417 384 418 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) ··· 492 454 return; 493 455 } 494 456 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 495 - } 496 - 497 - static inline int apic_find_highest_isr(struct kvm_lapic *apic) 498 - { 499 - int result; 500 - 501 - /* Note that isr_count is always 1 with vid enabled */ 502 - if (!apic->isr_count) 503 - return -1; 504 - if (likely(apic->highest_isr_cache != -1)) 505 - return apic->highest_isr_cache; 506 - 507 - result = find_highest_vector(apic->regs + APIC_ISR); 508 - ASSERT(result == -1 || result >= 16); 509 - 510 - return result; 511 457 } 512 458 513 459 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) ··· 1626 1604 { 1627 1605 int vector = kvm_apic_has_interrupt(vcpu); 1628 1606 struct kvm_lapic *apic = vcpu->arch.apic; 1607 + 1608 + /* Note that we never get here with APIC virtualization enabled. */ 1629 1609 1630 1610 if (vector == -1) 1631 1611 return -1;
+55 -29
arch/x86/kvm/mmu.c
··· 22 22 #include "mmu.h" 23 23 #include "x86.h" 24 24 #include "kvm_cache_regs.h" 25 + #include "cpuid.h" 25 26 26 27 #include <linux/kvm_host.h> 27 28 #include <linux/types.h> ··· 596 595 * we always atomicly update it, see the comments in 597 596 * spte_has_volatile_bits(). 598 597 */ 599 - if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) 598 + if (spte_is_locklessly_modifiable(old_spte) && 599 + !is_writable_pte(new_spte)) 600 600 ret = true; 601 601 602 602 if (!shadow_accessed_mask) ··· 1178 1176 1179 1177 /* 1180 1178 * Write-protect on the specified @sptep, @pt_protect indicates whether 1181 - * spte writ-protection is caused by protecting shadow page table. 1182 - * @flush indicates whether tlb need be flushed. 1179 + * spte write-protection is caused by protecting shadow page table. 1183 1180 * 1184 1181 * Note: write protection is difference between drity logging and spte 1185 1182 * protection: ··· 1187 1186 * - for spte protection, the spte can be writable only after unsync-ing 1188 1187 * shadow page. 1189 1188 * 1190 - * Return true if the spte is dropped. 1189 + * Return true if tlb need be flushed. 
1191 1190 */ 1192 - static bool 1193 - spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) 1191 + static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) 1194 1192 { 1195 1193 u64 spte = *sptep; 1196 1194 ··· 1199 1199 1200 1200 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1201 1201 1202 - if (__drop_large_spte(kvm, sptep)) { 1203 - *flush |= true; 1204 - return true; 1205 - } 1206 - 1207 1202 if (pt_protect) 1208 1203 spte &= ~SPTE_MMU_WRITEABLE; 1209 1204 spte = spte & ~PT_WRITABLE_MASK; 1210 1205 1211 - *flush |= mmu_spte_update(sptep, spte); 1212 - return false; 1206 + return mmu_spte_update(sptep, spte); 1213 1207 } 1214 1208 1215 1209 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, ··· 1215 1221 1216 1222 for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { 1217 1223 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1218 - if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { 1219 - sptep = rmap_get_first(*rmapp, &iter); 1220 - continue; 1221 - } 1222 1224 1225 + flush |= spte_write_protect(kvm, sptep, pt_protect); 1223 1226 sptep = rmap_get_next(&iter); 1224 1227 } 1225 1228 ··· 2793 2802 } 2794 2803 2795 2804 static bool 2796 - fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) 2805 + fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 2806 + u64 *sptep, u64 spte) 2797 2807 { 2798 - struct kvm_mmu_page *sp = page_header(__pa(sptep)); 2799 2808 gfn_t gfn; 2800 2809 2801 2810 WARN_ON(!sp->role.direct); ··· 2821 2830 u32 error_code) 2822 2831 { 2823 2832 struct kvm_shadow_walk_iterator iterator; 2833 + struct kvm_mmu_page *sp; 2824 2834 bool ret = false; 2825 2835 u64 spte = 0ull; 2826 2836 ··· 2845 2853 goto exit; 2846 2854 } 2847 2855 2848 - if (!is_last_spte(spte, level)) 2856 + sp = page_header(__pa(iterator.sptep)); 2857 + if (!is_last_spte(spte, sp->role.level)) 2849 2858 goto exit; 2850 2859 2851 2860 /* ··· 2868 2875 goto exit; 2869 
2876 2870 2877 /* 2878 + * Do not fix write-permission on the large spte since we only dirty 2879 + * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() 2880 + * that means other pages are missed if its slot is dirty-logged. 2881 + * 2882 + * Instead, we let the slow page fault path create a normal spte to 2883 + * fix the access. 2884 + * 2885 + * See the comments in kvm_arch_commit_memory_region(). 2886 + */ 2887 + if (sp->role.level > PT_PAGE_TABLE_LEVEL) 2888 + goto exit; 2889 + 2890 + /* 2871 2891 * Currently, fast page fault only works for direct mapping since 2872 2892 * the gfn is not stable for indirect shadow page. 2873 2893 * See Documentation/virtual/kvm/locking.txt to get more detail. 2874 2894 */ 2875 - ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); 2895 + ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); 2876 2896 exit: 2877 2897 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, 2878 2898 spte, ret); ··· 3517 3511 { 3518 3512 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3519 3513 u64 exb_bit_rsvd = 0; 3514 + u64 gbpages_bit_rsvd = 0; 3520 3515 3521 3516 context->bad_mt_xwr = 0; 3522 3517 3523 3518 if (!context->nx) 3524 3519 exb_bit_rsvd = rsvd_bits(63, 63); 3520 + if (!guest_cpuid_has_gbpages(vcpu)) 3521 + gbpages_bit_rsvd = rsvd_bits(7, 7); 3525 3522 switch (context->root_level) { 3526 3523 case PT32_ROOT_LEVEL: 3527 3524 /* no rsvd bits for 2 level 4K page table entries */ ··· 3547 3538 case PT32E_ROOT_LEVEL: 3548 3539 context->rsvd_bits_mask[0][2] = 3549 3540 rsvd_bits(maxphyaddr, 63) | 3550 - rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ 3541 + rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ 3551 3542 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3552 3543 rsvd_bits(maxphyaddr, 62); /* PDE */ 3553 3544 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | ··· 3559 3550 break; 3560 3551 case PT64_ROOT_LEVEL: 3561 3552 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 3562 - rsvd_bits(maxphyaddr, 51) | 
rsvd_bits(7, 8); 3553 + rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); 3563 3554 context->rsvd_bits_mask[0][2] = exb_bit_rsvd | 3564 - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); 3555 + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); 3565 3556 context->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3566 3557 rsvd_bits(maxphyaddr, 51); 3567 3558 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 3568 3559 rsvd_bits(maxphyaddr, 51); 3569 3560 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; 3570 3561 context->rsvd_bits_mask[1][2] = exb_bit_rsvd | 3571 - rsvd_bits(maxphyaddr, 51) | 3562 + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | 3572 3563 rsvd_bits(13, 29); 3573 3564 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 3574 3565 rsvd_bits(maxphyaddr, 51) | ··· 4313 4304 if (*rmapp) 4314 4305 __rmap_write_protect(kvm, rmapp, false); 4315 4306 4316 - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { 4317 - kvm_flush_remote_tlbs(kvm); 4307 + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) 4318 4308 cond_resched_lock(&kvm->mmu_lock); 4319 - } 4320 4309 } 4321 4310 } 4322 4311 4323 - kvm_flush_remote_tlbs(kvm); 4324 4312 spin_unlock(&kvm->mmu_lock); 4313 + 4314 + /* 4315 + * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() 4316 + * which do tlb flush out of mmu-lock should be serialized by 4317 + * kvm->slots_lock otherwise tlb flush would be missed. 
4318 + */ 4319 + lockdep_assert_held(&kvm->slots_lock); 4320 + 4321 + /* 4322 + * We can flush all the TLBs out of the mmu lock without TLB 4323 + * corruption since we just change the spte from writable to 4324 + * readonly so that we only need to care the case of changing 4325 + * spte from present to present (changing the spte from present 4326 + * to nonpresent will flush all the TLBs immediately), in other 4327 + * words, the only case we care is mmu_spte_update() where we 4328 + * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE 4329 + * instead of PT_WRITABLE_MASK, that means it does not depend 4330 + * on PT_WRITABLE_MASK anymore. 4331 + */ 4332 + kvm_flush_remote_tlbs(kvm); 4325 4333 } 4326 4334 4327 4335 #define BATCH_ZAP_PAGES 10
+33
arch/x86/kvm/mmu.h
··· 104 104 return pte & PT_PRESENT_MASK; 105 105 } 106 106 107 + /* 108 + * Currently, we have two sorts of write-protection, a) the first one 109 + * write-protects guest page to sync the guest modification, b) another one is 110 + * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences 111 + * between these two sorts are: 112 + * 1) the first case clears SPTE_MMU_WRITEABLE bit. 113 + * 2) the first case requires flushing tlb immediately avoiding corrupting 114 + * shadow page table between all vcpus so it should be in the protection of 115 + * mmu-lock. And the another case does not need to flush tlb until returning 116 + * the dirty bitmap to userspace since it only write-protects the page 117 + * logged in the bitmap, that means the page in the dirty bitmap is not 118 + * missed, so it can flush tlb out of mmu-lock. 119 + * 120 + * So, there is the problem: the first case can meet the corrupted tlb caused 121 + * by another case which write-protects pages but without flush tlb 122 + * immediately. In order to making the first case be aware this problem we let 123 + * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit 124 + * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. 125 + * 126 + * Anyway, whenever a spte is updated (only permission and status bits are 127 + * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes 128 + * readonly, if that happens, we need to flush tlb. Fortunately, 129 + * mmu_spte_update() has already handled it perfectly. 130 + * 131 + * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: 132 + * - if we want to see if it has writable tlb entry or if the spte can be 133 + * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most 134 + * case, otherwise 135 + * - if we fix page fault on the spte or do write-protection by dirty logging, 136 + * check PT_WRITABLE_MASK. 
137 + * 138 + * TODO: introduce APIs to split these two cases. 139 + */ 107 140 static inline int is_writable_pte(unsigned long pte) 108 141 { 109 142 return pte & PT_WRITABLE_MASK;
+3 -4
arch/x86/kvm/paging_tmpl.h
··· 913 913 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't 914 914 * used by guest then tlbs are not flushed, so guest is allowed to access the 915 915 * freed pages. 916 - * We set tlbs_dirty to let the notifier know this change and delay the flush 917 - * until such a case actually happens. 916 + * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. 918 917 */ 919 918 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 920 919 { ··· 942 943 return -EINVAL; 943 944 944 945 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 945 - vcpu->kvm->tlbs_dirty = true; 946 + vcpu->kvm->tlbs_dirty++; 946 947 continue; 947 948 } 948 949 ··· 957 958 958 959 if (gfn != sp->gfns[i]) { 959 960 drop_spte(vcpu->kvm, &sp->spt[i]); 960 - vcpu->kvm->tlbs_dirty = true; 961 + vcpu->kvm->tlbs_dirty++; 961 962 continue; 962 963 } 963 964
+5 -2
arch/x86/kvm/pmu.c
··· 108 108 { 109 109 struct kvm_pmc *pmc = perf_event->overflow_handler_context; 110 110 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; 111 - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); 111 + if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { 112 + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); 113 + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); 114 + } 112 115 } 113 116 114 117 static void kvm_perf_overflow_intr(struct perf_event *perf_event, ··· 120 117 struct kvm_pmc *pmc = perf_event->overflow_handler_context; 121 118 struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; 122 119 if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { 123 - kvm_perf_overflow(perf_event, data, regs); 120 + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); 124 121 kvm_make_request(KVM_REQ_PMU, pmc->vcpu); 125 122 /* 126 123 * Inject PMI. If vcpu was in a guest mode during NMI PMI
+34 -29
arch/x86/kvm/svm.c
··· 1338 1338 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1339 1339 } 1340 1340 1341 - static void svm_update_cpl(struct kvm_vcpu *vcpu) 1342 - { 1343 - struct vcpu_svm *svm = to_svm(vcpu); 1344 - int cpl; 1345 - 1346 - if (!is_protmode(vcpu)) 1347 - cpl = 0; 1348 - else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) 1349 - cpl = 3; 1350 - else 1351 - cpl = svm->vmcb->save.cs.selector & 0x3; 1352 - 1353 - svm->vmcb->save.cpl = cpl; 1354 - } 1355 - 1356 1341 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1357 1342 { 1358 1343 return to_svm(vcpu)->vmcb->save.rflags; ··· 1345 1360 1346 1361 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1347 1362 { 1348 - unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; 1349 - 1363 + /* 1364 + * Any change of EFLAGS.VM is accompained by a reload of SS 1365 + * (caused by either a task switch or an inter-privilege IRET), 1366 + * so we do not need to update the CPL here. 1367 + */ 1350 1368 to_svm(vcpu)->vmcb->save.rflags = rflags; 1351 - if ((old_rflags ^ rflags) & X86_EFLAGS_VM) 1352 - svm_update_cpl(vcpu); 1353 1369 } 1354 1370 1355 1371 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) ··· 1617 1631 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; 1618 1632 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 1619 1633 } 1620 - if (seg == VCPU_SREG_CS) 1621 - svm_update_cpl(vcpu); 1634 + 1635 + /* 1636 + * This is always accurate, except if SYSRET returned to a segment 1637 + * with SS.DPL != 3. Intel does not have this quirk, and always 1638 + * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it 1639 + * would entail passing the CPL to userspace and back. 
1640 + */ 1641 + if (seg == VCPU_SREG_SS) 1642 + svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; 1622 1643 1623 1644 mark_dirty(svm->vmcb, VMCB_SEG); 1624 1645 } ··· 2763 2770 return 1; 2764 2771 } 2765 2772 2766 - static int invalid_op_interception(struct vcpu_svm *svm) 2767 - { 2768 - kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2769 - return 1; 2770 - } 2771 - 2772 2773 static int task_switch_interception(struct vcpu_svm *svm) 2773 2774 { 2774 2775 u16 tss_selector; ··· 3274 3287 return 1; 3275 3288 } 3276 3289 3290 + static int nop_interception(struct vcpu_svm *svm) 3291 + { 3292 + skip_emulated_instruction(&(svm->vcpu)); 3293 + return 1; 3294 + } 3295 + 3296 + static int monitor_interception(struct vcpu_svm *svm) 3297 + { 3298 + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 3299 + return nop_interception(svm); 3300 + } 3301 + 3302 + static int mwait_interception(struct vcpu_svm *svm) 3303 + { 3304 + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 3305 + return nop_interception(svm); 3306 + } 3307 + 3277 3308 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { 3278 3309 [SVM_EXIT_READ_CR0] = cr_interception, 3279 3310 [SVM_EXIT_READ_CR3] = cr_interception, ··· 3349 3344 [SVM_EXIT_CLGI] = clgi_interception, 3350 3345 [SVM_EXIT_SKINIT] = skinit_interception, 3351 3346 [SVM_EXIT_WBINVD] = emulate_on_interception, 3352 - [SVM_EXIT_MONITOR] = invalid_op_interception, 3353 - [SVM_EXIT_MWAIT] = invalid_op_interception, 3347 + [SVM_EXIT_MONITOR] = monitor_interception, 3348 + [SVM_EXIT_MWAIT] = mwait_interception, 3354 3349 [SVM_EXIT_XSETBV] = xsetbv_interception, 3355 3350 [SVM_EXIT_NPF] = pf_interception, 3356 3351 };
+16 -4
arch/x86/kvm/trace.h
··· 91 91 /* 92 92 * Tracepoint for PIO. 93 93 */ 94 + 95 + #define KVM_PIO_IN 0 96 + #define KVM_PIO_OUT 1 97 + 94 98 TRACE_EVENT(kvm_pio, 95 99 TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, 96 - unsigned int count), 97 - TP_ARGS(rw, port, size, count), 100 + unsigned int count, void *data), 101 + TP_ARGS(rw, port, size, count, data), 98 102 99 103 TP_STRUCT__entry( 100 104 __field( unsigned int, rw ) 101 105 __field( unsigned int, port ) 102 106 __field( unsigned int, size ) 103 107 __field( unsigned int, count ) 108 + __field( unsigned int, val ) 104 109 ), 105 110 106 111 TP_fast_assign( ··· 113 108 __entry->port = port; 114 109 __entry->size = size; 115 110 __entry->count = count; 111 + if (size == 1) 112 + __entry->val = *(unsigned char *)data; 113 + else if (size == 2) 114 + __entry->val = *(unsigned short *)data; 115 + else 116 + __entry->val = *(unsigned int *)data; 116 117 ), 117 118 118 - TP_printk("pio_%s at 0x%x size %d count %d", 119 + TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", 119 120 __entry->rw ? "write" : "read", 120 - __entry->port, __entry->size, __entry->count) 121 + __entry->port, __entry->size, __entry->count, __entry->val, 122 + __entry->count > 1 ? "(...)" : "") 121 123 ); 122 124 123 125 /*
+210 -123
arch/x86/kvm/vmx.c
··· 354 354 struct nested_vmx { 355 355 /* Has the level1 guest done vmxon? */ 356 356 bool vmxon; 357 + gpa_t vmxon_ptr; 357 358 358 359 /* The guest-physical address of the current VMCS L1 keeps for L2 */ 359 360 gpa_t current_vmptr; ··· 414 413 struct kvm_vcpu vcpu; 415 414 unsigned long host_rsp; 416 415 u8 fail; 417 - u8 cpl; 418 416 bool nmi_known_unmasked; 419 417 u32 exit_intr_info; 420 418 u32 idt_vectoring_info; ··· 2283 2283 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2284 2284 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); 2285 2285 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2286 - /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2286 + 2287 2287 nested_vmx_exit_ctls_high &= 2288 2288 #ifdef CONFIG_X86_64 2289 2289 VM_EXIT_HOST_ADDR_SPACE_SIZE | ··· 2291 2291 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2292 2292 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2293 2293 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2294 - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; 2294 + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2295 + 2295 2296 if (vmx_mpx_supported()) 2296 2297 nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2297 2298 ··· 2354 2353 VMX_EPT_INVEPT_BIT; 2355 2354 nested_vmx_ept_caps &= vmx_capability.ept; 2356 2355 /* 2357 - * Since invept is completely emulated we support both global 2358 - * and context invalidation independent of what host cpu 2359 - * supports 2356 + * For nested guests, we don't do anything specific 2357 + * for single context invalidation. Hence, only advertise 2358 + * support for global context invalidation. 
2360 2359 */ 2361 - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 2362 - VMX_EPT_EXTENT_CONTEXT_BIT; 2360 + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2363 2361 } else 2364 2362 nested_vmx_ept_caps = 0; 2365 2363 ··· 3186 3186 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3187 3187 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3188 3188 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3189 - 3190 - /* CPL is always 0 when CPU enters protected mode */ 3191 - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3192 - vmx->cpl = 0; 3193 3189 } 3194 3190 3195 3191 static void fix_rmode_seg(int seg, struct kvm_segment *save) ··· 3587 3591 { 3588 3592 struct vcpu_vmx *vmx = to_vmx(vcpu); 3589 3593 3590 - if (!is_protmode(vcpu)) 3594 + if (unlikely(vmx->rmode.vm86_active)) 3591 3595 return 0; 3592 - 3593 - if (!is_long_mode(vcpu) 3594 - && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ 3595 - return 3; 3596 - 3597 - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3598 - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3599 - vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; 3596 + else { 3597 + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3598 + return AR_DPL(ar); 3600 3599 } 3601 - 3602 - return vmx->cpl; 3603 3600 } 3604 - 3605 3601 3606 3602 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3607 3603 { ··· 3622 3634 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3623 3635 3624 3636 vmx_segment_cache_clear(vmx); 3625 - if (seg == VCPU_SREG_CS) 3626 - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3627 3637 3628 3638 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3629 3639 vmx->rmode.segs[seg] = *var; ··· 4550 4564 PIN_BASED_EXT_INTR_MASK; 4551 4565 } 4552 4566 4567 + /* 4568 + * In nested virtualization, check if L1 has set 4569 + * VM_EXIT_ACK_INTR_ON_EXIT 4570 + */ 4571 
+ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 4572 + { 4573 + return get_vmcs12(vcpu)->vm_exit_controls & 4574 + VM_EXIT_ACK_INTR_ON_EXIT; 4575 + } 4576 + 4553 4577 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 4554 4578 { 4555 4579 return get_vmcs12(vcpu)->pin_based_vm_exec_control & ··· 4874 4878 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 4875 4879 vcpu->arch.dr6 &= ~15; 4876 4880 vcpu->arch.dr6 |= dr6; 4881 + if (!(dr6 & ~DR6_RESERVED)) /* icebp */ 4882 + skip_emulated_instruction(vcpu); 4883 + 4877 4884 kvm_queue_exception(vcpu, DB_VECTOR); 4878 4885 return 1; 4879 4886 } ··· 5165 5166 return 1; 5166 5167 kvm_register_write(vcpu, reg, val); 5167 5168 } else 5168 - if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg])) 5169 + if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) 5169 5170 return 1; 5170 5171 5171 5172 skip_emulated_instruction(vcpu); ··· 5438 5439 } 5439 5440 5440 5441 /* clear all local breakpoint enable flags */ 5441 - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); 5442 + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); 5442 5443 5443 5444 /* 5444 5445 * TODO: What about debug traps on tss switch? 
··· 5564 5565 gpa_t gpa; 5565 5566 5566 5567 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5568 + if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5569 + skip_emulated_instruction(vcpu); 5570 + return 1; 5571 + } 5567 5572 5568 5573 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5569 5574 if (likely(ret == RET_MMIO_PF_EMULATE)) ··· 5672 5669 return 1; 5673 5670 } 5674 5671 5675 - static int handle_invalid_op(struct kvm_vcpu *vcpu) 5672 + static int handle_nop(struct kvm_vcpu *vcpu) 5676 5673 { 5677 - kvm_queue_exception(vcpu, UD_VECTOR); 5674 + skip_emulated_instruction(vcpu); 5678 5675 return 1; 5676 + } 5677 + 5678 + static int handle_mwait(struct kvm_vcpu *vcpu) 5679 + { 5680 + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 5681 + return handle_nop(vcpu); 5682 + } 5683 + 5684 + static int handle_monitor(struct kvm_vcpu *vcpu) 5685 + { 5686 + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 5687 + return handle_nop(vcpu); 5679 5688 } 5680 5689 5681 5690 /* ··· 5827 5812 } 5828 5813 5829 5814 /* 5815 + * Decode the memory-address operand of a vmx instruction, as recorded on an 5816 + * exit caused by such an instruction (run by a guest hypervisor). 5817 + * On success, returns 0. When the operand is invalid, returns 1 and throws 5818 + * #UD or #GP. 5819 + */ 5820 + static int get_vmx_mem_address(struct kvm_vcpu *vcpu, 5821 + unsigned long exit_qualification, 5822 + u32 vmx_instruction_info, gva_t *ret) 5823 + { 5824 + /* 5825 + * According to Vol. 3B, "Information for VM Exits Due to Instruction 5826 + * Execution", on an exit, vmx_instruction_info holds most of the 5827 + * addressing components of the operand. Only the displacement part 5828 + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5829 + * For how an actual address is calculated from all these components, 5830 + * refer to Vol. 1, "Operand Addressing". 
5831 + */ 5832 + int scaling = vmx_instruction_info & 3; 5833 + int addr_size = (vmx_instruction_info >> 7) & 7; 5834 + bool is_reg = vmx_instruction_info & (1u << 10); 5835 + int seg_reg = (vmx_instruction_info >> 15) & 7; 5836 + int index_reg = (vmx_instruction_info >> 18) & 0xf; 5837 + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5838 + int base_reg = (vmx_instruction_info >> 23) & 0xf; 5839 + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5840 + 5841 + if (is_reg) { 5842 + kvm_queue_exception(vcpu, UD_VECTOR); 5843 + return 1; 5844 + } 5845 + 5846 + /* Addr = segment_base + offset */ 5847 + /* offset = base + [index * scale] + displacement */ 5848 + *ret = vmx_get_segment_base(vcpu, seg_reg); 5849 + if (base_is_valid) 5850 + *ret += kvm_register_read(vcpu, base_reg); 5851 + if (index_is_valid) 5852 + *ret += kvm_register_read(vcpu, index_reg)<<scaling; 5853 + *ret += exit_qualification; /* holds the displacement */ 5854 + 5855 + if (addr_size == 1) /* 32 bit */ 5856 + *ret &= 0xffffffff; 5857 + 5858 + /* 5859 + * TODO: throw #GP (and return 1) in various cases that the VM* 5860 + * instructions require it - e.g., offset beyond segment limit, 5861 + * unusable or unreadable/unwritable segment, non-canonical 64-bit 5862 + * address, and so on. Currently these are not checked. 
5863 + */ 5864 + return 0; 5865 + } 5866 + 5867 + /* 5868 + * This function performs the various checks including 5869 + * - if it's 4KB aligned 5870 + * - No bits beyond the physical address width are set 5871 + * - Returns 0 on success or else 1 5872 + * (Intel SDM Section 30.3) 5873 + */ 5874 + static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, 5875 + gpa_t *vmpointer) 5876 + { 5877 + gva_t gva; 5878 + gpa_t vmptr; 5879 + struct x86_exception e; 5880 + struct page *page; 5881 + struct vcpu_vmx *vmx = to_vmx(vcpu); 5882 + int maxphyaddr = cpuid_maxphyaddr(vcpu); 5883 + 5884 + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 5885 + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) 5886 + return 1; 5887 + 5888 + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, 5889 + sizeof(vmptr), &e)) { 5890 + kvm_inject_page_fault(vcpu, &e); 5891 + return 1; 5892 + } 5893 + 5894 + switch (exit_reason) { 5895 + case EXIT_REASON_VMON: 5896 + /* 5897 + * SDM 3: 24.11.5 5898 + * The first 4 bytes of VMXON region contain the supported 5899 + * VMCS revision identifier 5900 + * 5901 + * Note - IA32_VMX_BASIC[48] will never be 1 5902 + * for the nested case; 5903 + * which replaces physical address width with 32 5904 + * 5905 + */ 5906 + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { 5907 + nested_vmx_failInvalid(vcpu); 5908 + skip_emulated_instruction(vcpu); 5909 + return 1; 5910 + } 5911 + 5912 + page = nested_get_page(vcpu, vmptr); 5913 + if (page == NULL || 5914 + *(u32 *)kmap(page) != VMCS12_REVISION) { 5915 + nested_vmx_failInvalid(vcpu); 5916 + kunmap(page); 5917 + skip_emulated_instruction(vcpu); 5918 + return 1; 5919 + } 5920 + kunmap(page); 5921 + vmx->nested.vmxon_ptr = vmptr; 5922 + break; 5923 + case EXIT_REASON_VMCLEAR: 5924 + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { 5925 + nested_vmx_failValid(vcpu, 5926 + VMXERR_VMCLEAR_INVALID_ADDRESS); 5927 + skip_emulated_instruction(vcpu); 5928 + return 1; 
5929 + } 5930 + 5931 + if (vmptr == vmx->nested.vmxon_ptr) { 5932 + nested_vmx_failValid(vcpu, 5933 + VMXERR_VMCLEAR_VMXON_POINTER); 5934 + skip_emulated_instruction(vcpu); 5935 + return 1; 5936 + } 5937 + break; 5938 + case EXIT_REASON_VMPTRLD: 5939 + if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { 5940 + nested_vmx_failValid(vcpu, 5941 + VMXERR_VMPTRLD_INVALID_ADDRESS); 5942 + skip_emulated_instruction(vcpu); 5943 + return 1; 5944 + } 5945 + 5946 + if (vmptr == vmx->nested.vmxon_ptr) { 5947 + nested_vmx_failValid(vcpu, 5948 + VMXERR_VMCLEAR_VMXON_POINTER); 5949 + skip_emulated_instruction(vcpu); 5950 + return 1; 5951 + } 5952 + break; 5953 + default: 5954 + return 1; /* shouldn't happen */ 5955 + } 5956 + 5957 + if (vmpointer) 5958 + *vmpointer = vmptr; 5959 + return 0; 5960 + } 5961 + 5962 + /* 5830 5963 * Emulate the VMXON instruction. 5831 5964 * Currently, we just remember that VMX is active, and do not save or even 5832 5965 * inspect the argument to VMXON (the so-called "VMXON pointer") because we ··· 6012 5849 kvm_inject_gp(vcpu, 0); 6013 5850 return 1; 6014 5851 } 5852 + 5853 + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) 5854 + return 1; 5855 + 6015 5856 if (vmx->nested.vmxon) { 6016 5857 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 6017 5858 skip_emulated_instruction(vcpu); ··· 6138 5971 return 1; 6139 5972 } 6140 5973 6141 - /* 6142 - * Decode the memory-address operand of a vmx instruction, as recorded on an 6143 - * exit caused by such an instruction (run by a guest hypervisor). 6144 - * On success, returns 0. When the operand is invalid, returns 1 and throws 6145 - * #UD or #GP. 6146 - */ 6147 - static int get_vmx_mem_address(struct kvm_vcpu *vcpu, 6148 - unsigned long exit_qualification, 6149 - u32 vmx_instruction_info, gva_t *ret) 6150 - { 6151 - /* 6152 - * According to Vol. 
3B, "Information for VM Exits Due to Instruction 6153 - * Execution", on an exit, vmx_instruction_info holds most of the 6154 - * addressing components of the operand. Only the displacement part 6155 - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 6156 - * For how an actual address is calculated from all these components, 6157 - * refer to Vol. 1, "Operand Addressing". 6158 - */ 6159 - int scaling = vmx_instruction_info & 3; 6160 - int addr_size = (vmx_instruction_info >> 7) & 7; 6161 - bool is_reg = vmx_instruction_info & (1u << 10); 6162 - int seg_reg = (vmx_instruction_info >> 15) & 7; 6163 - int index_reg = (vmx_instruction_info >> 18) & 0xf; 6164 - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 6165 - int base_reg = (vmx_instruction_info >> 23) & 0xf; 6166 - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 6167 - 6168 - if (is_reg) { 6169 - kvm_queue_exception(vcpu, UD_VECTOR); 6170 - return 1; 6171 - } 6172 - 6173 - /* Addr = segment_base + offset */ 6174 - /* offset = base + [index * scale] + displacement */ 6175 - *ret = vmx_get_segment_base(vcpu, seg_reg); 6176 - if (base_is_valid) 6177 - *ret += kvm_register_read(vcpu, base_reg); 6178 - if (index_is_valid) 6179 - *ret += kvm_register_read(vcpu, index_reg)<<scaling; 6180 - *ret += exit_qualification; /* holds the displacement */ 6181 - 6182 - if (addr_size == 1) /* 32 bit */ 6183 - *ret &= 0xffffffff; 6184 - 6185 - /* 6186 - * TODO: throw #GP (and return 1) in various cases that the VM* 6187 - * instructions require it - e.g., offset beyond segment limit, 6188 - * unusable or unreadable/unwritable segment, non-canonical 64-bit 6189 - * address, and so on. Currently these are not checked. 
6190 - */ 6191 - return 0; 6192 - } 6193 - 6194 5974 /* Emulate the VMCLEAR instruction */ 6195 5975 static int handle_vmclear(struct kvm_vcpu *vcpu) 6196 5976 { 6197 5977 struct vcpu_vmx *vmx = to_vmx(vcpu); 6198 - gva_t gva; 6199 5978 gpa_t vmptr; 6200 5979 struct vmcs12 *vmcs12; 6201 5980 struct page *page; 6202 - struct x86_exception e; 6203 5981 6204 5982 if (!nested_vmx_check_permission(vcpu)) 6205 5983 return 1; 6206 5984 6207 - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6208 - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) 5985 + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) 6209 5986 return 1; 6210 - 6211 - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, 6212 - sizeof(vmptr), &e)) { 6213 - kvm_inject_page_fault(vcpu, &e); 6214 - return 1; 6215 - } 6216 - 6217 - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { 6218 - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 6219 - skip_emulated_instruction(vcpu); 6220 - return 1; 6221 - } 6222 5987 6223 5988 if (vmptr == vmx->nested.current_vmptr) { 6224 5989 nested_release_vmcs12(vmx); ··· 6471 6372 static int handle_vmptrld(struct kvm_vcpu *vcpu) 6472 6373 { 6473 6374 struct vcpu_vmx *vmx = to_vmx(vcpu); 6474 - gva_t gva; 6475 6375 gpa_t vmptr; 6476 - struct x86_exception e; 6477 6376 u32 exec_control; 6478 6377 6479 6378 if (!nested_vmx_check_permission(vcpu)) 6480 6379 return 1; 6481 6380 6482 - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6483 - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) 6381 + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) 6484 6382 return 1; 6485 - 6486 - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, 6487 - sizeof(vmptr), &e)) { 6488 - kvm_inject_page_fault(vcpu, &e); 6489 - return 1; 6490 - } 6491 - 6492 - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { 6493 - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 6494 - skip_emulated_instruction(vcpu); 6495 - return 1; 6496 - } 6497 6383 6498 6384 if 
(vmx->nested.current_vmptr != vmptr) { 6499 6385 struct vmcs12 *new_vmcs12; ··· 6555 6471 struct { 6556 6472 u64 eptp, gpa; 6557 6473 } operand; 6558 - u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; 6559 6474 6560 6475 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || 6561 6476 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { ··· 6594 6511 } 6595 6512 6596 6513 switch (type) { 6597 - case VMX_EPT_EXTENT_CONTEXT: 6598 - if ((operand.eptp & eptp_mask) != 6599 - (nested_ept_get_cr3(vcpu) & eptp_mask)) 6600 - break; 6601 6514 case VMX_EPT_EXTENT_GLOBAL: 6602 6515 kvm_mmu_sync_roots(vcpu); 6603 6516 kvm_mmu_flush_tlb(vcpu); 6604 6517 nested_vmx_succeed(vcpu); 6605 6518 break; 6606 6519 default: 6520 + /* Trap single context invalidation invept calls */ 6607 6521 BUG_ON(1); 6608 6522 break; 6609 6523 } ··· 6651 6571 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6652 6572 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6653 6573 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6654 - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, 6655 - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, 6574 + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 6575 + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 6656 6576 [EXIT_REASON_INVEPT] = handle_invept, 6657 6577 }; 6658 6578 ··· 7493 7413 7494 7414 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 7495 7415 | (1 << VCPU_EXREG_RFLAGS) 7496 - | (1 << VCPU_EXREG_CPL) 7497 7416 | (1 << VCPU_EXREG_PDPTR) 7498 7417 | (1 << VCPU_EXREG_SEGMENTS) 7499 7418 | (1 << VCPU_EXREG_CR3)); ··· 8679 8600 leave_guest_mode(vcpu); 8680 8601 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 8681 8602 exit_qualification); 8603 + 8604 + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 8605 + && nested_exit_intr_ack_set(vcpu)) { 8606 + int irq = kvm_cpu_get_interrupt(vcpu); 8607 + WARN_ON(irq < 0); 8608 + vmcs12->vm_exit_intr_info = irq | 8609 + INTR_INFO_VALID_MASK | 
INTR_TYPE_EXT_INTR; 8610 + } 8682 8611 8683 8612 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 8684 8613 vmcs12->exit_qualification,
+29 -32
arch/x86/kvm/x86.c
··· 704 704 } 705 705 706 706 if (is_long_mode(vcpu)) { 707 - if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { 708 - if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 709 - return 1; 710 - } else 711 - if (cr3 & CR3_L_MODE_RESERVED_BITS) 712 - return 1; 713 - } else { 714 - if (is_pae(vcpu)) { 715 - if (cr3 & CR3_PAE_RESERVED_BITS) 716 - return 1; 717 - if (is_paging(vcpu) && 718 - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 719 - return 1; 720 - } 721 - /* 722 - * We don't check reserved bits in nonpae mode, because 723 - * this isn't enforced, and VMware depends on this. 724 - */ 725 - } 707 + if (cr3 & CR3_L_MODE_RESERVED_BITS) 708 + return 1; 709 + } else if (is_pae(vcpu) && is_paging(vcpu) && 710 + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 711 + return 1; 726 712 727 713 vcpu->arch.cr3 = cr3; 728 714 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); ··· 1921 1935 1922 1936 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1923 1937 vcpu->arch.hv_vapic = data; 1938 + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) 1939 + return 1; 1924 1940 break; 1925 1941 } 1926 1942 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; ··· 1933 1945 return 1; 1934 1946 vcpu->arch.hv_vapic = data; 1935 1947 mark_page_dirty(vcpu->kvm, gfn); 1948 + if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) 1949 + return 1; 1936 1950 break; 1937 1951 } 1938 1952 case HV_X64_MSR_EOI: ··· 2637 2647 case KVM_CAP_IRQ_INJECT_STATUS: 2638 2648 case KVM_CAP_IRQFD: 2639 2649 case KVM_CAP_IOEVENTFD: 2650 + case KVM_CAP_IOEVENTFD_NO_LENGTH: 2640 2651 case KVM_CAP_PIT2: 2641 2652 case KVM_CAP_PIT_STATE2: 2642 2653 case KVM_CAP_SET_IDENTITY_MAP_ADDR: ··· 3640 3649 offset = i * BITS_PER_LONG; 3641 3650 kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); 3642 3651 } 3643 - if (is_dirty) 3644 - kvm_flush_remote_tlbs(kvm); 3645 3652 3646 3653 spin_unlock(&kvm->mmu_lock); 3654 + 3655 + /* See the comments in kvm_mmu_slot_remove_write_access(). 
*/ 3656 + lockdep_assert_held(&kvm->slots_lock); 3657 + 3658 + /* 3659 + * All the TLBs can be flushed out of mmu lock, see the comments in 3660 + * kvm_mmu_slot_remove_write_access(). 3661 + */ 3662 + if (is_dirty) 3663 + kvm_flush_remote_tlbs(kvm); 3647 3664 3648 3665 r = -EFAULT; 3649 3666 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) ··· 4488 4489 unsigned short port, void *val, 4489 4490 unsigned int count, bool in) 4490 4491 { 4491 - trace_kvm_pio(!in, port, size, count); 4492 - 4493 4492 vcpu->arch.pio.port = port; 4494 4493 vcpu->arch.pio.in = in; 4495 4494 vcpu->arch.pio.count = count; ··· 4522 4525 if (ret) { 4523 4526 data_avail: 4524 4527 memcpy(val, vcpu->arch.pio_data, size * count); 4528 + trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); 4525 4529 vcpu->arch.pio.count = 0; 4526 4530 return 1; 4527 4531 } ··· 4537 4539 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4538 4540 4539 4541 memcpy(vcpu->arch.pio_data, val, size * count); 4542 + trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); 4540 4543 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); 4541 4544 } 4542 4545 ··· 4647 4648 } 4648 4649 4649 4650 return res; 4650 - } 4651 - 4652 - static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) 4653 - { 4654 - kvm_set_rflags(emul_to_vcpu(ctxt), val); 4655 4651 } 4656 4652 4657 4653 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) ··· 4833 4839 .set_idt = emulator_set_idt, 4834 4840 .get_cr = emulator_get_cr, 4835 4841 .set_cr = emulator_set_cr, 4836 - .set_rflags = emulator_set_rflags, 4837 4842 .cpl = emulator_get_cpl, 4838 4843 .get_dr = emulator_get_dr, 4839 4844 .set_dr = emulator_set_dr, ··· 4898 4905 ctxt->eip = kvm_rip_read(vcpu); 4899 4906 ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4900 4907 (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : 4901 - cs_l ? X86EMUL_MODE_PROT64 : 4908 + (cs_l && is_long_mode(vcpu)) ? 
X86EMUL_MODE_PROT64 : 4902 4909 cs_db ? X86EMUL_MODE_PROT32 : 4903 4910 X86EMUL_MODE_PROT16; 4904 4911 ctxt->guest_mode = is_guest_mode(vcpu); ··· 7326 7333 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 7327 7334 /* 7328 7335 * Write protect all pages for dirty logging. 7329 - * Existing largepage mappings are destroyed here and new ones will 7330 - * not be created until the end of the logging. 7336 + * 7337 + * All the sptes including the large sptes which point to this 7338 + * slot are set to readonly. We can not create any new large 7339 + * spte on this slot until the end of the logging. 7340 + * 7341 + * See the comments in fast_page_fault(). 7331 7342 */ 7332 7343 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7333 7344 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+32 -2
drivers/s390/char/sclp_early.c
··· 22 22 u8 rnsize; /* 10 */ 23 23 u8 _reserved0[16 - 11]; /* 11-15 */ 24 24 u16 ncpurl; /* 16-17 */ 25 - u8 _reserved7[24 - 18]; /* 18-23 */ 25 + u16 cpuoff; /* 18-19 */ 26 + u8 _reserved7[24 - 20]; /* 20-23 */ 26 27 u8 loadparm[8]; /* 24-31 */ 27 28 u8 _reserved1[48 - 32]; /* 32-47 */ 28 29 u64 facilities; /* 48-55 */ 29 - u8 _reserved2[84 - 56]; /* 56-83 */ 30 + u8 _reserved2a[76 - 56]; /* 56-75 */ 31 + u32 ibc; /* 76-79 */ 32 + u8 _reserved2b[84 - 80]; /* 80-83 */ 30 33 u8 fac84; /* 84 */ 31 34 u8 fac85; /* 85 */ 32 35 u8 _reserved3[91 - 86]; /* 86-90 */ ··· 48 45 static unsigned long sclp_hsa_size; 49 46 static unsigned int sclp_max_cpu; 50 47 static struct sclp_ipl_info sclp_ipl_info; 48 + static unsigned char sclp_siif; 49 + static u32 sclp_ibc; 51 50 52 51 u64 sclp_facilities; 53 52 u8 sclp_fac84; ··· 101 96 102 97 static void __init sclp_facilities_detect(struct read_info_sccb *sccb) 103 98 { 99 + struct sclp_cpu_entry *cpue; 100 + u16 boot_cpu_address, cpu; 101 + 104 102 if (sclp_read_info_early(sccb)) 105 103 return; 106 104 ··· 114 106 sclp_rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; 115 107 sclp_rzm = sccb->rnsize ? 
sccb->rnsize : sccb->rnsize2; 116 108 sclp_rzm <<= 20; 109 + sclp_ibc = sccb->ibc; 117 110 118 111 if (!sccb->hcpua) { 119 112 if (MACHINE_IS_VM) ··· 123 114 sclp_max_cpu = sccb->ncpurl; 124 115 } else { 125 116 sclp_max_cpu = sccb->hcpua + 1; 117 + } 118 + 119 + boot_cpu_address = stap(); 120 + cpue = (void *)sccb + sccb->cpuoff; 121 + for (cpu = 0; cpu < sccb->ncpurl; cpue++, cpu++) { 122 + if (boot_cpu_address != cpue->address) 123 + continue; 124 + sclp_siif = cpue->siif; 125 + break; 126 126 } 127 127 128 128 /* Save IPL information */ ··· 165 147 { 166 148 return sclp_max_cpu; 167 149 } 150 + 151 + int sclp_has_siif(void) 152 + { 153 + return sclp_siif; 154 + } 155 + EXPORT_SYMBOL(sclp_has_siif); 156 + 157 + unsigned int sclp_get_ibc(void) 158 + { 159 + return sclp_ibc; 160 + } 161 + EXPORT_SYMBOL(sclp_get_ibc); 168 162 169 163 /* 170 164 * This function will be called after sclp_facilities_detect(), which gets
+12 -3
include/linux/kvm_host.h
··· 134 134 #define KVM_REQ_EPR_EXIT 20 135 135 #define KVM_REQ_SCAN_IOAPIC 21 136 136 #define KVM_REQ_GLOBAL_CLOCK_UPDATE 22 137 + #define KVM_REQ_ENABLE_IBS 23 138 + #define KVM_REQ_DISABLE_IBS 24 137 139 138 140 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 139 141 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 ··· 165 163 KVM_MMIO_BUS, 166 164 KVM_PIO_BUS, 167 165 KVM_VIRTIO_CCW_NOTIFY_BUS, 166 + KVM_FAST_MMIO_BUS, 168 167 KVM_NR_BUSES 169 168 }; 170 169 ··· 370 367 struct mm_struct *mm; /* userspace tied to this vm */ 371 368 struct kvm_memslots *memslots; 372 369 struct srcu_struct srcu; 370 + struct srcu_struct irq_srcu; 373 371 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 374 372 u32 bsp_vcpu_id; 375 373 #endif ··· 414 410 unsigned long mmu_notifier_seq; 415 411 long mmu_notifier_count; 416 412 #endif 417 - /* Protected by mmu_lock */ 418 - bool tlbs_dirty; 419 - 413 + long tlbs_dirty; 420 414 struct list_head devices; 421 415 }; 422 416 ··· 879 877 static inline hpa_t pfn_to_hpa(pfn_t pfn) 880 878 { 881 879 return (hpa_t)pfn << PAGE_SHIFT; 880 + } 881 + 882 + static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) 883 + { 884 + unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); 885 + 886 + return kvm_is_error_hva(hva); 882 887 } 883 888 884 889 static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
+1
include/uapi/linux/Kbuild
··· 317 317 header-y += ppp_defs.h 318 318 header-y += pps.h 319 319 header-y += prctl.h 320 + header-y += psci.h 320 321 header-y += ptp_clock.h 321 322 header-y += ptrace.h 322 323 header-y += qnx4_fs.h
+16 -1
include/uapi/linux/kvm.h
··· 171 171 #define KVM_EXIT_WATCHDOG 21 172 172 #define KVM_EXIT_S390_TSCH 22 173 173 #define KVM_EXIT_EPR 23 174 + #define KVM_EXIT_SYSTEM_EVENT 24 174 175 175 176 /* For KVM_EXIT_INTERNAL_ERROR */ 176 177 /* Emulate instruction failed. */ ··· 302 301 struct { 303 302 __u32 epr; 304 303 } epr; 304 + /* KVM_EXIT_SYSTEM_EVENT */ 305 + struct { 306 + #define KVM_SYSTEM_EVENT_SHUTDOWN 1 307 + #define KVM_SYSTEM_EVENT_RESET 2 308 + __u32 type; 309 + __u64 flags; 310 + } system_event; 305 311 /* Fix the size of the union. */ 306 312 char padding[256]; 307 313 }; ··· 424 416 #define KVM_S390_INT_PFAULT_INIT 0xfffe0004u 425 417 #define KVM_S390_INT_PFAULT_DONE 0xfffe0005u 426 418 #define KVM_S390_MCHK 0xfffe1000u 419 + #define KVM_S390_INT_CLOCK_COMP 0xffff1004u 420 + #define KVM_S390_INT_CPU_TIMER 0xffff1005u 427 421 #define KVM_S390_INT_VIRTIO 0xffff2603u 428 422 #define KVM_S390_INT_SERVICE 0xffff2401u 429 423 #define KVM_S390_INT_EMERGENCY 0xffff1201u ··· 525 515 kvm_ioeventfd_flag_nr_pio, 526 516 kvm_ioeventfd_flag_nr_deassign, 527 517 kvm_ioeventfd_flag_nr_virtio_ccw_notify, 518 + kvm_ioeventfd_flag_nr_fast_mmio, 528 519 kvm_ioeventfd_flag_nr_max, 529 520 }; 530 521 ··· 540 529 struct kvm_ioeventfd { 541 530 __u64 datamatch; 542 531 __u64 addr; /* legal pio/mmio address */ 543 - __u32 len; /* 1, 2, 4, or 8 bytes */ 532 + __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */ 544 533 __s32 fd; 545 534 __u32 flags; 546 535 __u8 pad[36]; ··· 754 743 #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97 755 744 #define KVM_CAP_ENABLE_CAP_VM 98 756 745 #define KVM_CAP_S390_IRQCHIP 99 746 + #define KVM_CAP_IOEVENTFD_NO_LENGTH 100 747 + #define KVM_CAP_VM_ATTRIBUTES 101 748 + #define KVM_CAP_ARM_PSCI_0_2 102 749 + #define KVM_CAP_PPC_FIXUP_HCALL 103 757 750 758 751 #ifdef KVM_CAP_IRQ_ROUTING 759 752
+90
include/uapi/linux/psci.h
··· 1 + /* 2 + * ARM Power State and Coordination Interface (PSCI) header 3 + * 4 + * This header holds common PSCI defines and macros shared 5 + * by: ARM kernel, ARM64 kernel, KVM ARM/ARM64 and user space. 6 + * 7 + * Copyright (C) 2014 Linaro Ltd. 8 + * Author: Anup Patel <anup.patel@linaro.org> 9 + */ 10 + 11 + #ifndef _UAPI_LINUX_PSCI_H 12 + #define _UAPI_LINUX_PSCI_H 13 + 14 + /* 15 + * PSCI v0.1 interface 16 + * 17 + * The PSCI v0.1 function numbers are implementation defined. 18 + * 19 + * Only PSCI return values such as: SUCCESS, NOT_SUPPORTED, 20 + * INVALID_PARAMS, and DENIED defined below are applicable 21 + * to PSCI v0.1. 22 + */ 23 + 24 + /* PSCI v0.2 interface */ 25 + #define PSCI_0_2_FN_BASE 0x84000000 26 + #define PSCI_0_2_FN(n) (PSCI_0_2_FN_BASE + (n)) 27 + #define PSCI_0_2_64BIT 0x40000000 28 + #define PSCI_0_2_FN64_BASE \ 29 + (PSCI_0_2_FN_BASE + PSCI_0_2_64BIT) 30 + #define PSCI_0_2_FN64(n) (PSCI_0_2_FN64_BASE + (n)) 31 + 32 + #define PSCI_0_2_FN_PSCI_VERSION PSCI_0_2_FN(0) 33 + #define PSCI_0_2_FN_CPU_SUSPEND PSCI_0_2_FN(1) 34 + #define PSCI_0_2_FN_CPU_OFF PSCI_0_2_FN(2) 35 + #define PSCI_0_2_FN_CPU_ON PSCI_0_2_FN(3) 36 + #define PSCI_0_2_FN_AFFINITY_INFO PSCI_0_2_FN(4) 37 + #define PSCI_0_2_FN_MIGRATE PSCI_0_2_FN(5) 38 + #define PSCI_0_2_FN_MIGRATE_INFO_TYPE PSCI_0_2_FN(6) 39 + #define PSCI_0_2_FN_MIGRATE_INFO_UP_CPU PSCI_0_2_FN(7) 40 + #define PSCI_0_2_FN_SYSTEM_OFF PSCI_0_2_FN(8) 41 + #define PSCI_0_2_FN_SYSTEM_RESET PSCI_0_2_FN(9) 42 + 43 + #define PSCI_0_2_FN64_CPU_SUSPEND PSCI_0_2_FN64(1) 44 + #define PSCI_0_2_FN64_CPU_ON PSCI_0_2_FN64(3) 45 + #define PSCI_0_2_FN64_AFFINITY_INFO PSCI_0_2_FN64(4) 46 + #define PSCI_0_2_FN64_MIGRATE PSCI_0_2_FN64(5) 47 + #define PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU PSCI_0_2_FN64(7) 48 + 49 + /* PSCI v0.2 power state encoding for CPU_SUSPEND function */ 50 + #define PSCI_0_2_POWER_STATE_ID_MASK 0xffff 51 + #define PSCI_0_2_POWER_STATE_ID_SHIFT 0 52 + #define PSCI_0_2_POWER_STATE_TYPE_SHIFT 16 53 + #define 
PSCI_0_2_POWER_STATE_TYPE_MASK \ 54 + (0x1 << PSCI_0_2_POWER_STATE_TYPE_SHIFT) 55 + #define PSCI_0_2_POWER_STATE_AFFL_SHIFT 24 56 + #define PSCI_0_2_POWER_STATE_AFFL_MASK \ 57 + (0x3 << PSCI_0_2_POWER_STATE_AFFL_SHIFT) 58 + 59 + /* PSCI v0.2 affinity level state returned by AFFINITY_INFO */ 60 + #define PSCI_0_2_AFFINITY_LEVEL_ON 0 61 + #define PSCI_0_2_AFFINITY_LEVEL_OFF 1 62 + #define PSCI_0_2_AFFINITY_LEVEL_ON_PENDING 2 63 + 64 + /* PSCI v0.2 multicore support in Trusted OS returned by MIGRATE_INFO_TYPE */ 65 + #define PSCI_0_2_TOS_UP_MIGRATE 0 66 + #define PSCI_0_2_TOS_UP_NO_MIGRATE 1 67 + #define PSCI_0_2_TOS_MP 2 68 + 69 + /* PSCI version decoding (independent of PSCI version) */ 70 + #define PSCI_VERSION_MAJOR_SHIFT 16 71 + #define PSCI_VERSION_MINOR_MASK \ 72 + ((1U << PSCI_VERSION_MAJOR_SHIFT) - 1) 73 + #define PSCI_VERSION_MAJOR_MASK ~PSCI_VERSION_MINOR_MASK 74 + #define PSCI_VERSION_MAJOR(ver) \ 75 + (((ver) & PSCI_VERSION_MAJOR_MASK) >> PSCI_VERSION_MAJOR_SHIFT) 76 + #define PSCI_VERSION_MINOR(ver) \ 77 + ((ver) & PSCI_VERSION_MINOR_MASK) 78 + 79 + /* PSCI return values (inclusive of all PSCI versions) */ 80 + #define PSCI_RET_SUCCESS 0 81 + #define PSCI_RET_NOT_SUPPORTED -1 82 + #define PSCI_RET_INVALID_PARAMS -2 83 + #define PSCI_RET_DENIED -3 84 + #define PSCI_RET_ALREADY_ON -4 85 + #define PSCI_RET_ON_PENDING -5 86 + #define PSCI_RET_INTERNAL_FAILURE -6 87 + #define PSCI_RET_NOT_PRESENT -7 88 + #define PSCI_RET_DISABLED -8 89 + 90 + #endif /* _UAPI_LINUX_PSCI_H */
+1 -3
virt/kvm/async_pf.c
··· 80 80 81 81 might_sleep(); 82 82 83 - use_mm(mm); 84 83 down_read(&mm->mmap_sem); 85 - get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL); 84 + get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL); 86 85 up_read(&mm->mmap_sem); 87 86 kvm_async_page_present_sync(vcpu, apf); 88 - unuse_mm(mm); 89 87 90 88 spin_lock(&vcpu->async_pf.lock); 91 89 list_add_tail(&apf->link, &vcpu->async_pf.done);
+53 -15
virt/kvm/eventfd.c
··· 31 31 #include <linux/list.h> 32 32 #include <linux/eventfd.h> 33 33 #include <linux/kernel.h> 34 + #include <linux/srcu.h> 34 35 #include <linux/slab.h> 35 36 36 37 #include "iodev.h" ··· 119 118 irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) 120 119 { 121 120 struct _irqfd_resampler *resampler; 121 + struct kvm *kvm; 122 122 struct _irqfd *irqfd; 123 + int idx; 123 124 124 125 resampler = container_of(kian, struct _irqfd_resampler, notifier); 126 + kvm = resampler->kvm; 125 127 126 - kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 128 + kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 127 129 resampler->notifier.gsi, 0, false); 128 130 129 - rcu_read_lock(); 131 + idx = srcu_read_lock(&kvm->irq_srcu); 130 132 131 133 list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) 132 134 eventfd_signal(irqfd->resamplefd, 1); 133 135 134 - rcu_read_unlock(); 136 + srcu_read_unlock(&kvm->irq_srcu, idx); 135 137 } 136 138 137 139 static void ··· 146 142 mutex_lock(&kvm->irqfds.resampler_lock); 147 143 148 144 list_del_rcu(&irqfd->resampler_link); 149 - synchronize_rcu(); 145 + synchronize_srcu(&kvm->irq_srcu); 150 146 151 147 if (list_empty(&resampler->list)) { 152 148 list_del(&resampler->link); ··· 225 221 unsigned long flags = (unsigned long)key; 226 222 struct kvm_kernel_irq_routing_entry *irq; 227 223 struct kvm *kvm = irqfd->kvm; 224 + int idx; 228 225 229 226 if (flags & POLLIN) { 230 - rcu_read_lock(); 231 - irq = rcu_dereference(irqfd->irq_entry); 227 + idx = srcu_read_lock(&kvm->irq_srcu); 228 + irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu); 232 229 /* An event has been signaled, inject an interrupt */ 233 230 if (irq) 234 231 kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, 235 232 false); 236 233 else 237 234 schedule_work(&irqfd->inject); 238 - rcu_read_unlock(); 235 + srcu_read_unlock(&kvm->irq_srcu, idx); 239 236 } 240 237 241 238 if (flags & POLLHUP) { ··· 368 363 } 369 364 370 365 
list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); 371 - synchronize_rcu(); 366 + synchronize_srcu(&kvm->irq_srcu); 372 367 373 368 mutex_unlock(&kvm->irqfds.resampler_lock); 374 369 } ··· 470 465 * another thread calls kvm_irq_routing_update before 471 466 * we flush workqueue below (we synchronize with 472 467 * kvm_irq_routing_update using irqfds.lock). 473 - * It is paired with synchronize_rcu done by caller 468 + * It is paired with synchronize_srcu done by caller 474 469 * of that function. 475 470 */ 476 471 rcu_assign_pointer(irqfd->irq_entry, NULL); ··· 529 524 530 525 /* 531 526 * Change irq_routing and irqfd. 532 - * Caller must invoke synchronize_rcu afterwards. 527 + * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. 533 528 */ 534 529 void kvm_irq_routing_update(struct kvm *kvm, 535 530 struct kvm_irq_routing_table *irq_rt) ··· 605 600 { 606 601 u64 _val; 607 602 608 - if (!(addr == p->addr && len == p->length)) 603 + if (addr != p->addr) 604 + /* address must be precise for a hit */ 605 + return false; 606 + 607 + if (!p->length) 608 + /* length = 0 means only look at the address, so always a hit */ 609 + return true; 610 + 611 + if (len != p->length) 609 612 /* address-range must be precise for a hit */ 610 613 return false; 611 614 ··· 684 671 685 672 list_for_each_entry(_p, &kvm->ioeventfds, list) 686 673 if (_p->bus_idx == p->bus_idx && 687 - _p->addr == p->addr && _p->length == p->length && 688 - (_p->wildcard || p->wildcard || 689 - _p->datamatch == p->datamatch)) 674 + _p->addr == p->addr && 675 + (!_p->length || !p->length || 676 + (_p->length == p->length && 677 + (_p->wildcard || p->wildcard || 678 + _p->datamatch == p->datamatch)))) 690 679 return true; 691 680 692 681 return false; ··· 712 697 int ret; 713 698 714 699 bus_idx = ioeventfd_bus_from_flags(args->flags); 715 - /* must be natural-word sized */ 700 + /* must be natural-word sized, or 0 to ignore length */ 716 701 switch (args->len) { 702 + case 0: 717 
703 case 1: 718 704 case 2: 719 705 case 4: ··· 730 714 731 715 /* check for extra flags that we don't understand */ 732 716 if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) 717 + return -EINVAL; 718 + 719 + /* ioeventfd with no length can't be combined with DATAMATCH */ 720 + if (!args->len && 721 + args->flags & (KVM_IOEVENTFD_FLAG_PIO | 722 + KVM_IOEVENTFD_FLAG_DATAMATCH)) 733 723 return -EINVAL; 734 724 735 725 eventfd = eventfd_ctx_fdget(args->fd); ··· 775 753 if (ret < 0) 776 754 goto unlock_fail; 777 755 756 + /* When length is ignored, MMIO is also put on a separate bus, for 757 + * faster lookups. 758 + */ 759 + if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) { 760 + ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS, 761 + p->addr, 0, &p->dev); 762 + if (ret < 0) 763 + goto register_fail; 764 + } 765 + 778 766 kvm->buses[bus_idx]->ioeventfd_count++; 779 767 list_add_tail(&p->list, &kvm->ioeventfds); 780 768 ··· 792 760 793 761 return 0; 794 762 763 + register_fail: 764 + kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); 795 765 unlock_fail: 796 766 mutex_unlock(&kvm->slots_lock); 797 767 ··· 833 799 continue; 834 800 835 801 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); 802 + if (!p->length) { 803 + kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS, 804 + &p->dev); 805 + } 836 806 kvm->buses[bus_idx]->ioeventfd_count--; 837 807 ioeventfd_release(p); 838 808 ret = 0;
+9 -8
virt/kvm/irq_comm.c
··· 163 163 struct kvm_kernel_irq_routing_entry *e; 164 164 int ret = -EINVAL; 165 165 struct kvm_irq_routing_table *irq_rt; 166 + int idx; 166 167 167 168 trace_kvm_set_irq(irq, level, irq_source_id); 168 169 ··· 175 174 * Since there's no easy way to do this, we only support injecting MSI 176 175 * which is limited to 1:1 GSI mapping. 177 176 */ 178 - rcu_read_lock(); 179 - irq_rt = rcu_dereference(kvm->irq_routing); 177 + idx = srcu_read_lock(&kvm->irq_srcu); 178 + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 180 179 if (irq < irq_rt->nr_rt_entries) 181 180 hlist_for_each_entry(e, &irq_rt->map[irq], link) { 182 181 if (likely(e->type == KVM_IRQ_ROUTING_MSI)) ··· 185 184 ret = -EWOULDBLOCK; 186 185 break; 187 186 } 188 - rcu_read_unlock(); 187 + srcu_read_unlock(&kvm->irq_srcu, idx); 189 188 return ret; 190 189 } 191 190 ··· 254 253 mutex_lock(&kvm->irq_lock); 255 254 hlist_del_rcu(&kimn->link); 256 255 mutex_unlock(&kvm->irq_lock); 257 - synchronize_rcu(); 256 + synchronize_srcu(&kvm->irq_srcu); 258 257 } 259 258 260 259 void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 261 260 bool mask) 262 261 { 263 262 struct kvm_irq_mask_notifier *kimn; 264 - int gsi; 263 + int idx, gsi; 265 264 266 - rcu_read_lock(); 267 - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 265 + idx = srcu_read_lock(&kvm->irq_srcu); 266 + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; 268 267 if (gsi != -1) 269 268 hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link) 270 269 if (kimn->irq == gsi) 271 270 kimn->func(kimn, mask); 272 - rcu_read_unlock(); 271 + srcu_read_unlock(&kvm->irq_srcu, idx); 273 272 } 274 273 275 274 int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+16 -15
virt/kvm/irqchip.c
··· 26 26 27 27 #include <linux/kvm_host.h> 28 28 #include <linux/slab.h> 29 + #include <linux/srcu.h> 29 30 #include <linux/export.h> 30 31 #include <trace/events/kvm.h> 31 32 #include "irq.h" ··· 34 33 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) 35 34 { 36 35 struct kvm_irq_ack_notifier *kian; 37 - int gsi; 36 + int gsi, idx; 38 37 39 - rcu_read_lock(); 40 - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 38 + idx = srcu_read_lock(&kvm->irq_srcu); 39 + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; 41 40 if (gsi != -1) 42 41 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 43 42 link) 44 43 if (kian->gsi == gsi) { 45 - rcu_read_unlock(); 44 + srcu_read_unlock(&kvm->irq_srcu, idx); 46 45 return true; 47 46 } 48 47 49 - rcu_read_unlock(); 48 + srcu_read_unlock(&kvm->irq_srcu, idx); 50 49 51 50 return false; 52 51 } ··· 55 54 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 56 55 { 57 56 struct kvm_irq_ack_notifier *kian; 58 - int gsi; 57 + int gsi, idx; 59 58 60 59 trace_kvm_ack_irq(irqchip, pin); 61 60 62 - rcu_read_lock(); 63 - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 61 + idx = srcu_read_lock(&kvm->irq_srcu); 62 + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; 64 63 if (gsi != -1) 65 64 hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, 66 65 link) 67 66 if (kian->gsi == gsi) 68 67 kian->irq_acked(kian); 69 - rcu_read_unlock(); 68 + srcu_read_unlock(&kvm->irq_srcu, idx); 70 69 } 71 70 72 71 void kvm_register_irq_ack_notifier(struct kvm *kvm, ··· 86 85 mutex_lock(&kvm->irq_lock); 87 86 hlist_del_init_rcu(&kian->link); 88 87 mutex_unlock(&kvm->irq_lock); 89 - synchronize_rcu(); 88 + synchronize_srcu(&kvm->irq_srcu); 90 89 #ifdef __KVM_HAVE_IOAPIC 91 90 kvm_vcpu_request_scan_ioapic(kvm); 92 91 #endif ··· 116 115 bool line_status) 117 116 { 118 117 struct kvm_kernel_irq_routing_entry *e, 
irq_set[KVM_NR_IRQCHIPS]; 119 - int ret = -1, i = 0; 118 + int ret = -1, i = 0, idx; 120 119 struct kvm_irq_routing_table *irq_rt; 121 120 122 121 trace_kvm_set_irq(irq, level, irq_source_id); ··· 125 124 * IOAPIC. So set the bit in both. The guest will ignore 126 125 * writes to the unused one. 127 126 */ 128 - rcu_read_lock(); 129 - irq_rt = rcu_dereference(kvm->irq_routing); 127 + idx = srcu_read_lock(&kvm->irq_srcu); 128 + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 130 129 if (irq < irq_rt->nr_rt_entries) 131 130 hlist_for_each_entry(e, &irq_rt->map[irq], link) 132 131 irq_set[i++] = *e; 133 - rcu_read_unlock(); 132 + srcu_read_unlock(&kvm->irq_srcu, idx); 134 133 135 134 while(i--) { 136 135 int r; ··· 227 226 kvm_irq_routing_update(kvm, new); 228 227 mutex_unlock(&kvm->irq_lock); 229 228 230 - synchronize_rcu(); 229 + synchronize_srcu_expedited(&kvm->irq_srcu); 231 230 232 231 new = old; 233 232 r = 0;
+16 -9
virt/kvm/kvm_main.c
··· 186 186 187 187 void kvm_flush_remote_tlbs(struct kvm *kvm) 188 188 { 189 + long dirty_count = kvm->tlbs_dirty; 190 + 191 + smp_mb(); 189 192 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 190 193 ++kvm->stat.remote_tlb_flush; 191 - kvm->tlbs_dirty = false; 194 + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 192 195 } 193 196 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 194 197 ··· 457 454 458 455 r = kvm_arch_init_vm(kvm, type); 459 456 if (r) 460 - goto out_err_nodisable; 457 + goto out_err_no_disable; 461 458 462 459 r = hardware_enable_all(); 463 460 if (r) 464 - goto out_err_nodisable; 461 + goto out_err_no_disable; 465 462 466 463 #ifdef CONFIG_HAVE_KVM_IRQCHIP 467 464 INIT_HLIST_HEAD(&kvm->mask_notifier_list); ··· 473 470 r = -ENOMEM; 474 471 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 475 472 if (!kvm->memslots) 476 - goto out_err_nosrcu; 473 + goto out_err_no_srcu; 477 474 kvm_init_memslots_id(kvm); 478 475 if (init_srcu_struct(&kvm->srcu)) 479 - goto out_err_nosrcu; 476 + goto out_err_no_srcu; 477 + if (init_srcu_struct(&kvm->irq_srcu)) 478 + goto out_err_no_irq_srcu; 480 479 for (i = 0; i < KVM_NR_BUSES; i++) { 481 480 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 482 481 GFP_KERNEL); ··· 507 502 return kvm; 508 503 509 504 out_err: 505 + cleanup_srcu_struct(&kvm->irq_srcu); 506 + out_err_no_irq_srcu: 510 507 cleanup_srcu_struct(&kvm->srcu); 511 - out_err_nosrcu: 508 + out_err_no_srcu: 512 509 hardware_disable_all(); 513 - out_err_nodisable: 510 + out_err_no_disable: 514 511 for (i = 0; i < KVM_NR_BUSES; i++) 515 512 kfree(kvm->buses[i]); 516 513 kfree(kvm->memslots); ··· 608 601 kvm_arch_destroy_vm(kvm); 609 602 kvm_destroy_devices(kvm); 610 603 kvm_free_physmem(kvm); 604 + cleanup_srcu_struct(&kvm->irq_srcu); 611 605 cleanup_srcu_struct(&kvm->srcu); 612 606 kvm_arch_free_vm(kvm); 613 607 hardware_disable_all(); ··· 645 637 */ 646 638 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 647 639 { 648 - #ifndef 
CONFIG_S390 649 640 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 650 641 651 642 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); 652 643 if (!memslot->dirty_bitmap) 653 644 return -ENOMEM; 654 645 655 - #endif /* !CONFIG_S390 */ 656 646 return 0; 657 647 } 658 648 ··· 2928 2922 2929 2923 return -EOPNOTSUPP; 2930 2924 } 2925 + EXPORT_SYMBOL_GPL(kvm_io_bus_write); 2931 2926 2932 2927 /* kvm_io_bus_read - called under kvm->slots_lock */ 2933 2928 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,