Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Radim Krčmář:
"First batch of KVM changes for 4.15

Common:
- Python 3 support in kvm_stat
- Accounting of slabs to kmemcg

ARM:
- Optimized arch timer handling for KVM/ARM
- Improvements to the VGIC ITS code and introduction of an ITS reset
ioctl
- Unification of the 32-bit fault injection logic
- More exact external abort matching logic

PPC:
- Support for running hashed page table (HPT) MMU mode on a host that
is using the radix MMU mode; single-threaded mode on POWER9 is
added as a prerequisite
- Resolution of merge conflicts with the last-second 4.14 HPT fixes
- Fixes and cleanups

s390:
- Some initial preparation patches for exitless interrupts and crypto
- New capability for AIS migration
- Fixes

x86:
- Improved emulation of LAPIC timer mode changes, MCi_STATUS MSRs,
and after-reset state
- Refined dependencies for VMX features
- Fixes for nested SMI injection
- A lot of cleanups"

* tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (89 commits)
KVM: s390: provide a capability for AIS state migration
KVM: s390: clear_io_irq() requests are not expected for adapter interrupts
KVM: s390: abstract conversion between isc and enum irq_types
KVM: s390: vsie: use common code functions for pinning
KVM: s390: SIE considerations for AP Queue virtualization
KVM: s390: document memory ordering for kvm_s390_vcpu_wakeup
KVM: PPC: Book3S HV: Cosmetic post-merge cleanups
KVM: arm/arm64: fix the incompatible matching for external abort
KVM: arm/arm64: Unify 32bit fault injection
KVM: arm/arm64: vgic-its: Implement KVM_DEV_ARM_ITS_CTRL_RESET
KVM: arm/arm64: Document KVM_DEV_ARM_ITS_CTRL_RESET
KVM: arm/arm64: vgic-its: Free caches when GITS_BASER Valid bit is cleared
KVM: arm/arm64: vgic-its: New helper functions to free the caches
KVM: arm/arm64: vgic-its: Remove kvm_its_unmap_device
arm/arm64: KVM: Load the timer state when enabling the timer
KVM: arm/arm64: Rework kvm_timer_should_fire
KVM: arm/arm64: Get rid of kvm_timer_flush_hwstate
KVM: arm/arm64: Avoid phys timer emulation in vcpu entry/exit
KVM: arm/arm64: Move phys_timer_emulate function
KVM: arm/arm64: Use kvm_arm_timer_set/get_reg for guest register traps
...

+2268 -1274
+13
Documentation/virtual/kvm/api.txt
···
 or any mmio address. The guest may malfunction if it accesses this memory
 region.

+Setting the address to 0 will result in resetting the address to its default
+(0xfffbc000).
+
 This ioctl is required on Intel-based hosts. This is needed on Intel hardware
 because of a quirk in the virtualization implementation (see the internals
 documentation when it pops into existence).

+Fails if any VCPU has already been created.

 4.41 KVM_SET_BOOT_CPU_ID

···
 value is used to denote the target vcpu for a SynIC interrupt. For
 compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this
 capability is absent, userspace can still query this msr's value.
+
+8.13 KVM_CAP_S390_AIS_MIGRATION
+
+Architectures: s390
+Parameters: none
+
+This capability indicates if the flic device will be able to get/set the
+AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
+to discover this without having to create a flic device.
+20
Documentation/virtual/kvm/devices/arm-vgic-its.txt
···
 request the initialization of the ITS, no additional parameter in
 kvm_device_attr.addr.

+  KVM_DEV_ARM_ITS_CTRL_RESET
+    reset the ITS, no additional parameter in kvm_device_attr.addr.
+    See "ITS Reset State" section.
+
 KVM_DEV_ARM_ITS_SAVE_TABLES
 save the ITS table data into guest RAM, at the location provisioned
 by the guest in corresponding registers/table entries.
···
 - pINTID is the physical LPI ID; if zero, it means the entry is not valid
 and other fields are not meaningful.
 - ICID is the collection ID
+
+ITS Reset State:
+----------------
+
+RESET returns the ITS to the same state that it was when first created and
+initialized. When the RESET command returns, the following things are
+guaranteed:
+
+- The ITS is not enabled and quiescent
+  GITS_CTLR.Enabled = 0 .Quiescent=1
+- There is no internally cached state
+- No collection or device table are used
+  GITS_BASER<n>.Valid = 0
+- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0
+- The ABI version is unchanged and remains the one set when the ITS
+  device was first created.
+5
Documentation/virtual/kvm/devices/s390_flic.txt
···
 to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and
 nimm bit presents AIS mode for a ISC.

+KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION.
+
 Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on
 FLIC with an unknown group or attribute gives the error code EINVAL (instead of
 ENXIO, as specified in the API documentation). It is not possible to conclude
 that a FLIC operation is unavailable based on the error code resulting from a
 usage attempt.
+
+Note: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a zero
+schid is specified.
+2
arch/arm/include/asm/kvm_asm.h
··· 68 68 extern void __kvm_tlb_flush_vmid(struct kvm *kvm); 69 69 extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); 70 70 71 + extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); 72 + 71 73 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 72 74 73 75 extern void __init_stage2_translation(void);
+34 -4
arch/arm/include/asm/kvm_emulate.h
··· 25 25 #include <asm/kvm_arm.h> 26 26 #include <asm/cputype.h> 27 27 28 + /* arm64 compatibility macros */ 29 + #define COMPAT_PSR_MODE_ABT ABT_MODE 30 + #define COMPAT_PSR_MODE_UND UND_MODE 31 + #define COMPAT_PSR_T_BIT PSR_T_BIT 32 + #define COMPAT_PSR_I_BIT PSR_I_BIT 33 + #define COMPAT_PSR_A_BIT PSR_A_BIT 34 + #define COMPAT_PSR_E_BIT PSR_E_BIT 35 + #define COMPAT_PSR_IT_MASK PSR_IT_MASK 36 + 28 37 unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); 38 + 39 + static inline unsigned long *vcpu_reg32(struct kvm_vcpu *vcpu, u8 reg_num) 40 + { 41 + return vcpu_reg(vcpu, reg_num); 42 + } 43 + 29 44 unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu); 30 45 31 46 static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu, ··· 57 42 58 43 bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); 59 44 void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); 60 - void kvm_inject_undefined(struct kvm_vcpu *vcpu); 45 + void kvm_inject_undef32(struct kvm_vcpu *vcpu); 46 + void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr); 47 + void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); 61 48 void kvm_inject_vabt(struct kvm_vcpu *vcpu); 62 - void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); 63 - void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); 49 + 50 + static inline void kvm_inject_undefined(struct kvm_vcpu *vcpu) 51 + { 52 + kvm_inject_undef32(vcpu); 53 + } 54 + 55 + static inline void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) 56 + { 57 + kvm_inject_dabt32(vcpu, addr); 58 + } 59 + 60 + static inline void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) 61 + { 62 + kvm_inject_pabt32(vcpu, addr); 63 + } 64 64 65 65 static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) 66 66 { ··· 233 203 234 204 static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) 235 205 { 236 - switch (kvm_vcpu_trap_get_fault_type(vcpu)) { 206 + switch 
(kvm_vcpu_trap_get_fault(vcpu)) { 237 207 case FSC_SEA: 238 208 case FSC_SEA_TTW0: 239 209 case FSC_SEA_TTW1:
+2 -2
arch/arm/include/asm/kvm_hyp.h
··· 98 98 #define cntvoff_el2 CNTVOFF 99 99 #define cnthctl_el2 CNTHCTL 100 100 101 - void __timer_save_state(struct kvm_vcpu *vcpu); 102 - void __timer_restore_state(struct kvm_vcpu *vcpu); 101 + void __timer_enable_traps(struct kvm_vcpu *vcpu); 102 + void __timer_disable_traps(struct kvm_vcpu *vcpu); 103 103 104 104 void __vgic_v2_save_state(struct kvm_vcpu *vcpu); 105 105 void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+7
arch/arm/include/uapi/asm/kvm.h
··· 152 152 (__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64) 153 153 #define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__) 154 154 155 + /* PL1 Physical Timer Registers */ 156 + #define KVM_REG_ARM_PTIMER_CTL ARM_CP15_REG32(0, 14, 2, 1) 157 + #define KVM_REG_ARM_PTIMER_CNT ARM_CP15_REG64(0, 14) 158 + #define KVM_REG_ARM_PTIMER_CVAL ARM_CP15_REG64(2, 14) 159 + 160 + /* Virtual Timer Registers */ 155 161 #define KVM_REG_ARM_TIMER_CTL ARM_CP15_REG32(0, 14, 3, 1) 156 162 #define KVM_REG_ARM_TIMER_CNT ARM_CP15_REG64(1, 14) 157 163 #define KVM_REG_ARM_TIMER_CVAL ARM_CP15_REG64(3, 14) ··· 222 216 #define KVM_DEV_ARM_ITS_SAVE_TABLES 1 223 217 #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 224 218 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 219 + #define KVM_DEV_ARM_ITS_CTRL_RESET 4 225 220 226 221 /* KVM_IRQ_LINE irq field index values */ 227 222 #define KVM_ARM_IRQ_TYPE_SHIFT 24
-137
arch/arm/kvm/emulate.c
··· 165 165 * Inject exceptions into the guest 166 166 */ 167 167 168 - static u32 exc_vector_base(struct kvm_vcpu *vcpu) 169 - { 170 - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); 171 - u32 vbar = vcpu_cp15(vcpu, c12_VBAR); 172 - 173 - if (sctlr & SCTLR_V) 174 - return 0xffff0000; 175 - else /* always have security exceptions */ 176 - return vbar; 177 - } 178 - 179 - /* 180 - * Switch to an exception mode, updating both CPSR and SPSR. Follow 181 - * the logic described in AArch32.EnterMode() from the ARMv8 ARM. 182 - */ 183 - static void kvm_update_psr(struct kvm_vcpu *vcpu, unsigned long mode) 184 - { 185 - unsigned long cpsr = *vcpu_cpsr(vcpu); 186 - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); 187 - 188 - *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | mode; 189 - 190 - switch (mode) { 191 - case FIQ_MODE: 192 - *vcpu_cpsr(vcpu) |= PSR_F_BIT; 193 - /* Fall through */ 194 - case ABT_MODE: 195 - case IRQ_MODE: 196 - *vcpu_cpsr(vcpu) |= PSR_A_BIT; 197 - /* Fall through */ 198 - default: 199 - *vcpu_cpsr(vcpu) |= PSR_I_BIT; 200 - } 201 - 202 - *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT); 203 - 204 - if (sctlr & SCTLR_TE) 205 - *vcpu_cpsr(vcpu) |= PSR_T_BIT; 206 - if (sctlr & SCTLR_EE) 207 - *vcpu_cpsr(vcpu) |= PSR_E_BIT; 208 - 209 - /* Note: These now point to the mode banked copies */ 210 - *vcpu_spsr(vcpu) = cpsr; 211 - } 212 - 213 - /** 214 - * kvm_inject_undefined - inject an undefined exception into the guest 215 - * @vcpu: The VCPU to receive the undefined exception 216 - * 217 - * It is assumed that this code is called from the VCPU thread and that the 218 - * VCPU therefore is not currently executing guest code. 219 - * 220 - * Modelled after TakeUndefInstrException() pseudocode. 221 - */ 222 - void kvm_inject_undefined(struct kvm_vcpu *vcpu) 223 - { 224 - unsigned long cpsr = *vcpu_cpsr(vcpu); 225 - bool is_thumb = (cpsr & PSR_T_BIT); 226 - u32 vect_offset = 4; 227 - u32 return_offset = (is_thumb) ? 
2 : 4; 228 - 229 - kvm_update_psr(vcpu, UND_MODE); 230 - *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; 231 - 232 - /* Branch to exception vector */ 233 - *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; 234 - } 235 - 236 - /* 237 - * Modelled after TakeDataAbortException() and TakePrefetchAbortException 238 - * pseudocode. 239 - */ 240 - static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr) 241 - { 242 - u32 vect_offset; 243 - u32 return_offset = (is_pabt) ? 4 : 8; 244 - bool is_lpae; 245 - 246 - kvm_update_psr(vcpu, ABT_MODE); 247 - *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; 248 - 249 - if (is_pabt) 250 - vect_offset = 12; 251 - else 252 - vect_offset = 16; 253 - 254 - /* Branch to exception vector */ 255 - *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; 256 - 257 - if (is_pabt) { 258 - /* Set IFAR and IFSR */ 259 - vcpu_cp15(vcpu, c6_IFAR) = addr; 260 - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); 261 - /* Always give debug fault for now - should give guest a clue */ 262 - if (is_lpae) 263 - vcpu_cp15(vcpu, c5_IFSR) = 1 << 9 | 0x22; 264 - else 265 - vcpu_cp15(vcpu, c5_IFSR) = 2; 266 - } else { /* !iabt */ 267 - /* Set DFAR and DFSR */ 268 - vcpu_cp15(vcpu, c6_DFAR) = addr; 269 - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); 270 - /* Always give debug fault for now - should give guest a clue */ 271 - if (is_lpae) 272 - vcpu_cp15(vcpu, c5_DFSR) = 1 << 9 | 0x22; 273 - else 274 - vcpu_cp15(vcpu, c5_DFSR) = 2; 275 - } 276 - 277 - } 278 - 279 - /** 280 - * kvm_inject_dabt - inject a data abort into the guest 281 - * @vcpu: The VCPU to receive the undefined exception 282 - * @addr: The address to report in the DFAR 283 - * 284 - * It is assumed that this code is called from the VCPU thread and that the 285 - * VCPU therefore is not currently executing guest code. 
286 - */ 287 - void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) 288 - { 289 - inject_abt(vcpu, false, addr); 290 - } 291 - 292 - /** 293 - * kvm_inject_pabt - inject a prefetch abort into the guest 294 - * @vcpu: The VCPU to receive the undefined exception 295 - * @addr: The address to report in the DFAR 296 - * 297 - * It is assumed that this code is called from the VCPU thread and that the 298 - * VCPU therefore is not currently executing guest code. 299 - */ 300 - void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) 301 - { 302 - inject_abt(vcpu, true, addr); 303 - } 304 - 305 168 /** 306 169 * kvm_inject_vabt - inject an async abort / SError into the guest 307 170 * @vcpu: The VCPU to receive the exception
+4 -3
arch/arm/kvm/hyp/switch.c
··· 174 174 __activate_vm(vcpu); 175 175 176 176 __vgic_restore_state(vcpu); 177 - __timer_restore_state(vcpu); 177 + __timer_enable_traps(vcpu); 178 178 179 179 __sysreg_restore_state(guest_ctxt); 180 180 __banked_restore_state(guest_ctxt); ··· 191 191 192 192 __banked_save_state(guest_ctxt); 193 193 __sysreg_save_state(guest_ctxt); 194 - __timer_save_state(vcpu); 194 + __timer_disable_traps(vcpu); 195 + 195 196 __vgic_save_state(vcpu); 196 197 197 198 __deactivate_traps(vcpu); ··· 238 237 239 238 vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR); 240 239 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); 241 - __timer_save_state(vcpu); 240 + __timer_disable_traps(vcpu); 242 241 __deactivate_traps(vcpu); 243 242 __deactivate_vm(vcpu); 244 243 __banked_restore_state(host_ctxt);
+3 -5
arch/arm64/include/asm/arch_timer.h
··· 52 52 const char *desc; 53 53 u32 (*read_cntp_tval_el0)(void); 54 54 u32 (*read_cntv_tval_el0)(void); 55 + u64 (*read_cntpct_el0)(void); 55 56 u64 (*read_cntvct_el0)(void); 56 57 int (*set_next_event_phys)(unsigned long, struct clock_event_device *); 57 58 int (*set_next_event_virt)(unsigned long, struct clock_event_device *); ··· 150 149 151 150 static inline u64 arch_counter_get_cntpct(void) 152 151 { 153 - /* 154 - * AArch64 kernel and user space mandate the use of CNTVCT. 155 - */ 156 - BUG(); 157 - return 0; 152 + isb(); 153 + return arch_timer_reg_read_stable(cntpct_el0); 158 154 } 159 155 160 156 static inline u64 arch_counter_get_cntvct(void)
+2
arch/arm64/include/asm/kvm_asm.h
··· 55 55 extern void __kvm_tlb_flush_vmid(struct kvm *kvm); 56 56 extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); 57 57 58 + extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); 59 + 58 60 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 59 61 60 62 extern u64 __vgic_v3_get_ich_vtr_el2(void);
+4 -1
arch/arm64/include/asm/kvm_emulate.h
··· 41 41 void kvm_inject_vabt(struct kvm_vcpu *vcpu); 42 42 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); 43 43 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); 44 + void kvm_inject_undef32(struct kvm_vcpu *vcpu); 45 + void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr); 46 + void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); 44 47 45 48 static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) 46 49 { ··· 240 237 241 238 static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu) 242 239 { 243 - switch (kvm_vcpu_trap_get_fault_type(vcpu)) { 240 + switch (kvm_vcpu_trap_get_fault(vcpu)) { 244 241 case FSC_SEA: 245 242 case FSC_SEA_TTW0: 246 243 case FSC_SEA_TTW1:
+2 -2
arch/arm64/include/asm/kvm_hyp.h
··· 129 129 void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); 130 130 int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); 131 131 132 - void __timer_save_state(struct kvm_vcpu *vcpu); 133 - void __timer_restore_state(struct kvm_vcpu *vcpu); 132 + void __timer_enable_traps(struct kvm_vcpu *vcpu); 133 + void __timer_disable_traps(struct kvm_vcpu *vcpu); 134 134 135 135 void __sysreg_save_host_state(struct kvm_cpu_context *ctxt); 136 136 void __sysreg_restore_host_state(struct kvm_cpu_context *ctxt);
+1 -1
arch/arm64/include/asm/timex.h
··· 22 22 * Use the current timer as a cycle counter since this is what we use for 23 23 * the delay loop. 24 24 */ 25 - #define get_cycles() arch_counter_get_cntvct() 25 + #define get_cycles() arch_timer_read_counter() 26 26 27 27 #include <asm-generic/timex.h> 28 28
+7
arch/arm64/include/uapi/asm/kvm.h
··· 196 196 197 197 #define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64) 198 198 199 + /* Physical Timer EL0 Registers */ 200 + #define KVM_REG_ARM_PTIMER_CTL ARM64_SYS_REG(3, 3, 14, 2, 1) 201 + #define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2) 202 + #define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1) 203 + 204 + /* EL0 Virtual Timer Registers */ 199 205 #define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1) 200 206 #define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) 201 207 #define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2) ··· 234 228 #define KVM_DEV_ARM_ITS_SAVE_TABLES 1 235 229 #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 236 230 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 231 + #define KVM_DEV_ARM_ITS_CTRL_RESET 4 237 232 238 233 /* Device Control API on vcpu fd */ 239 234 #define KVM_ARM_VCPU_PMU_V3_CTRL 0
+3 -3
arch/arm64/kvm/hyp/switch.c
··· 304 304 __activate_vm(vcpu); 305 305 306 306 __vgic_restore_state(vcpu); 307 - __timer_restore_state(vcpu); 307 + __timer_enable_traps(vcpu); 308 308 309 309 /* 310 310 * We must restore the 32-bit state before the sysregs, thanks ··· 374 374 375 375 __sysreg_save_guest_state(guest_ctxt); 376 376 __sysreg32_save_state(vcpu); 377 - __timer_save_state(vcpu); 377 + __timer_disable_traps(vcpu); 378 378 __vgic_save_state(vcpu); 379 379 380 380 __deactivate_traps(vcpu); ··· 442 442 443 443 vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2); 444 444 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); 445 - __timer_save_state(vcpu); 445 + __timer_disable_traps(vcpu); 446 446 __deactivate_traps(vcpu); 447 447 __deactivate_vm(vcpu); 448 448 __sysreg_restore_host_state(host_ctxt);
+3 -85
arch/arm64/kvm/inject_fault.c
··· 33 33 #define LOWER_EL_AArch64_VECTOR 0x400 34 34 #define LOWER_EL_AArch32_VECTOR 0x600 35 35 36 - /* 37 - * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. 38 - */ 39 - static const u8 return_offsets[8][2] = { 40 - [0] = { 0, 0 }, /* Reset, unused */ 41 - [1] = { 4, 2 }, /* Undefined */ 42 - [2] = { 0, 0 }, /* SVC, unused */ 43 - [3] = { 4, 4 }, /* Prefetch abort */ 44 - [4] = { 8, 8 }, /* Data abort */ 45 - [5] = { 0, 0 }, /* HVC, unused */ 46 - [6] = { 4, 4 }, /* IRQ, unused */ 47 - [7] = { 4, 4 }, /* FIQ, unused */ 48 - }; 49 - 50 - static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) 51 - { 52 - unsigned long cpsr; 53 - unsigned long new_spsr_value = *vcpu_cpsr(vcpu); 54 - bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT); 55 - u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; 56 - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); 57 - 58 - cpsr = mode | COMPAT_PSR_I_BIT; 59 - 60 - if (sctlr & (1 << 30)) 61 - cpsr |= COMPAT_PSR_T_BIT; 62 - if (sctlr & (1 << 25)) 63 - cpsr |= COMPAT_PSR_E_BIT; 64 - 65 - *vcpu_cpsr(vcpu) = cpsr; 66 - 67 - /* Note: These now point to the banked copies */ 68 - *vcpu_spsr(vcpu) = new_spsr_value; 69 - *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; 70 - 71 - /* Branch to exception vector */ 72 - if (sctlr & (1 << 13)) 73 - vect_offset += 0xffff0000; 74 - else /* always have security exceptions */ 75 - vect_offset += vcpu_cp15(vcpu, c12_VBAR); 76 - 77 - *vcpu_pc(vcpu) = vect_offset; 78 - } 79 - 80 - static void inject_undef32(struct kvm_vcpu *vcpu) 81 - { 82 - prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4); 83 - } 84 - 85 - /* 86 - * Modelled after TakeDataAbortException() and TakePrefetchAbortException 87 - * pseudocode. 
88 - */ 89 - static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, 90 - unsigned long addr) 91 - { 92 - u32 vect_offset; 93 - u32 *far, *fsr; 94 - bool is_lpae; 95 - 96 - if (is_pabt) { 97 - vect_offset = 12; 98 - far = &vcpu_cp15(vcpu, c6_IFAR); 99 - fsr = &vcpu_cp15(vcpu, c5_IFSR); 100 - } else { /* !iabt */ 101 - vect_offset = 16; 102 - far = &vcpu_cp15(vcpu, c6_DFAR); 103 - fsr = &vcpu_cp15(vcpu, c5_DFSR); 104 - } 105 - 106 - prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset); 107 - 108 - *far = addr; 109 - 110 - /* Give the guest an IMPLEMENTATION DEFINED exception */ 111 - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); 112 - if (is_lpae) 113 - *fsr = 1 << 9 | 0x34; 114 - else 115 - *fsr = 0x14; 116 - } 117 - 118 36 enum exception_type { 119 37 except_type_sync = 0, 120 38 except_type_irq = 0x80, ··· 129 211 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) 130 212 { 131 213 if (!(vcpu->arch.hcr_el2 & HCR_RW)) 132 - inject_abt32(vcpu, false, addr); 214 + kvm_inject_dabt32(vcpu, addr); 133 215 else 134 216 inject_abt64(vcpu, false, addr); 135 217 } ··· 145 227 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) 146 228 { 147 229 if (!(vcpu->arch.hcr_el2 & HCR_RW)) 148 - inject_abt32(vcpu, true, addr); 230 + kvm_inject_pabt32(vcpu, addr); 149 231 else 150 232 inject_abt64(vcpu, true, addr); 151 233 } ··· 159 241 void kvm_inject_undefined(struct kvm_vcpu *vcpu) 160 242 { 161 243 if (!(vcpu->arch.hcr_el2 & HCR_RW)) 162 - inject_undef32(vcpu); 244 + kvm_inject_undef32(vcpu); 163 245 else 164 246 inject_undef64(vcpu); 165 247 }
+14 -27
arch/arm64/kvm/sys_regs.c
··· 842 842 struct sys_reg_params *p, 843 843 const struct sys_reg_desc *r) 844 844 { 845 - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 846 845 u64 now = kvm_phys_timer_read(); 846 + u64 cval; 847 847 848 - if (p->is_write) 849 - ptimer->cnt_cval = p->regval + now; 850 - else 851 - p->regval = ptimer->cnt_cval - now; 848 + if (p->is_write) { 849 + kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, 850 + p->regval + now); 851 + } else { 852 + cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); 853 + p->regval = cval - now; 854 + } 852 855 853 856 return true; 854 857 } ··· 860 857 struct sys_reg_params *p, 861 858 const struct sys_reg_desc *r) 862 859 { 863 - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 864 - 865 - if (p->is_write) { 866 - /* ISTATUS bit is read-only */ 867 - ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT; 868 - } else { 869 - u64 now = kvm_phys_timer_read(); 870 - 871 - p->regval = ptimer->cnt_ctl; 872 - /* 873 - * Set ISTATUS bit if it's expired. 874 - * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is 875 - * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit 876 - * regardless of ENABLE bit for our implementation convenience. 877 - */ 878 - if (ptimer->cnt_cval <= now) 879 - p->regval |= ARCH_TIMER_CTRL_IT_STAT; 880 - } 860 + if (p->is_write) 861 + kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval); 862 + else 863 + p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); 881 864 882 865 return true; 883 866 } ··· 872 883 struct sys_reg_params *p, 873 884 const struct sys_reg_desc *r) 874 885 { 875 - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 876 - 877 886 if (p->is_write) 878 - ptimer->cnt_cval = p->regval; 887 + kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval); 879 888 else 880 - p->regval = ptimer->cnt_cval; 889 + p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); 881 890 882 891 return true; 883 892 }
+2 -1
arch/powerpc/include/asm/kvm_book3s.h
··· 216 216 bool writing, bool *writable); 217 217 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, 218 218 unsigned long *rmap, long pte_index, int realmode); 219 - extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize); 219 + extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, 220 + unsigned long gfn, unsigned long psize); 220 221 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, 221 222 unsigned long pte_index); 222 223 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
+119 -21
arch/powerpc/include/asm/kvm_book3s_64.h
··· 20 20 #ifndef __ASM_KVM_BOOK3S_64_H__ 21 21 #define __ASM_KVM_BOOK3S_64_H__ 22 22 23 + #include <linux/string.h> 24 + #include <asm/bitops.h> 23 25 #include <asm/book3s/64/mmu-hash.h> 24 26 25 27 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */ ··· 109 107 hpte[0] = cpu_to_be64(hpte_v); 110 108 } 111 109 110 + /* 111 + * These functions encode knowledge of the POWER7/8/9 hardware 112 + * interpretations of the HPTE LP (large page size) field. 113 + */ 114 + static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) 115 + { 116 + unsigned int lphi; 117 + 118 + if (!(h & HPTE_V_LARGE)) 119 + return 12; /* 4kB */ 120 + lphi = (l >> 16) & 0xf; 121 + switch ((l >> 12) & 0xf) { 122 + case 0: 123 + return !lphi ? 24 : -1; /* 16MB */ 124 + break; 125 + case 1: 126 + return 16; /* 64kB */ 127 + break; 128 + case 3: 129 + return !lphi ? 34 : -1; /* 16GB */ 130 + break; 131 + case 7: 132 + return (16 << 8) + 12; /* 64kB in 4kB */ 133 + break; 134 + case 8: 135 + if (!lphi) 136 + return (24 << 8) + 16; /* 16MB in 64kkB */ 137 + if (lphi == 3) 138 + return (24 << 8) + 12; /* 16MB in 4kB */ 139 + break; 140 + } 141 + return -1; 142 + } 143 + 144 + static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l) 145 + { 146 + return kvmppc_hpte_page_shifts(h, l) & 0xff; 147 + } 148 + 149 + static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l) 150 + { 151 + int tmp = kvmppc_hpte_page_shifts(h, l); 152 + 153 + if (tmp >= 0x100) 154 + tmp >>= 8; 155 + return tmp; 156 + } 157 + 158 + static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r) 159 + { 160 + return 1ul << kvmppc_hpte_actual_page_shift(v, r); 161 + } 162 + 163 + static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift) 164 + { 165 + switch (base_shift) { 166 + case 12: 167 + switch (actual_shift) { 168 + case 12: 169 + return 0; 170 + case 16: 171 + return 7; 172 + case 24: 173 + return 0x38; 
174 + } 175 + break; 176 + case 16: 177 + switch (actual_shift) { 178 + case 16: 179 + return 1; 180 + case 24: 181 + return 8; 182 + } 183 + break; 184 + case 24: 185 + return 0; 186 + } 187 + return -1; 188 + } 189 + 112 190 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, 113 191 unsigned long pte_index) 114 192 { 115 - int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K; 116 - unsigned int penc; 193 + int a_pgshift, b_pgshift; 117 194 unsigned long rb = 0, va_low, sllp; 118 - unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1); 119 195 120 - if (v & HPTE_V_LARGE) { 121 - i = hpte_page_sizes[lp]; 122 - b_psize = i & 0xf; 123 - a_psize = i >> 4; 196 + b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r); 197 + if (a_pgshift >= 0x100) { 198 + b_pgshift &= 0xff; 199 + a_pgshift >>= 8; 124 200 } 125 201 126 202 /* ··· 232 152 va_low ^= v >> (SID_SHIFT_1T - 16); 233 153 va_low &= 0x7ff; 234 154 235 - switch (b_psize) { 236 - case MMU_PAGE_4K: 237 - sllp = get_sllp_encoding(a_psize); 238 - rb |= sllp << 5; /* AP field */ 155 + if (b_pgshift == 12) { 156 + if (a_pgshift > 12) { 157 + sllp = (a_pgshift == 16) ? 5 : 4; 158 + rb |= sllp << 5; /* AP field */ 159 + } 239 160 rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */ 240 - break; 241 - default: 242 - { 161 + } else { 243 162 int aval_shift; 244 163 /* 245 164 * remaining bits of AVA/LP fields 246 165 * Also contain the rr bits of LP 247 166 */ 248 - rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000; 167 + rb |= (va_low << b_pgshift) & 0x7ff000; 249 168 /* 250 169 * Now clear not needed LP bits based on actual psize 251 170 */ 252 - rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1); 171 + rb &= ~((1ul << a_pgshift) - 1); 253 172 /* 254 173 * AVAL field 58..77 - base_page_shift bits of va 255 174 * we have space for 58..64 bits, Missing bits should 256 175 * be zero filled. 
+1 is to take care of L bit shift 257 176 */ 258 - aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1; 177 + aval_shift = 64 - (77 - b_pgshift) + 1; 259 178 rb |= ((va_low << aval_shift) & 0xfe); 260 179 261 180 rb |= 1; /* L field */ 262 - penc = mmu_psize_defs[b_psize].penc[a_psize]; 263 - rb |= penc << 12; /* LP field */ 264 - break; 265 - } 181 + rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */ 266 182 } 267 183 rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */ 268 184 return rb; ··· 444 368 { 445 369 /* 128 (2**7) bytes in each HPTEG */ 446 370 return (1UL << (hpt->order - 7)) - 1; 371 + } 372 + 373 + /* Set bits in a dirty bitmap, which is in LE format */ 374 + static inline void set_dirty_bits(unsigned long *map, unsigned long i, 375 + unsigned long npages) 376 + { 377 + 378 + if (npages >= 8) 379 + memset((char *)map + i / 8, 0xff, npages / 8); 380 + else 381 + for (; npages; ++i, --npages) 382 + __set_bit_le(i, map); 383 + } 384 + 385 + static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i, 386 + unsigned long npages) 387 + { 388 + if (npages >= 8) 389 + memset((char *)map + i / 8, 0xff, npages / 8); 390 + else 391 + for (; npages; ++i, --npages) 392 + set_bit_le(i, map); 447 393 } 448 394 449 395 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+12 -1
arch/powerpc/include/asm/kvm_book3s_asm.h
··· 82 82 u8 do_nap; 83 83 u8 napped[MAX_SMT_THREADS]; 84 84 struct kvmppc_vcore *vc[MAX_SUBCORES]; 85 + /* Bits for changing lpcr on P9 */ 86 + unsigned long lpcr_req; 87 + unsigned long lpidr_req; 88 + unsigned long host_lpcr; 89 + u32 do_set; 90 + u32 do_restore; 91 + union { 92 + u32 allphases; 93 + u8 phase[4]; 94 + } lpcr_sync; 85 95 }; 86 96 87 97 /* ··· 117 107 u8 hwthread_req; 118 108 u8 hwthread_state; 119 109 u8 host_ipi; 120 - u8 ptid; 110 + u8 ptid; /* thread number within subcore when split */ 111 + u8 tid; /* thread number within whole core */ 121 112 struct kvm_vcpu *kvm_vcpu; 122 113 struct kvmppc_vcore *kvm_vcore; 123 114 void __iomem *xics_phys;
+2 -4
arch/powerpc/include/asm/kvm_host.h
··· 235 235 */ 236 236 #define KVMPPC_RMAP_LOCK_BIT 63 237 237 #define KVMPPC_RMAP_RC_SHIFT 32 238 - #define KVMPPC_RMAP_CHG_SHIFT 48 239 238 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) 240 - #define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT) 241 - #define KVMPPC_RMAP_CHG_ORDER (0x3ful << KVMPPC_RMAP_CHG_SHIFT) 242 239 #define KVMPPC_RMAP_PRESENT 0x100000000ul 243 240 #define KVMPPC_RMAP_INDEX 0xfffffffful 244 241 ··· 273 276 int tlbie_lock; 274 277 unsigned long lpcr; 275 278 unsigned long vrma_slb_v; 276 - int hpte_setup_done; 279 + int mmu_ready; 277 280 atomic_t vcpus_running; 278 281 u32 online_vcores; 279 282 atomic_t hpte_mod_interest; ··· 281 284 cpumask_t cpu_in_guest; 282 285 u8 radix; 283 286 u8 fwnmi_enabled; 287 + bool threads_indep; 284 288 pgd_t *pgtable; 285 289 u64 process_table; 286 290 struct dentry *debugfs_dir;
+3
arch/powerpc/include/asm/kvm_ppc.h
··· 168 168 extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); 169 169 extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); 170 170 extern void kvmppc_free_hpt(struct kvm_hpt_info *info); 171 + extern void kvmppc_rmap_reset(struct kvm *kvm); 171 172 extern long kvmppc_prepare_vrma(struct kvm *kvm, 172 173 struct kvm_userspace_memory_region *mem); 173 174 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, ··· 178 177 struct iommu_group *grp); 179 178 extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, 180 179 struct iommu_group *grp); 180 + extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm); 181 + extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm); 181 182 182 183 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, 183 184 struct kvm_create_spapr_tce_64 *args);
+3
arch/powerpc/kernel/asm-offsets.c
··· 642 642 HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); 643 643 HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); 644 644 HSTATE_FIELD(HSTATE_PTID, ptid); 645 + HSTATE_FIELD(HSTATE_TID, tid); 645 646 HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]); 646 647 HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]); 647 648 HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]); ··· 668 667 OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); 669 668 OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap); 670 669 OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped); 670 + OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set); 671 + OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore); 671 672 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 672 673 673 674 #ifdef CONFIG_PPC_BOOK3S_64
+60 -68
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 73 73 struct kvm_hpt_info hpt; 74 74 }; 75 75 76 - static void kvmppc_rmap_reset(struct kvm *kvm); 77 - 78 76 int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) 79 77 { 80 78 unsigned long hpt = 0; ··· 104 106 /* Allocate reverse map array */ 105 107 rev = vmalloc(sizeof(struct revmap_entry) * npte); 106 108 if (!rev) { 107 - pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); 108 109 if (cma) 109 110 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); 110 111 else ··· 134 137 long err = -EBUSY; 135 138 struct kvm_hpt_info info; 136 139 137 - if (kvm_is_radix(kvm)) 138 - return -EINVAL; 139 - 140 140 mutex_lock(&kvm->lock); 141 - if (kvm->arch.hpte_setup_done) { 142 - kvm->arch.hpte_setup_done = 0; 143 - /* order hpte_setup_done vs. vcpus_running */ 141 + if (kvm->arch.mmu_ready) { 142 + kvm->arch.mmu_ready = 0; 143 + /* order mmu_ready vs. vcpus_running */ 144 144 smp_mb(); 145 145 if (atomic_read(&kvm->arch.vcpus_running)) { 146 - kvm->arch.hpte_setup_done = 1; 146 + kvm->arch.mmu_ready = 1; 147 147 goto out; 148 148 } 149 149 } 150 + if (kvm_is_radix(kvm)) { 151 + err = kvmppc_switch_mmu_to_hpt(kvm); 152 + if (err) 153 + goto out; 154 + } 155 + 150 156 if (kvm->arch.hpt.order == order) { 151 157 /* We already have a suitable HPT */ 152 158 ··· 183 183 void kvmppc_free_hpt(struct kvm_hpt_info *info) 184 184 { 185 185 vfree(info->rev); 186 + info->rev = NULL; 186 187 if (info->cma) 187 188 kvm_free_hpt_cma(virt_to_page(info->virt), 188 189 1 << (info->order - PAGE_SHIFT)); ··· 335 334 { 336 335 unsigned long ra_mask; 337 336 338 - ra_mask = hpte_page_size(v, r) - 1; 337 + ra_mask = kvmppc_actual_pgsz(v, r) - 1; 339 338 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 340 339 } 341 340 ··· 350 349 __be64 *hptep; 351 350 int index; 352 351 int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); 352 + 353 + if (kvm_is_radix(vcpu->kvm)) 354 + return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); 353 355 354 356 /* Get SLB entry */ 355 357 if (virtmode) { ··· 509 505 mmio_update = atomic64_read(&kvm->arch.mmio_update); 510 506 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { 511 507 r = vcpu->arch.pgfault_cache->rpte; 512 - psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); 508 + psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0], 509 + r); 513 510 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 514 511 gfn_base = gpa_base >> PAGE_SHIFT; 515 512 gpa = gpa_base | (ea & (psize - 1)); ··· 539 534 return RESUME_GUEST; 540 535 541 536 /* Translate the logical address and get the page */ 542 - psize = hpte_page_size(hpte[0], r); 537 + psize = kvmppc_actual_pgsz(hpte[0], r); 543 538 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 544 539 gfn_base = gpa_base >> PAGE_SHIFT; 545 540 gpa = gpa_base | (ea & (psize - 1)); ··· 655 650 /* 656 651 * If the HPT is being resized, don't update the HPTE, 657 652 * instead let the guest retry after the resize operation is complete. 658 - * The synchronization for hpte_setup_done test vs. set is provided 653 + * The synchronization for mmu_ready test vs. set is provided 659 654 * by the HPTE lock. 
660 655 */ 661 - if (!kvm->arch.hpte_setup_done) 656 + if (!kvm->arch.mmu_ready) 662 657 goto out_unlock; 663 658 664 659 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || ··· 725 720 goto out_put; 726 721 } 727 722 728 - static void kvmppc_rmap_reset(struct kvm *kvm) 723 + void kvmppc_rmap_reset(struct kvm *kvm) 729 724 { 730 725 struct kvm_memslots *slots; 731 726 struct kvm_memory_slot *memslot; ··· 791 786 792 787 /* Must be called with both HPTE and rmap locked */ 793 788 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 789 + struct kvm_memory_slot *memslot, 794 790 unsigned long *rmapp, unsigned long gfn) 795 791 { 796 792 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); ··· 814 808 815 809 /* Now check and modify the HPTE */ 816 810 ptel = rev[i].guest_rpte; 817 - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); 811 + psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); 818 812 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 819 813 hpte_rpn(ptel, psize) == gfn) { 820 814 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); ··· 823 817 /* Harvest R and C */ 824 818 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 825 819 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 826 - if (rcbits & HPTE_R_C) 827 - kvmppc_update_rmap_change(rmapp, psize); 820 + if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) 821 + kvmppc_update_dirty_map(memslot, gfn, psize); 828 822 if (rcbits & ~rev[i].guest_rpte) { 829 823 rev[i].guest_rpte = ptel | rcbits; 830 824 note_hpte_modification(kvm, &rev[i]); ··· 862 856 continue; 863 857 } 864 858 865 - kvmppc_unmap_hpte(kvm, i, rmapp, gfn); 859 + kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); 866 860 unlock_rmap(rmapp); 867 861 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 868 862 } ··· 1045 1039 1046 1040 retry: 1047 1041 lock_rmap(rmapp); 1048 - if (*rmapp & KVMPPC_RMAP_CHANGED) { 1049 - long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER) 1050 - >> KVMPPC_RMAP_CHG_SHIFT; 1051 - *rmapp &= 
~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER); 1052 - npages_dirty = 1; 1053 - if (change_order > PAGE_SHIFT) 1054 - npages_dirty = 1ul << (change_order - PAGE_SHIFT); 1055 - } 1056 1042 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1057 1043 unlock_rmap(rmapp); 1058 1044 return npages_dirty; ··· 1100 1102 rev[i].guest_rpte |= HPTE_R_C; 1101 1103 note_hpte_modification(kvm, &rev[i]); 1102 1104 } 1103 - n = hpte_page_size(v, r); 1105 + n = kvmppc_actual_pgsz(v, r); 1104 1106 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1105 1107 if (n > npages_dirty) 1106 1108 npages_dirty = n; ··· 1136 1138 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1137 1139 struct kvm_memory_slot *memslot, unsigned long *map) 1138 1140 { 1139 - unsigned long i, j; 1141 + unsigned long i; 1140 1142 unsigned long *rmapp; 1141 1143 1142 1144 preempt_disable(); ··· 1148 1150 * since we always put huge-page HPTEs in the rmap chain 1149 1151 * corresponding to their page base address. 1150 1152 */ 1151 - if (npages && map) 1152 - for (j = i; npages; ++j, --npages) 1153 - __set_bit_le(j, map); 1153 + if (npages) 1154 + set_dirty_bits(map, i, npages); 1154 1155 ++rmapp; 1155 1156 } 1156 1157 preempt_enable(); ··· 1193 1196 struct page *page = virt_to_page(va); 1194 1197 struct kvm_memory_slot *memslot; 1195 1198 unsigned long gfn; 1196 - unsigned long *rmap; 1197 1199 int srcu_idx; 1198 1200 1199 1201 put_page(page); ··· 1200 1204 if (!dirty) 1201 1205 return; 1202 1206 1203 - /* We need to mark this page dirty in the rmap chain */ 1207 + /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ 1204 1208 gfn = gpa >> PAGE_SHIFT; 1205 1209 srcu_idx = srcu_read_lock(&kvm->srcu); 1206 1210 memslot = gfn_to_memslot(kvm, gfn); 1207 - if (memslot) { 1208 - if (!kvm_is_radix(kvm)) { 1209 - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1210 - lock_rmap(rmap); 1211 - *rmap |= KVMPPC_RMAP_CHANGED; 1212 - unlock_rmap(rmap); 1213 - } else if (memslot->dirty_bitmap) { 1214 - mark_page_dirty(kvm, 
gfn); 1215 - } 1216 - } 1211 + if (memslot && memslot->dirty_bitmap) 1212 + set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); 1217 1213 srcu_read_unlock(&kvm->srcu, srcu_idx); 1218 1214 } 1219 1215 ··· 1265 1277 guest_rpte = rev->guest_rpte; 1266 1278 1267 1279 ret = -EIO; 1268 - apsize = hpte_page_size(vpte, guest_rpte); 1280 + apsize = kvmppc_actual_pgsz(vpte, guest_rpte); 1269 1281 if (!apsize) 1270 1282 goto out; 1271 1283 ··· 1280 1292 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1281 1293 1282 1294 lock_rmap(rmapp); 1283 - kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); 1295 + kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); 1284 1296 unlock_rmap(rmapp); 1285 1297 } 1286 1298 ··· 1453 1465 struct kvm_resize_hpt *resize; 1454 1466 int ret; 1455 1467 1456 - if (flags != 0) 1468 + if (flags != 0 || kvm_is_radix(kvm)) 1457 1469 return -EINVAL; 1458 1470 1459 1471 if (shift && ((shift < 18) || (shift > 46))) ··· 1519 1531 struct kvm_resize_hpt *resize; 1520 1532 long ret; 1521 1533 1522 - if (flags != 0) 1534 + if (flags != 0 || kvm_is_radix(kvm)) 1523 1535 return -EINVAL; 1524 1536 1525 1537 if (shift && ((shift < 18) || (shift > 46))) ··· 1531 1543 1532 1544 /* This shouldn't be possible */ 1533 1545 ret = -EIO; 1534 - if (WARN_ON(!kvm->arch.hpte_setup_done)) 1546 + if (WARN_ON(!kvm->arch.mmu_ready)) 1535 1547 goto out_no_hpt; 1536 1548 1537 1549 /* Stop VCPUs from running while we mess with the HPT */ 1538 - kvm->arch.hpte_setup_done = 0; 1550 + kvm->arch.mmu_ready = 0; 1539 1551 smp_mb(); 1540 1552 1541 1553 /* Boot all CPUs out of the guest so they re-read 1542 - * hpte_setup_done */ 1554 + * mmu_ready */ 1543 1555 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1544 1556 1545 1557 ret = -ENXIO; ··· 1562 1574 1563 1575 out: 1564 1576 /* Let VCPUs run again */ 1565 - kvm->arch.hpte_setup_done = 1; 1577 + kvm->arch.mmu_ready = 1; 1566 1578 smp_mb(); 1567 1579 out_no_hpt: 1568 1580 resize_hpt_release(kvm, resize); ··· 1705 1717 1706 1718 if 
(!access_ok(VERIFY_WRITE, buf, count)) 1707 1719 return -EFAULT; 1720 + if (kvm_is_radix(kvm)) 1721 + return 0; 1708 1722 1709 1723 first_pass = ctx->first_pass; 1710 1724 flags = ctx->flags; ··· 1800 1810 unsigned long tmp[2]; 1801 1811 ssize_t nb; 1802 1812 long int err, ret; 1803 - int hpte_setup; 1813 + int mmu_ready; 1804 1814 1805 1815 if (!access_ok(VERIFY_READ, buf, count)) 1806 1816 return -EFAULT; 1817 + if (kvm_is_radix(kvm)) 1818 + return -EINVAL; 1807 1819 1808 1820 /* lock out vcpus from running while we're doing this */ 1809 1821 mutex_lock(&kvm->lock); 1810 - hpte_setup = kvm->arch.hpte_setup_done; 1811 - if (hpte_setup) { 1812 - kvm->arch.hpte_setup_done = 0; /* temporarily */ 1813 - /* order hpte_setup_done vs. vcpus_running */ 1822 + mmu_ready = kvm->arch.mmu_ready; 1823 + if (mmu_ready) { 1824 + kvm->arch.mmu_ready = 0; /* temporarily */ 1825 + /* order mmu_ready vs. vcpus_running */ 1814 1826 smp_mb(); 1815 1827 if (atomic_read(&kvm->arch.vcpus_running)) { 1816 - kvm->arch.hpte_setup_done = 1; 1828 + kvm->arch.mmu_ready = 1; 1817 1829 mutex_unlock(&kvm->lock); 1818 1830 return -EBUSY; 1819 1831 } ··· 1868 1876 "r=%lx\n", ret, i, v, r); 1869 1877 goto out; 1870 1878 } 1871 - if (!hpte_setup && is_vrma_hpte(v)) { 1879 + if (!mmu_ready && is_vrma_hpte(v)) { 1872 1880 unsigned long psize = hpte_base_page_size(v, r); 1873 1881 unsigned long senc = slb_pgsize_encoding(psize); 1874 1882 unsigned long lpcr; ··· 1877 1885 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1878 1886 lpcr = senc << (LPCR_VRMASD_SH - 4); 1879 1887 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 1880 - hpte_setup = 1; 1888 + mmu_ready = 1; 1881 1889 } 1882 1890 ++i; 1883 1891 hptp += 2; ··· 1893 1901 } 1894 1902 1895 1903 out: 1896 - /* Order HPTE updates vs. hpte_setup_done */ 1904 + /* Order HPTE updates vs. 
mmu_ready */ 1897 1905 smp_wmb(); 1898 - kvm->arch.hpte_setup_done = hpte_setup; 1906 + kvm->arch.mmu_ready = mmu_ready; 1899 1907 mutex_unlock(&kvm->lock); 1900 1908 1901 1909 if (err) ··· 2004 2012 struct kvm *kvm; 2005 2013 __be64 *hptp; 2006 2014 2015 + kvm = p->kvm; 2016 + if (kvm_is_radix(kvm)) 2017 + return 0; 2018 + 2007 2019 ret = mutex_lock_interruptible(&p->mutex); 2008 2020 if (ret) 2009 2021 return ret; ··· 2030 2034 } 2031 2035 } 2032 2036 2033 - kvm = p->kvm; 2034 2037 i = p->hpt_index; 2035 2038 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2036 2039 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); ··· 2104 2109 2105 2110 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ 2106 2111 2107 - if (kvm_is_radix(vcpu->kvm)) 2108 - mmu->xlate = kvmppc_mmu_radix_xlate; 2109 - else 2110 - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; 2112 + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; 2111 2113 mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; 2112 2114 2113 2115 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+10 -41
arch/powerpc/kvm/book3s_64_mmu_radix.c
··· 474 474 return ret; 475 475 } 476 476 477 - static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, 478 - unsigned long gfn, unsigned int order) 479 - { 480 - unsigned long i, limit; 481 - unsigned long *dp; 482 - 483 - if (!memslot->dirty_bitmap) 484 - return; 485 - limit = 1ul << order; 486 - if (limit < BITS_PER_LONG) { 487 - for (i = 0; i < limit; ++i) 488 - mark_page_dirty(kvm, gfn + i); 489 - return; 490 - } 491 - dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); 492 - limit /= BITS_PER_LONG; 493 - for (i = 0; i < limit; ++i) 494 - *dp++ = ~0ul; 495 - } 496 - 497 477 /* Called with kvm->lock held */ 498 478 int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 499 479 unsigned long gfn) ··· 488 508 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, 489 509 gpa, shift); 490 510 kvmppc_radix_tlbie_page(kvm, gpa, shift); 491 - if (old & _PAGE_DIRTY) { 492 - if (!shift) 493 - mark_page_dirty(kvm, gfn); 494 - else 495 - mark_pages_dirty(kvm, memslot, 496 - gfn, shift - PAGE_SHIFT); 511 + if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { 512 + unsigned long npages = 1; 513 + if (shift) 514 + npages = 1ul << (shift - PAGE_SHIFT); 515 + kvmppc_update_dirty_map(memslot, gfn, npages); 497 516 } 498 517 } 499 518 return 0; ··· 558 579 struct kvm_memory_slot *memslot, unsigned long *map) 559 580 { 560 581 unsigned long i, j; 561 - unsigned long n, *p; 562 582 int npages; 563 - 564 - /* 565 - * Radix accumulates dirty bits in the first half of the 566 - * memslot's dirty_bitmap area, for when pages are paged 567 - * out or modified by the host directly. Pick up these 568 - * bits and add them to the map. 
569 - */ 570 - n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); 571 - p = memslot->dirty_bitmap; 572 - for (i = 0; i < n; ++i) 573 - map[i] |= xchg(&p[i], 0); 574 583 575 584 for (i = 0; i < memslot->npages; i = j) { 576 585 npages = kvm_radix_test_clear_dirty(kvm, memslot, i); ··· 571 604 * real address, if npages > 1 we can skip to i + npages. 572 605 */ 573 606 j = i + 1; 574 - if (npages) 575 - for (j = i; npages; ++j, --npages) 576 - __set_bit_le(j, map); 607 + if (npages) { 608 + set_dirty_bits(map, i, npages); 609 + i = j + npages; 610 + } 577 611 } 578 612 return 0; 579 613 } ··· 662 694 pgd_clear(pgd); 663 695 } 664 696 pgd_free(kvm->mm, kvm->arch.pgtable); 697 + kvm->arch.pgtable = NULL; 665 698 } 666 699 667 700 static void pte_ctor(void *addr)
+1 -1
arch/powerpc/kvm/book3s_64_slb.S
···
 113 113 
 114 114 	/* Remove all SLB entries that are in use. */
 115 115 
 116     -	li	r0, r0
     116 +	li	r0, 0
 117 117 	slbmte	r0, r0
 118 118 	slbia
 119 119 
+224 -125
arch/powerpc/kvm/book3s_hv.c
··· 19 19 */ 20 20 21 21 #include <linux/kvm_host.h> 22 + #include <linux/kernel.h> 22 23 #include <linux/err.h> 23 24 #include <linux/slab.h> 24 25 #include <linux/preempt.h> ··· 99 98 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); 100 99 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); 101 100 101 + static bool indep_threads_mode = true; 102 + module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); 103 + MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); 104 + 102 105 #ifdef CONFIG_KVM_XICS 103 106 static struct kernel_param_ops module_param_ops = { 104 107 .set = param_set_int, ··· 120 115 121 116 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 122 117 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 118 + static void kvmppc_setup_partition_table(struct kvm *kvm); 123 119 124 120 static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, 125 121 int *ip) ··· 1740 1734 * MMU mode (radix or HPT), unfortunately, but since we only support 1741 1735 * HPT guests on a HPT host so far, that isn't an impediment yet. 
1742 1736 */ 1743 - static int threads_per_vcore(void) 1737 + static int threads_per_vcore(struct kvm *kvm) 1744 1738 { 1745 - if (cpu_has_feature(CPU_FTR_ARCH_300)) 1739 + if (kvm->arch.threads_indep) 1746 1740 return 1; 1747 1741 return threads_per_subcore; 1748 1742 } ··· 1780 1774 {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, 1781 1775 }; 1782 1776 1783 - #define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) 1777 + #define N_TIMINGS (ARRAY_SIZE(timings)) 1784 1778 1785 1779 struct debugfs_timings_state { 1786 1780 struct kvm_vcpu *vcpu; ··· 2234 2228 kvmppc_ipi_thread(cpu); 2235 2229 } 2236 2230 2237 - static void kvmppc_wait_for_nap(void) 2231 + static void kvmppc_wait_for_nap(int n_threads) 2238 2232 { 2239 2233 int cpu = smp_processor_id(); 2240 2234 int i, loops; 2241 - int n_threads = threads_per_vcore(); 2242 2235 2243 2236 if (n_threads <= 1) 2244 2237 return; ··· 2324 2319 2325 2320 vc->vcore_state = VCORE_PREEMPT; 2326 2321 vc->pcpu = smp_processor_id(); 2327 - if (vc->num_threads < threads_per_vcore()) { 2322 + if (vc->num_threads < threads_per_vcore(vc->kvm)) { 2328 2323 spin_lock(&lp->lock); 2329 2324 list_add_tail(&vc->preempt_list, &lp->list); 2330 2325 spin_unlock(&lp->lock); ··· 2362 2357 2363 2358 /* 2364 2359 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 2365 - * respectively in 2-way micro-threading (split-core) mode. 2360 + * respectively in 2-way micro-threading (split-core) mode on POWER8. 2366 2361 */ 2367 2362 static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; 2368 2363 ··· 2378 2373 2379 2374 static bool subcore_config_ok(int n_subcores, int n_threads) 2380 2375 { 2381 - /* Can only dynamically split if unsplit to begin with */ 2376 + /* 2377 + * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core 2378 + * mode, with one thread per subcore. 
2379 + */ 2380 + if (cpu_has_feature(CPU_FTR_ARCH_300)) 2381 + return n_subcores <= 4 && n_threads == 1; 2382 + 2383 + /* On POWER8, can only dynamically split if unsplit to begin with */ 2382 2384 if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) 2383 2385 return false; 2384 2386 if (n_subcores > MAX_SUBCORES) ··· 2414 2402 int sub; 2415 2403 2416 2404 if (!cpu_has_feature(CPU_FTR_ARCH_207S)) 2405 + return false; 2406 + 2407 + /* POWER9 currently requires all threads to be in the same MMU mode */ 2408 + if (cpu_has_feature(CPU_FTR_ARCH_300) && 2409 + kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) 2417 2410 return false; 2418 2411 2419 2412 if (n_threads < cip->max_subcore_threads) ··· 2649 2632 int target_threads; 2650 2633 int controlled_threads; 2651 2634 int trap; 2635 + bool is_power8; 2636 + bool hpt_on_radix; 2652 2637 2653 2638 /* 2654 2639 * Remove from the list any threads that have a signal pending ··· 2673 2654 * the number of threads per subcore, except on POWER9, 2674 2655 * where it's 1 because the threads are (mostly) independent. 2675 2656 */ 2676 - controlled_threads = threads_per_vcore(); 2657 + controlled_threads = threads_per_vcore(vc->kvm); 2677 2658 2678 2659 /* 2679 2660 * Make sure we are running on primary threads, and that secondary 2680 2661 * threads are offline. Also check if the number of threads in this 2681 2662 * guest are greater than the current system threads per guest. 2663 + * On POWER9, we need to be not in independent-threads mode if 2664 + * this is a HPT guest on a radix host. 
2682 2665 */ 2683 - if ((controlled_threads > 1) && 2684 - ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { 2666 + hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); 2667 + if (((controlled_threads > 1) && 2668 + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || 2669 + (hpt_on_radix && vc->kvm->arch.threads_indep)) { 2685 2670 for_each_runnable_thread(i, vcpu, vc) { 2686 2671 vcpu->arch.ret = -EBUSY; 2687 2672 kvmppc_remove_runnable(vc, vcpu); ··· 2722 2699 * Hard-disable interrupts, and check resched flag and signals. 2723 2700 * If we need to reschedule or deliver a signal, clean up 2724 2701 * and return without going into the guest(s). 2725 - * If the hpte_setup_done flag has been cleared, don't go into the 2702 + * If the mmu_ready flag has been cleared, don't go into the 2726 2703 * guest because that means a HPT resize operation is in progress. 2727 2704 */ 2728 2705 local_irq_disable(); 2729 2706 hard_irq_disable(); 2730 2707 if (lazy_irq_pending() || need_resched() || 2731 - recheck_signals(&core_info) || 2732 - (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) { 2708 + recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { 2733 2709 local_irq_enable(); 2734 2710 vc->vcore_state = VCORE_INACTIVE; 2735 2711 /* Unlock all except the primary vcore */ ··· 2750 2728 cmd_bit = stat_bit = 0; 2751 2729 split = core_info.n_subcores; 2752 2730 sip = NULL; 2753 - if (split > 1) { 2754 - /* threads_per_subcore must be MAX_SMT_THREADS (8) here */ 2755 - if (split == 2 && (dynamic_mt_modes & 2)) { 2756 - cmd_bit = HID0_POWER8_1TO2LPAR; 2757 - stat_bit = HID0_POWER8_2LPARMODE; 2758 - } else { 2759 - split = 4; 2760 - cmd_bit = HID0_POWER8_1TO4LPAR; 2761 - stat_bit = HID0_POWER8_4LPARMODE; 2762 - } 2763 - subcore_size = MAX_SMT_THREADS / split; 2731 + is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S) 2732 + && !cpu_has_feature(CPU_FTR_ARCH_300); 2733 + 2734 + if (split > 1 || hpt_on_radix) { 2764 2735 sip = 
&split_info; 2765 2736 memset(&split_info, 0, sizeof(split_info)); 2766 - split_info.rpr = mfspr(SPRN_RPR); 2767 - split_info.pmmar = mfspr(SPRN_PMMAR); 2768 - split_info.ldbar = mfspr(SPRN_LDBAR); 2769 - split_info.subcore_size = subcore_size; 2770 2737 for (sub = 0; sub < core_info.n_subcores; ++sub) 2771 2738 split_info.vc[sub] = core_info.vc[sub]; 2739 + 2740 + if (is_power8) { 2741 + if (split == 2 && (dynamic_mt_modes & 2)) { 2742 + cmd_bit = HID0_POWER8_1TO2LPAR; 2743 + stat_bit = HID0_POWER8_2LPARMODE; 2744 + } else { 2745 + split = 4; 2746 + cmd_bit = HID0_POWER8_1TO4LPAR; 2747 + stat_bit = HID0_POWER8_4LPARMODE; 2748 + } 2749 + subcore_size = MAX_SMT_THREADS / split; 2750 + split_info.rpr = mfspr(SPRN_RPR); 2751 + split_info.pmmar = mfspr(SPRN_PMMAR); 2752 + split_info.ldbar = mfspr(SPRN_LDBAR); 2753 + split_info.subcore_size = subcore_size; 2754 + } else { 2755 + split_info.subcore_size = 1; 2756 + if (hpt_on_radix) { 2757 + /* Use the split_info for LPCR/LPIDR changes */ 2758 + split_info.lpcr_req = vc->lpcr; 2759 + split_info.lpidr_req = vc->kvm->arch.lpid; 2760 + split_info.host_lpcr = vc->kvm->arch.host_lpcr; 2761 + split_info.do_set = 1; 2762 + } 2763 + } 2764 + 2772 2765 /* order writes to split_info before kvm_split_mode pointer */ 2773 2766 smp_wmb(); 2774 2767 } 2775 - for (thr = 0; thr < controlled_threads; ++thr) 2776 - paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; 2777 2768 2778 - /* Initiate micro-threading (split-core) if required */ 2769 + for (thr = 0; thr < controlled_threads; ++thr) { 2770 + paca[pcpu + thr].kvm_hstate.tid = thr; 2771 + paca[pcpu + thr].kvm_hstate.napping = 0; 2772 + paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; 2773 + } 2774 + 2775 + /* Initiate micro-threading (split-core) on POWER8 if required */ 2779 2776 if (cmd_bit) { 2780 2777 unsigned long hid0 = mfspr(SPRN_HID0); 2781 2778 ··· 2813 2772 /* Start all the threads */ 2814 2773 active = 0; 2815 2774 for (sub = 0; sub < core_info.n_subcores; ++sub) { 2816 - 
thr = subcore_thread_map[sub]; 2775 + thr = is_power8 ? subcore_thread_map[sub] : sub; 2817 2776 thr0_done = false; 2818 2777 active |= 1 << thr; 2819 2778 pvc = core_info.vc[sub]; ··· 2840 2799 * the vcore pointer in the PACA of the secondaries. 2841 2800 */ 2842 2801 smp_mb(); 2843 - if (cmd_bit) 2844 - split_info.do_nap = 1; /* ask secondaries to nap when done */ 2845 2802 2846 2803 /* 2847 2804 * When doing micro-threading, poke the inactive threads as well. 2848 2805 * This gets them to the nap instruction after kvm_do_nap, 2849 2806 * which reduces the time taken to unsplit later. 2807 + * For POWER9 HPT guest on radix host, we need all the secondary 2808 + * threads woken up so they can do the LPCR/LPIDR change. 2850 2809 */ 2851 - if (split > 1) 2810 + if (cmd_bit || hpt_on_radix) { 2811 + split_info.do_nap = 1; /* ask secondaries to nap when done */ 2852 2812 for (thr = 1; thr < threads_per_subcore; ++thr) 2853 2813 if (!(active & (1 << thr))) 2854 2814 kvmppc_ipi_thread(pcpu + thr); 2815 + } 2855 2816 2856 2817 vc->vcore_state = VCORE_RUNNING; 2857 2818 preempt_disable(); ··· 2887 2844 vc->vcore_state = VCORE_EXITING; 2888 2845 2889 2846 /* wait for secondary threads to finish writing their state to memory */ 2890 - kvmppc_wait_for_nap(); 2847 + kvmppc_wait_for_nap(controlled_threads); 2891 2848 2892 2849 /* Return to whole-core mode if we split the core earlier */ 2893 - if (split > 1) { 2850 + if (cmd_bit) { 2894 2851 unsigned long hid0 = mfspr(SPRN_HID0); 2895 2852 unsigned long loops = 0; 2896 2853 ··· 2906 2863 cpu_relax(); 2907 2864 ++loops; 2908 2865 } 2909 - split_info.do_nap = 0; 2866 + } else if (hpt_on_radix) { 2867 + /* Wait for all threads to have seen final sync */ 2868 + for (thr = 1; thr < controlled_threads; ++thr) { 2869 + while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) { 2870 + HMT_low(); 2871 + barrier(); 2872 + } 2873 + HMT_medium(); 2874 + } 2910 2875 } 2876 + split_info.do_nap = 0; 2911 2877 2912 2878 kvmppc_set_host_core(pcpu); 
2913 2879 ··· 3125 3073 trace_kvmppc_vcore_wakeup(do_sleep, block_ns); 3126 3074 } 3127 3075 3076 + static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) 3077 + { 3078 + int r = 0; 3079 + struct kvm *kvm = vcpu->kvm; 3080 + 3081 + mutex_lock(&kvm->lock); 3082 + if (!kvm->arch.mmu_ready) { 3083 + if (!kvm_is_radix(kvm)) 3084 + r = kvmppc_hv_setup_htab_rma(vcpu); 3085 + if (!r) { 3086 + if (cpu_has_feature(CPU_FTR_ARCH_300)) 3087 + kvmppc_setup_partition_table(kvm); 3088 + kvm->arch.mmu_ready = 1; 3089 + } 3090 + } 3091 + mutex_unlock(&kvm->lock); 3092 + return r; 3093 + } 3094 + 3128 3095 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3129 3096 { 3130 3097 int n_ceded, i, r; ··· 3200 3129 3201 3130 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 3202 3131 !signal_pending(current)) { 3203 - /* See if the HPT and VRMA are ready to go */ 3204 - if (!kvm_is_radix(vcpu->kvm) && 3205 - !vcpu->kvm->arch.hpte_setup_done) { 3132 + /* See if the MMU is ready to go */ 3133 + if (!vcpu->kvm->arch.mmu_ready) { 3206 3134 spin_unlock(&vc->lock); 3207 - r = kvmppc_hv_setup_htab_rma(vcpu); 3135 + r = kvmhv_setup_mmu(vcpu); 3208 3136 spin_lock(&vc->lock); 3209 3137 if (r) { 3210 3138 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3211 - kvm_run->fail_entry.hardware_entry_failure_reason = 0; 3139 + kvm_run->fail_entry. 3140 + hardware_entry_failure_reason = 0; 3212 3141 vcpu->arch.ret = r; 3213 3142 break; 3214 3143 } ··· 3290 3219 unsigned long ebb_regs[3] = {}; /* shut up GCC */ 3291 3220 unsigned long user_tar = 0; 3292 3221 unsigned int user_vrsave; 3222 + struct kvm *kvm; 3293 3223 3294 3224 if (!vcpu->arch.sane) { 3295 3225 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ··· 3328 3256 return -EINTR; 3329 3257 } 3330 3258 3331 - atomic_inc(&vcpu->kvm->arch.vcpus_running); 3332 - /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ 3259 + kvm = vcpu->kvm; 3260 + atomic_inc(&kvm->arch.vcpus_running); 3261 + /* Order vcpus_running vs. 
 	 mmu_ready, see kvmppc_alloc_reset_hpt */
 	smp_mb();

 	flush_all_to_thread(current);
···
 		trace_kvm_hcall_exit(vcpu, r);
 		kvmppc_core_prepare_to_enter(vcpu);
 	} else if (r == RESUME_PAGE_FAULT) {
-		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		srcu_idx = srcu_read_lock(&kvm->srcu);
 		r = kvmppc_book3s_hv_page_fault(run, vcpu,
 			vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
-		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
 	} else if (r == RESUME_PASSTHROUGH) {
 		if (WARN_ON(xive_enabled()))
 			r = H_SUCCESS;
···
 	mtspr(SPRN_VRSAVE, user_vrsave);

 	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
-	atomic_dec(&vcpu->kvm->arch.vcpus_running);
+	atomic_dec(&kvm->arch.vcpus_running);
 	return r;
 }

 static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
-				     int linux_psize)
+				     int shift, int sllp)
 {
-	struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
-
-	if (!def->shift)
-		return;
-	(*sps)->page_shift = def->shift;
-	(*sps)->slb_enc = def->sllp;
-	(*sps)->enc[0].page_shift = def->shift;
-	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
+	(*sps)->page_shift = shift;
+	(*sps)->slb_enc = sllp;
+	(*sps)->enc[0].page_shift = shift;
+	(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
 	/*
-	 * Add 16MB MPSS support if host supports it
+	 * Add 16MB MPSS support (may get filtered out by userspace)
 	 */
-	if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
-		(*sps)->enc[1].page_shift = 24;
-		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+	if (shift != 24) {
+		int penc = kvmppc_pgsize_lp_encoding(shift, 24);
+		if (penc != -1) {
+			(*sps)->enc[1].page_shift = 24;
+			(*sps)->enc[1].pte_enc = penc;
+		}
 	}
 	(*sps)++;
 }
···
 	struct kvm_ppc_one_seg_page_size *sps;

 	/*
-	 * Since we don't yet support HPT guests on a radix host,
-	 * return an error if the host uses radix.
-	 */
-	if (radix_enabled())
-		return -EINVAL;
-
-	/*
 	 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
 	 * POWER7 doesn't support keys for instruction accesses,
 	 * POWER8 and POWER9 do.
···
 	info->data_keys = 32;
 	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;

-	info->flags = KVM_PPC_PAGE_SIZES_REAL;
-	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
-		info->flags |= KVM_PPC_1T_SEGMENTS;
-	info->slb_size = mmu_slb_size;
+	/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
+	info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
+	info->slb_size = 32;

 	/* We only support these sizes for now, and no muti-size segments */
 	sps = &info->sps[0];
-	kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
-	kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
-	kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+	kvmppc_add_seg_page_size(&sps, 12, 0);
+	kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
+	kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);

 	return 0;
 }
···
 	struct kvm_memory_slot *memslot;
 	int i, r;
 	unsigned long n;
-	unsigned long *buf;
+	unsigned long *buf, *p;
 	struct kvm_vcpu *vcpu;

 	mutex_lock(&kvm->slots_lock);
···
 		goto out;

 	/*
-	 * Use second half of bitmap area because radix accumulates
-	 * bits in the first half.
+	 * Use second half of bitmap area because both HPT and radix
+	 * accumulate bits in the first half.
 	 */
 	n = kvm_dirty_bitmap_bytes(memslot);
 	buf = memslot->dirty_bitmap + n / sizeof(long);
···
 		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
 	if (r)
 		goto out;
+
+	/*
+	 * We accumulate dirty bits in the first half of the
+	 * memslot's dirty_bitmap area, for when pages are paged
+	 * out or modified by the host directly. Pick up these
+	 * bits and add them to the map.
+	 */
+	p = memslot->dirty_bitmap;
+	for (i = 0; i < n / sizeof(long); ++i)
+		buf[i] |= xchg(&p[i], 0);

 	/* Harvest dirty bits from VPA and DTL updates */
 	/* Note: we never modify the SLB shadow buffer areas */
···
 static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
 					 unsigned long npages)
 {
-	/*
-	 * For now, if radix_enabled() then we only support radix guests,
-	 * and in that case we don't need the rmap array.
-	 */
-	if (radix_enabled()) {
-		slot->arch.rmap = NULL;
-		return 0;
-	}
-
 	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
 	if (!slot->arch.rmap)
 		return -ENOMEM;
···
 				const struct kvm_memory_slot *new)
 {
 	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;

 	/*
 	 * If we are making a new memslot, it might make
···
 	 */
 	if (npages)
 		atomic64_inc(&kvm->arch.mmio_update);
-
-	if (npages && old->npages && !kvm_is_radix(kvm)) {
-		/*
-		 * If modifying a memslot, reset all the rmap dirty bits.
-		 * If this is a new memslot, we don't need to do anything
-		 * since the rmap array starts out as all zeroes,
-		 * i.e. no pages are dirty.
-		 */
-		slots = kvm_memslots(kvm);
-		memslot = id_to_memslot(slots, mem->slot);
-		kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
-	}
 }

 /*
···
 	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
 }

+/*
+ * Set up HPT (hashed page table) and RMA (real-mode area).
+ * Must be called with kvm->lock held.
+ */
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
···
 	unsigned long lpcr = 0, senc;
 	unsigned long psize, porder;
 	int srcu_idx;
-
-	mutex_lock(&kvm->lock);
-	if (kvm->arch.hpte_setup_done)
-		goto out;	/* another vcpu beat us to it */

 	/* Allocate hashed page table (if not done already) and reset it */
 	if (!kvm->arch.hpt.virt) {
···
 		/* the -4 is to account for senc values starting at 0x10 */
 		lpcr = senc << (LPCR_VRMASD_SH - 4);
 		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-	} else {
-		kvmppc_setup_partition_table(kvm);
 	}

-	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
+	/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
 	smp_wmb();
-	kvm->arch.hpte_setup_done = 1;
 	err = 0;
 out_srcu:
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 out:
-	mutex_unlock(&kvm->lock);
 	return err;

 up_out:
 	up_read(&current->mm->mmap_sem);
 	goto out_srcu;
+}
+
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
+{
+	kvmppc_free_radix(kvm);
+	kvmppc_update_lpcr(kvm, LPCR_VPM1,
+			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+	kvmppc_rmap_reset(kvm);
+	kvm->arch.radix = 0;
+	kvm->arch.process_table = 0;
+	return 0;
+}
+
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
+{
+	int err;
+
+	err = kvmppc_init_vm_radix(kvm);
+	if (err)
+		return err;
+
+	kvmppc_free_hpt(&kvm->arch.hpt);
+	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
+			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+	kvm->arch.radix = 1;
+	return 0;
 }

 #ifdef CONFIG_KVM_XICS
···
 	}

 	/*
-	 * For now, if the host uses radix, the guest must be radix.
+	 * If the host uses radix, the guest starts out as radix.
 	 */
 	if (radix_enabled()) {
 		kvm->arch.radix = 1;
+		kvm->arch.mmu_ready = 1;
 		lpcr &= ~LPCR_VPM1;
 		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
 		ret = kvmppc_init_vm_radix(kvm);
···
 	 * Work out how many sets the TLB has, for the use of
 	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
 	 */
-	if (kvm_is_radix(kvm))
+	if (radix_enabled())
 		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
 	else if (cpu_has_feature(CPU_FTR_ARCH_300))
 		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
···
 	/*
 	 * Track that we now have a HV mode VM active. This blocks secondary
 	 * CPU threads from coming online.
-	 * On POWER9, we only need to do this for HPT guests on a radix
-	 * host, which is not yet supported.
+	 * On POWER9, we only need to do this if the "indep_threads_mode"
+	 * module parameter has been set to N.
 	 */
-	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.threads_indep = indep_threads_mode;
+	if (!kvm->arch.threads_indep)
 		kvm_hv_vm_activated();

 	/*
···
 {
 	debugfs_remove_recursive(kvm->arch.debugfs_dir);

-	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+	if (!kvm->arch.threads_indep)
 		kvm_hv_vm_deactivated();

 	kvmppc_free_vcores(kvm);
···
 {
 	unsigned long lpcr;
 	int radix;
+	int err;

 	/* If not on a POWER9, reject it */
 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
···
 	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
 		return -EINVAL;

-	/* We can't change a guest to/from radix yet */
-	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
-	if (radix != kvm_is_radix(kvm))
-		return -EINVAL;
-
 	/* GR (guest radix) bit in process_table field must match */
+	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
 	if (!!(cfg->process_table & PATB_GR) != radix)
 		return -EINVAL;
···
 	if ((cfg->process_table & PRTS_MASK) > 24)
 		return -EINVAL;

+	/* We can change a guest to/from radix now, if the host is radix */
+	if (radix && !radix_enabled())
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
+	if (radix != kvm_is_radix(kvm)) {
+		if (kvm->arch.mmu_ready) {
+			kvm->arch.mmu_ready = 0;
+			/* order mmu_ready vs. vcpus_running */
+			smp_mb();
+			if (atomic_read(&kvm->arch.vcpus_running)) {
+				kvm->arch.mmu_ready = 1;
+				err = -EBUSY;
+				goto out_unlock;
+			}
+		}
+		if (radix)
+			err = kvmppc_switch_mmu_to_radix(kvm);
+		else
+			err = kvmppc_switch_mmu_to_hpt(kvm);
+		if (err)
+			goto out_unlock;
+	}
+
 	kvm->arch.process_table = cfg->process_table;
 	kvmppc_setup_partition_table(kvm);

 	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
 	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
-	mutex_unlock(&kvm->lock);
+	err = 0;

-	return 0;
+ out_unlock:
+	mutex_unlock(&kvm->lock);
+	return err;
 }

 static struct kvmppc_ops kvm_ops_hv = {
···
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(KVM_MINOR);
 MODULE_ALIAS("devname:kvm");
-
+116 -1
arch/powerpc/kvm/book3s_hv_builtin.c
···
 	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
 	int ptid = local_paca->kvm_hstate.ptid;
 	struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
-	int me, ee, i;
+	int me, ee, i, t;
+	int cpu0;

 	/* Set our bit in the threads-exiting-guest map in the 0xff00
 	   bits of vcore->entry_exit_map */
···
 			 ee | VCORE_EXIT_REQ) != ee);
 		if ((ee >> 8) == 0)
 			kvmhv_interrupt_vcore(vc, ee);
+	}
+
+	/*
+	 * On POWER9 when running a HPT guest on a radix host (sip != NULL),
+	 * we have to interrupt inactive CPU threads to get them to
+	 * restore the host LPCR value.
+	 */
+	if (sip->lpcr_req) {
+		if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
+			vc = local_paca->kvm_hstate.kvm_vcore;
+			cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
+			for (t = 1; t < threads_per_core; ++t) {
+				if (sip->napped[t])
+					kvmhv_rm_send_ipi(cpu0 + t);
+			}
+		}
 	}
 }
···

 unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_xirr(vcpu);
···

 unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	vcpu->arch.gpr[5] = get_tb();
 	if (xive_enabled()) {
 		if (is_rm())
···

 unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_ipoll(vcpu, server);
···
 int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 		    unsigned long mfrr)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_ipi(vcpu, server, mfrr);
···

 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_cppr(vcpu, cppr);
···

 int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_eoi(vcpu, xirr);
···
 	return xics_rm_h_eoi(vcpu, xirr);
 }
 #endif /* CONFIG_KVM_XICS */
+
+void kvmppc_bad_interrupt(struct pt_regs *regs)
+{
+	die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
+	panic("Bad KVM trap");
+}
+
+/*
+ * Functions used to switch LPCR HR and UPRT bits on all threads
+ * when entering and exiting HPT guests on a radix host.
+ */
+
+#define PHASE_REALMODE		1	/* in real mode */
+#define PHASE_SET_LPCR		2	/* have set LPCR */
+#define PHASE_OUT_OF_GUEST	4	/* have finished executing in guest */
+#define PHASE_RESET_LPCR	8	/* have reset LPCR to host value */
+
+#define ALL(p)		(((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
+
+static void wait_for_sync(struct kvm_split_mode *sip, int phase)
+{
+	int thr = local_paca->kvm_hstate.tid;
+
+	sip->lpcr_sync.phase[thr] |= phase;
+	phase = ALL(phase);
+	while ((sip->lpcr_sync.allphases & phase) != phase) {
+		HMT_low();
+		barrier();
+	}
+	HMT_medium();
+}
+
+void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
+{
+	unsigned long rb, set;
+
+	/* wait for every other thread to get to real mode */
+	wait_for_sync(sip, PHASE_REALMODE);
+
+	/* Set LPCR and LPIDR */
+	mtspr(SPRN_LPCR, sip->lpcr_req);
+	mtspr(SPRN_LPID, sip->lpidr_req);
+	isync();
+
+	/* Invalidate the TLB on thread 0 */
+	if (local_paca->kvm_hstate.tid == 0) {
+		sip->do_set = 0;
+		asm volatile("ptesync" : : : "memory");
+		for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
+			rb = TLBIEL_INVAL_SET_LPID +
+				(set << TLBIEL_INVAL_SET_SHIFT);
+			asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
+				     "r" (rb), "r" (0));
+		}
+		asm volatile("ptesync" : : : "memory");
+	}
+
+	/* indicate that we have done so and wait for others */
+	wait_for_sync(sip, PHASE_SET_LPCR);
+	/* order read of sip->lpcr_sync.allphases vs. sip->do_set */
+	smp_rmb();
+}
+
+/*
+ * Called when a thread that has been in the guest needs
+ * to reload the host LPCR value - but only on POWER9 when
+ * running a HPT guest on a radix host.
+ */
+void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
+{
+	/* we're out of the guest... */
+	wait_for_sync(sip, PHASE_OUT_OF_GUEST);
+
+	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_LPCR, sip->host_lpcr);
+	isync();
+
+	if (local_paca->kvm_hstate.tid == 0) {
+		sip->do_restore = 0;
+		smp_wmb();	/* order store of do_restore vs. phase */
+	}
+
+	wait_for_sync(sip, PHASE_RESET_LPCR);
+	smp_mb();
+	local_paca->kvm_hstate.kvm_split_mode = NULL;
+}
+40 -25
arch/powerpc/kvm/book3s_hv_rm_mmu.c
···
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);

-/* Update the changed page order field of an rmap entry */
-void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+/* Update the dirty bitmap of a memslot */
+void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+			     unsigned long gfn, unsigned long psize)
 {
-	unsigned long order;
+	unsigned long npages;

-	if (!psize)
+	if (!psize || !memslot->dirty_bitmap)
 		return;
-	order = ilog2(psize);
-	order <<= KVMPPC_RMAP_CHG_SHIFT;
-	if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
-		*rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+	npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;
+	gfn -= memslot->base_gfn;
+	set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages);
 }
-EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
+EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map);
+
+static void kvmppc_set_dirty_from_hpte(struct kvm *kvm,
+			unsigned long hpte_v, unsigned long hpte_gr)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+	unsigned long psize;
+
+	psize = kvmppc_actual_pgsz(hpte_v, hpte_gr);
+	gfn = hpte_rpn(hpte_gr, psize);
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (memslot && memslot->dirty_bitmap)
+		kvmppc_update_dirty_map(memslot, gfn, psize);
+}

 /* Returns a pointer to the revmap entry for the page mapped by a HPTE */
 static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
-				      unsigned long hpte_gr)
+				      unsigned long hpte_gr,
+				      struct kvm_memory_slot **memslotp,
+				      unsigned long *gfnp)
 {
 	struct kvm_memory_slot *memslot;
 	unsigned long *rmap;
 	unsigned long gfn;

-	gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+	gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
 	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (memslotp)
+		*memslotp = memslot;
+	if (gfnp)
+		*gfnp = gfn;
 	if (!memslot)
 		return NULL;
···
 	unsigned long ptel, head;
 	unsigned long *rmap;
 	unsigned long rcbits;
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;

 	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
 	ptel = rev->guest_rpte |= rcbits;
-	rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+	rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn);
 	if (!rmap)
 		return;
 	lock_rmap(rmap);
···
 	}
 	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 	if (rcbits & HPTE_R_C)
-		kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
+		kvmppc_update_dirty_map(memslot, gfn,
+					kvmppc_actual_pgsz(hpte_v, hpte_r));
 	unlock_rmap(rmap);
 }
···

 	if (kvm_is_radix(kvm))
 		return H_FUNCTION;
-	psize = hpte_page_size(pteh, ptel);
+	psize = kvmppc_actual_pgsz(pteh, ptel);
 	if (!psize)
 		return H_PARAMETER;
 	writing = hpte_is_writable(ptel);
···
 	gr |= r & (HPTE_R_R | HPTE_R_C);
 	if (r & HPTE_R_R) {
 		kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
-		rmap = revmap_for_hpte(kvm, v, gr);
+		rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL);
 		if (rmap) {
 			lock_rmap(rmap);
 			*rmap |= KVMPPC_RMAP_REFERENCED;
···
 	__be64 *hpte;
 	unsigned long v, r, gr;
 	struct revmap_entry *rev;
-	unsigned long *rmap;
 	long ret = H_NOT_FOUND;

 	if (kvm_is_radix(kvm))
···
 	r = be64_to_cpu(hpte[1]);
 	gr |= r & (HPTE_R_R | HPTE_R_C);
 	if (r & HPTE_R_C) {
-		unsigned long psize = hpte_page_size(v, r);
 		hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
 		eieio();
-		rmap = revmap_for_hpte(kvm, v, gr);
-		if (rmap) {
-			lock_rmap(rmap);
-			*rmap |= KVMPPC_RMAP_CHANGED;
-			kvmppc_update_rmap_change(rmap, psize);
-			unlock_rmap(rmap);
-		}
+		kvmppc_set_dirty_from_hpte(kvm, v, gr);
 	}
 	vcpu->arch.gpr[4] = gr;
···
 	 * Check the HPTE again, including base page size
 	 */
 	if ((v & valid) && (v & mask) == val &&
-	    hpte_base_page_size(v, r) == (1ul << pshift))
+	    kvmppc_hpte_base_page_shift(v, r) == pshift)
 		/* Return with the HPTE still locked */
 		return (hash << 3) + (i >> 1);
+192 -5
arch/powerpc/kvm/book3s_hv_rmhandlers.S
···
 #include <asm/tm.h>
 #include <asm/opal.h>
 #include <asm/xive-regs.h>
+#include <asm/thread_info.h>

 /* Sign-extend HDEC if not on POWER9 */
 #define EXTEND_HDEC(reg)			\
···
 	RFI

 kvmppc_call_hv_entry:
+BEGIN_FTR_SECTION
+	/* On P9, do LPCR setting, if necessary */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	46f
+	lwz	r4, KVM_SPLIT_DO_SET(r3)
+	cmpwi	r4, 0
+	beq	46f
+	bl	kvmhv_p9_set_lpcr
+	nop
+46:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
 	ld	r4, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_hv_entry
···
 	ld	r6, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r6, 0
 	beq	63f
+BEGIN_FTR_SECTION
 	ld	r0, KVM_SPLIT_RPR(r6)
 	mtspr	SPRN_RPR, r0
 	ld	r0, KVM_SPLIT_PMMAR(r6)
···
 	ld	r0, KVM_SPLIT_LDBAR(r6)
 	mtspr	SPRN_LDBAR, r0
 	isync
+FTR_SECTION_ELSE
+	/* On P9 we use the split_info for coordinating LPCR changes */
+	lwz	r4, KVM_SPLIT_DO_SET(r6)
+	cmpwi	r4, 0
+	beq	63f
+	mr	r3, r6
+	bl	kvmhv_p9_set_lpcr
+	nop
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 63:
 	/* Order load of vcpu after load of vcore */
 	lwsync
···
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r3, 0
 	beq	kvm_no_guest
+	lwz	r0, KVM_SPLIT_DO_SET(r3)
+	cmpwi	r0, 0
+	bne	kvmhv_do_set
+	lwz	r0, KVM_SPLIT_DO_RESTORE(r3)
+	cmpwi	r0, 0
+	bne	kvmhv_do_restore
 	lbz	r0, KVM_SPLIT_DO_NAP(r3)
 	cmpwi	r0, 0
 	beq	kvm_no_guest
···
 54:	li	r0, KVM_HWTHREAD_IN_KVM
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+	b	kvm_no_guest
+
+kvmhv_do_set:
+	/* Set LPCR, LPIDR etc. on P9 */
+	HMT_MEDIUM
+	bl	kvmhv_p9_set_lpcr
+	nop
+	b	kvm_no_guest
+
+kvmhv_do_restore:
+	HMT_MEDIUM
+	bl	kvmhv_p9_restore_lpcr
+	nop
 	b	kvm_no_guest

 /*
···
 	/* Set kvm_split_mode.napped[tid] = 1 */
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	li	r0, 1
-	lhz	r4, PACAPACAINDEX(r13)
-	clrldi	r4, r4, 61	/* micro-threading => P8 => 8 threads/core */
+	lbz	r4, HSTATE_TID(r13)
 	addi	r4, r4, KVM_SPLIT_NAPPED
 	stbx	r0, r3, r4
 	/* Check the do_nap flag again after setting napped[] */
···
 19:	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8

-16:	ld	r8,KVM_HOST_LPCR(r4)
+16:
+BEGIN_FTR_SECTION
+	/* On POWER9 with HPT-on-radix we need to wait for all other threads */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	47f
+	lwz	r8, KVM_SPLIT_DO_RESTORE(r3)
+	cmpwi	r8, 0
+	beq	47f
+	stw	r12, STACK_SLOT_TRAP(r1)
+	bl	kvmhv_p9_restore_lpcr
+	nop
+	lwz	r12, STACK_SLOT_TRAP(r1)
+	b	48f
+47:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
-
+48:
 	/* load host SLB entries */
 BEGIN_MMU_FTR_SECTION
 	b	0f
···
 /*
  * We come here if we get any exception or interrupt while we are
  * executing host real mode code while in guest MMU context.
- * For now just spin, but we should do something better.
+ * r12 is (CR << 32) | vector
+ * r13 points to our PACA
+ * r12 is saved in HSTATE_SCRATCH0(r13)
+ * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE
+ * r9 is saved in HSTATE_SCRATCH2(r13)
+ * r13 is saved in HSPRG1
+ * cfar is saved in HSTATE_CFAR(r13)
+ * ppr is saved in HSTATE_PPR(r13)
  */
 kvmppc_bad_host_intr:
+	/*
+	 * Switch to the emergency stack, but start half-way down in
+	 * case we were already on it.
+	 */
+	mr	r9, r1
+	std	r1, PACAR1(r13)
+	ld	r1, PACAEMERGSP(r13)
+	subi	r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE
+	std	r9, 0(r1)
+	std	r0, GPR0(r1)
+	std	r9, GPR1(r1)
+	std	r2, GPR2(r1)
+	SAVE_4GPRS(3, r1)
+	SAVE_2GPRS(7, r1)
+	srdi	r0, r12, 32
+	clrldi	r12, r12, 32
+	std	r0, _CCR(r1)
+	std	r12, _TRAP(r1)
+	andi.	r0, r12, 2
+	beq	1f
+	mfspr	r3, SPRN_HSRR0
+	mfspr	r4, SPRN_HSRR1
+	mfspr	r5, SPRN_HDAR
+	mfspr	r6, SPRN_HDSISR
+	b	2f
+1:	mfspr	r3, SPRN_SRR0
+	mfspr	r4, SPRN_SRR1
+	mfspr	r5, SPRN_DAR
+	mfspr	r6, SPRN_DSISR
+2:	std	r3, _NIP(r1)
+	std	r4, _MSR(r1)
+	std	r5, _DAR(r1)
+	std	r6, _DSISR(r1)
+	ld	r9, HSTATE_SCRATCH2(r13)
+	ld	r12, HSTATE_SCRATCH0(r13)
+	GET_SCRATCH0(r0)
+	SAVE_4GPRS(9, r1)
+	std	r0, GPR13(r1)
+	SAVE_NVGPRS(r1)
+	ld	r5, HSTATE_CFAR(r13)
+	std	r5, ORIG_GPR3(r1)
+	mflr	r3
+#ifdef CONFIG_RELOCATABLE
+	ld	r4, HSTATE_SCRATCH1(r13)
+#else
+	mfctr	r4
+#endif
+	mfxer	r5
+	lbz	r6, PACASOFTIRQEN(r13)
+	std	r3, _LINK(r1)
+	std	r4, _CTR(r1)
+	std	r5, _XER(r1)
+	std	r6, SOFTE(r1)
+	ld	r2, PACATOC(r13)
+	LOAD_REG_IMMEDIATE(3, 0x7265677368657265)
+	std	r3, STACK_FRAME_OVERHEAD-16(r1)
+
+	/*
+	 * On POWER9 do a minimal restore of the MMU and call C code,
+	 * which will print a message and panic.
+	 * XXX On POWER7 and POWER8, we just spin here since we don't
+	 * know what the other threads are doing (and we don't want to
+	 * coordinate with them) - but at least we now have register state
+	 * in memory that we might be able to look at from another CPU.
+	 */
+BEGIN_FTR_SECTION
 	b	.
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_KVM(r9)
+
+	li	r0, 0
+	mtspr	SPRN_AMR, r0
+	mtspr	SPRN_IAMR, r0
+	mtspr	SPRN_CIABR, r0
+	mtspr	SPRN_DAWRX, r0
+
+	/* Flush the ERAT on radix P9 DD1 guest exit */
+BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
+
+BEGIN_MMU_FTR_SECTION
+	b	4f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
+
+	slbmte	r0, r0
+	slbia
+	ptesync
+	ld	r8, PACA_SLBSHADOWPTR(r13)
+	.rept	SLB_NUM_BOLTED
+	li	r3, SLBSHADOW_SAVEAREA
+	LDX_BE	r5, r8, r3
+	addi	r3, r3, 8
+	LDX_BE	r6, r8, r3
+	andis.	r7, r5, SLB_ESID_V@h
+	beq	3f
+	slbmte	r6, r5
+3:	addi	r8, r8, 16
+	.endr
+
+4:	lwz	r7, KVM_HOST_LPID(r10)
+	mtspr	SPRN_LPID, r7
+	mtspr	SPRN_PID, r0
+	ld	r8, KVM_HOST_LPCR(r10)
+	mtspr	SPRN_LPCR, r8
+	isync
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	/*
+	 * Turn on the MMU and jump to C code
+	 */
+	bcl	20, 31, .+4
+5:	mflr	r3
+	addi	r3, r3, 9f - 5b
+	ld	r4, PACAKMSR(r13)
+	mtspr	SPRN_SRR0, r3
+	mtspr	SPRN_SRR1, r4
+	rfid
+9:	addi	r3, r1, STACK_FRAME_OVERHEAD
+	bl	kvmppc_bad_interrupt
+	b	9b

 /*
  * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken
+13 -3
arch/powerpc/kvm/book3s_pr.c
···
 	kvmppc_set_pvr_pr(vcpu, sregs->pvr);

 	vcpu3s->sdr1 = sregs->u.s.sdr1;
+#ifdef CONFIG_PPC_BOOK3S_64
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		/* Flush all SLB entries */
+		vcpu->arch.mmu.slbmte(vcpu, 0, 0);
+		vcpu->arch.mmu.slbia(vcpu);
+
 		for (i = 0; i < 64; i++) {
-			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
-					      sregs->u.s.ppc64.slb[i].slbe);
+			u64 rb = sregs->u.s.ppc64.slb[i].slbe;
+			u64 rs = sregs->u.s.ppc64.slb[i].slbv;
+
+			if (rb & SLB_ESID_V)
+				vcpu->arch.mmu.slbmte(vcpu, rs, rb);
 		}
-	} else {
+	} else
+#endif
+	{
 		for (i = 0; i < 16; i++) {
 			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
 		}
+2
arch/powerpc/kvm/book3s_pr_papr.c
···
 	case H_PROTECT:
 	case H_BULK_REMOVE:
 	case H_PUT_TCE:
+	case H_PUT_TCE_INDIRECT:
+	case H_STUFF_TCE:
 	case H_CEDE:
 	case H_LOGICAL_CI_LOAD:
 	case H_LOGICAL_CI_STORE:
+1 -1
arch/powerpc/kvm/e500_mmu_host.c
···
 	start = vma->vm_pgoff;
 	end = start +
-	      ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+	      vma_pages(vma);

 	pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
+1 -2
arch/powerpc/kvm/powerpc.c
···
 		r = !!(hv_enabled && radix_enabled());
 		break;
 	case KVM_CAP_PPC_MMU_HASH_V3:
-		r = !!(hv_enabled && !radix_enabled() &&
-		       cpu_has_feature(CPU_FTR_ARCH_300));
+		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
 		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
+21 -4
arch/s390/include/asm/kvm_host.h
···
 	__u8 dea_kw;
 };

+#define APCB0_MASK_SIZE 1
+struct kvm_s390_apcb0 {
+	__u64 apm[APCB0_MASK_SIZE];		/* 0x0000 */
+	__u64 aqm[APCB0_MASK_SIZE];		/* 0x0008 */
+	__u64 adm[APCB0_MASK_SIZE];		/* 0x0010 */
+	__u64 reserved18;			/* 0x0018 */
+};
+
+#define APCB1_MASK_SIZE 4
+struct kvm_s390_apcb1 {
+	__u64 apm[APCB1_MASK_SIZE];		/* 0x0000 */
+	__u64 aqm[APCB1_MASK_SIZE];		/* 0x0020 */
+	__u64 adm[APCB1_MASK_SIZE];		/* 0x0040 */
+	__u64 reserved60[4];			/* 0x0060 */
+};
+
 struct kvm_s390_crypto_cb {
-	__u8 reserved00[72];			/* 0x0000 */
-	__u8 dea_wrapping_key_mask[24];		/* 0x0048 */
-	__u8 aes_wrapping_key_mask[32];		/* 0x0060 */
-	__u8 reserved80[128];			/* 0x0080 */
+	struct kvm_s390_apcb0 apcb0;		/* 0x0000 */
+	__u8   reserved20[0x0048 - 0x0020];	/* 0x0020 */
+	__u8   dea_wrapping_key_mask[24];	/* 0x0048 */
+	__u8   aes_wrapping_key_mask[32];	/* 0x0060 */
+	struct kvm_s390_apcb1 apcb1;		/* 0x0080 */
 };

 /*
+22 -4
arch/s390/kvm/interrupt.c
···
 		vcpu->arch.local_int.pending_irqs;
 }

+static inline int isc_to_irq_type(unsigned long isc)
+{
+	return IRQ_PEND_IO_ISC_0 + isc;
+}
+
+static inline int irq_type_to_isc(unsigned long irq_type)
+{
+	return irq_type - IRQ_PEND_IO_ISC_0;
+}
+
 static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
 				  unsigned long active_mask)
 {
···
 	for (i = 0; i <= MAX_ISC; i++)
 		if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i)))
-			active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i));
+			active_mask &= ~(1UL << (isc_to_irq_type(i)));

 	return active_mask;
 }
···
 	fi = &vcpu->kvm->arch.float_int;

 	spin_lock(&fi->lock);
-	isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0];
+	isc_list = &fi->lists[irq_type_to_isc(irq_type)];
 	inti = list_first_entry_or_null(isc_list,
 					struct kvm_s390_interrupt_info,
 					list);
···
 	 * in kvm_vcpu_block without having the waitqueue set (polling)
 	 */
 	vcpu->valid_wakeup = true;
+	/*
+	 * This is mostly to document, that the read in swait_active could
+	 * be moved before other stores, leading to subtle races.
+	 * All current users do not store or use an atomic like update
+	 */
+	smp_mb__after_atomic();
 	if (swait_active(&vcpu->wq)) {
 		/*
 		 * The vcpu gave up the cpu voluntarily, mark it as a good
···
 		list_del_init(&iter->list);
 		fi->counters[FIRQ_CNTR_IO] -= 1;
 		if (list_empty(isc_list))
-			clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+			clear_bit(isc_to_irq_type(isc), &fi->pending_irqs);
 		spin_unlock(&fi->lock);
 		return iter;
 	}
···
 	isc = int_word_to_isc(inti->io.io_int_word);
 	list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
 	list_add_tail(&inti->list, list);
-	set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+	set_bit(isc_to_irq_type(isc), &fi->pending_irqs);
 	spin_unlock(&fi->lock);
 	return 0;
 }
···
 		return -EINVAL;
 	if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid)))
 		return -EFAULT;
+	if (!schid)
+		return -EINVAL;
 	kfree(kvm_s390_get_io_int(kvm, isc_mask, schid));
 	/*
 	 * If userspace is conforming to the architecture, we can have at most
+1
arch/s390/kvm/kvm-s390.c
···
 	case KVM_CAP_S390_USER_INSTR0:
 	case KVM_CAP_S390_CMMA_MIGRATION:
 	case KVM_CAP_S390_AIS:
+	case KVM_CAP_S390_AIS_MIGRATION:
 		r = 1;
 		break;
 	case KVM_CAP_S390_MEM_OP:
+18 -32
arch/s390/kvm/vsie.c
···
  *
  * Returns: - 0 on success
  *          - -EINVAL if the gpa is not valid guest storage
- *          - -ENOMEM if out of memory
  */
 static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
 {
 	struct page *page;
-	hva_t hva;
-	int rc;
 
-	hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
-	if (kvm_is_error_hva(hva))
+	page = gfn_to_page(kvm, gpa_to_gfn(gpa));
+	if (is_error_page(page))
 		return -EINVAL;
-	rc = get_user_pages_fast(hva, 1, 1, &page);
-	if (rc < 0)
-		return rc;
-	else if (rc != 1)
-		return -ENOMEM;
 	*hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
 	return 0;
 }
···
 /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
 static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
 {
-	struct page *page;
-
-	page = virt_to_page(hpa);
-	set_page_dirty_lock(page);
-	put_page(page);
+	kvm_release_pfn_dirty(hpa >> PAGE_SHIFT);
 	/* mark the page always as dirty for migration */
 	mark_page_dirty(kvm, gpa_to_gfn(gpa));
 }
···
 	rc = set_validity_icpt(scb_s, 0x003bU);
 	if (!rc) {
 		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
-		if (rc == -EINVAL)
+		if (rc)
 			rc = set_validity_icpt(scb_s, 0x0034U);
 	}
 	if (rc)
···
 		}
 		/* 256 bytes cannot cross page boundaries */
 		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
-		if (rc == -EINVAL)
+		if (rc) {
 			rc = set_validity_icpt(scb_s, 0x0080U);
-		if (rc)
 			goto unpin;
+		}
 		scb_s->itdba = hpa;
 	}
···
 		 * if this block gets bigger, we have to shadow it.
 		 */
 		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
-		if (rc == -EINVAL)
+		if (rc) {
 			rc = set_validity_icpt(scb_s, 0x1310U);
-		if (rc)
 			goto unpin;
+		}
 		scb_s->gvrd = hpa;
 	}
···
 		}
 		/* 64 bytes cannot cross page boundaries */
 		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
-		if (rc == -EINVAL)
+		if (rc) {
 			rc = set_validity_icpt(scb_s, 0x0043U);
-		/* Validity 0x0044 will be checked by SIE */
-		if (rc)
 			goto unpin;
+		}
+		/* Validity 0x0044 will be checked by SIE */
 		scb_s->riccbd = hpa;
 	}
 	if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
···
 		 * cross page boundaries
 		 */
 		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
-		if (rc == -EINVAL)
+		if (rc) {
 			rc = set_validity_icpt(scb_s, 0x10b0U);
-		if (rc)
 			goto unpin;
+		}
 		scb_s->sdnxo = hpa | sdnxc;
 	}
 	return 0;
···
 *
 * Returns: - 0 if the scb was pinned.
 *          - > 0 if control has to be given to guest 2
- *          - -ENOMEM if out of memory
 */
 static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
 		   gpa_t gpa)
···
 	int rc;
 
 	rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
-	if (rc == -EINVAL) {
+	if (rc) {
 		rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		if (!rc)
-			rc = 1;
+		WARN_ON_ONCE(rc);
+		return 1;
 	}
-	if (!rc)
-		vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
-	return rc;
+	vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+	return 0;
 }
 
 /*
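The new pin_guest_page() derives the host address by adding the guest physical address's sub-page offset to the pinned page's kernel address. A minimal sketch of that offset arithmetic (the constants and helper name are illustrative assumptions, not kernel API):

```c
#include <stdint.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Illustrative only: the low 12 bits of gpa survive into the host
 * address, mirroring *hpa = page_to_virt(page) + (gpa & ~PAGE_MASK). */
static uint64_t host_addr_for(uint64_t page_virt, uint64_t gpa)
{
	return page_virt + (gpa & ~PAGE_MASK);
}
```

This is why callers can rely on structures that do not cross page boundaries (the "256 bytes cannot cross page boundaries" checks above): one pinned page covers the whole structure.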
+2
arch/x86/include/asm/kvm_emulate.h
···
 
 	unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
 	void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+	int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+
 };
 
 typedef u32 __attribute__((vector_size(16))) sse128_t;
+8
arch/x86/include/asm/kvm_host.h
···
 	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+	int (*smi_allowed)(struct kvm_vcpu *vcpu);
+	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
···
 	return BAD_APICID;
 #endif
 }
+
+#define put_smstate(type, buf, offset, val) \
+	*(type *)((buf) + (offset) - 0x7e00) = val
 
 #endif /* _ASM_X86_KVM_HOST_X86 */
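The put_smstate() macro added above writes into the SMM state-save buffer, which covers the top of the SMRAM image: an architectural offset such as 0x7ed8 lands at buffer index 0x7ed8 - 0x7e00 = 0xd8. A hedged sketch of that indexing (the helper and buffer size are illustrative assumptions; only the macro body comes from the patch):

```c
#include <stdint.h>
#include <string.h>

/* Same body as the kernel macro: offsets are relative to 0x7e00. */
#define put_smstate(type, buf, offset, val) \
	*(type *)((buf) + (offset) - 0x7e00) = val

/* Illustrative round-trip: store the "SVM Guest" flag the way
 * svm_pre_enter_smm() does (offset FED8h) and read it back. */
static uint64_t roundtrip_smstate(void)
{
	char buf[512];
	uint64_t v;

	memset(buf, 0, sizeof(buf));
	put_smstate(uint64_t, buf, 0x7ed8, 1);
	memcpy(&v, buf + (0x7ed8 - 0x7e00), sizeof(v));
	return v;
}
```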
+2 -2
arch/x86/include/asm/vmx.h
···
 #define SECONDARY_EXEC_APIC_REGISTER_VIRT	0x00000100
 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY	0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
-#define SECONDARY_EXEC_RDRAND			0x00000800
+#define SECONDARY_EXEC_RDRAND_EXITING		0x00000800
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 #define SECONDARY_EXEC_ENABLE_VMFUNC		0x00002000
 #define SECONDARY_EXEC_SHADOW_VMCS		0x00004000
-#define SECONDARY_EXEC_RDSEED			0x00010000
+#define SECONDARY_EXEC_RDSEED_EXITING		0x00010000
 #define SECONDARY_EXEC_ENABLE_PML		0x00020000
 #define SECONDARY_EXEC_XSAVES			0x00100000
 #define SECONDARY_EXEC_TSC_SCALING		0x02000000
+9
arch/x86/kvm/emulate.c
···
 	ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
 
 	smbase = ctxt->ops->get_smbase(ctxt);
+
+	/*
+	 * Give pre_leave_smm() a chance to make ISA-specific changes to the
+	 * vCPU state (e.g. enter guest mode) before loading state from the SMM
+	 * state-save area.
+	 */
+	if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+		return X86EMUL_UNHANDLEABLE;
+
 	if (emulator_has_longmode(ctxt))
 		ret = rsm_load_state_64(ctxt, smbase + 0x8000);
 	else
+70 -21
arch/x86/kvm/lapic.c
···
 		   apic->divide_count);
 }
 
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+	/*
+	 * Do not allow the guest to program periodic timers with small
+	 * interval, since the hrtimers are not throttled by the host
+	 * scheduler.
+	 */
+	if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+		s64 min_period = min_timer_period_us * 1000LL;
+
+		if (apic->lapic_timer.period < min_period) {
+			pr_info_ratelimited(
+			    "kvm: vcpu %i: requested %lld ns "
+			    "lapic timer period limited to %lld ns\n",
+			    apic->vcpu->vcpu_id,
+			    apic->lapic_timer.period, min_period);
+			apic->lapic_timer.period = min_period;
+		}
+	}
+}
+
 static void apic_update_lvtt(struct kvm_lapic *apic)
 {
 	u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
 			apic->lapic_timer.timer_mode_mask;
 
 	if (apic->lapic_timer.timer_mode != timer_mode) {
+		if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+				APIC_LVT_TIMER_TSCDEADLINE)) {
+			hrtimer_cancel(&apic->lapic_timer.timer);
+			kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+			apic->lapic_timer.period = 0;
+			apic->lapic_timer.tscdeadline = 0;
+		}
 		apic->lapic_timer.timer_mode = timer_mode;
-		hrtimer_cancel(&apic->lapic_timer.timer);
+		limit_periodic_timer_frequency(apic);
 	}
 }
···
 			     HRTIMER_MODE_ABS_PINNED);
 }
 
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+	ktime_t now, remaining;
+	u64 ns_remaining_old, ns_remaining_new;
+
+	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+		* APIC_BUS_CYCLE_NS * apic->divide_count;
+	limit_periodic_timer_frequency(apic);
+
+	now = ktime_get();
+	remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+	if (ktime_to_ns(remaining) < 0)
+		remaining = 0;
+
+	ns_remaining_old = ktime_to_ns(remaining);
+	ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+					   apic->divide_count, old_divisor);
+
+	apic->lapic_timer.tscdeadline +=
+		nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+		nsec_to_cycles(apic->vcpu, ns_remaining_old);
+	apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
 static bool set_target_expiration(struct kvm_lapic *apic)
 {
 	ktime_t now;
···
 	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
 		* APIC_BUS_CYCLE_NS * apic->divide_count;
 
-	if (!apic->lapic_timer.period)
+	if (!apic->lapic_timer.period) {
+		apic->lapic_timer.tscdeadline = 0;
 		return false;
-
-	/*
-	 * Do not allow the guest to program periodic timers with small
-	 * interval, since the hrtimers are not throttled by the host
-	 * scheduler.
-	 */
-	if (apic_lvtt_period(apic)) {
-		s64 min_period = min_timer_period_us * 1000LL;
-
-		if (apic->lapic_timer.period < min_period) {
-			pr_info_ratelimited(
-			    "kvm: vcpu %i: requested %lld ns "
-			    "lapic timer period limited to %lld ns\n",
-			    apic->vcpu->vcpu_id,
-			    apic->lapic_timer.period, min_period);
-			apic->lapic_timer.period = min_period;
-		}
 	}
+
+	limit_periodic_timer_frequency(apic);
 
 	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
 		   PRIx64 ", "
···
 		return false;
 
 	if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+		return false;
+
+	if (!ktimer->tscdeadline)
 		return false;
 
 	r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
···
 		start_apic_timer(apic);
 		break;
 
-	case APIC_TDCR:
+	case APIC_TDCR: {
+		uint32_t old_divisor = apic->divide_count;
+
 		if (val & 4)
 			apic_debug("KVM_WRITE:TDCR %x\n", val);
 		kvm_lapic_set_reg(apic, APIC_TDCR, val);
 		update_divide_count(apic);
+		if (apic->divide_count != old_divisor &&
+				apic->lapic_timer.period) {
+			hrtimer_cancel(&apic->lapic_timer.timer);
+			update_target_expiration(apic, old_divisor);
+			restart_apic_timer(apic);
+		}
 		break;
-
+	}
 	case APIC_ESR:
 		if (apic_x2apic_mode(apic) && val != 0) {
 			apic_debug("KVM_WRITE:ESR not zero %x\n", val);
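The core of update_target_expiration() is the rescaling of the remaining time when the guest reprograms APIC_TDCR: the nanoseconds left are multiplied by new_divisor / old_divisor, with a wide intermediate as in mul_u64_u32_div(). A hedged sketch of that arithmetic (the function name is an illustrative assumption; only the formula comes from the patch):

```c
#include <stdint.h>

/* Mirrors ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
 * apic->divide_count, old_divisor): a larger divide count slows the
 * timer down, so more wall-clock time remains until expiration. */
static uint64_t rescale_remaining_ns(uint64_t ns_remaining_old,
				     uint32_t new_divisor,
				     uint32_t old_divisor)
{
	/* 128-bit intermediate avoids overflow of the multiplication */
	return (uint64_t)(((__uint128_t)ns_remaining_old * new_divisor) /
			  old_divisor);
}
```

For example, halving the divisor halves the remaining time, which is exactly why the hunk cancels the hrtimer, rescales, and then calls restart_apic_timer().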
+54 -61
arch/x86/kvm/mmu.c
···
 /* make pte_list_desc fit well in cache line */
 #define PTE_LIST_EXT 3
 
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+	RET_PF_RETRY = 0,
+	RET_PF_EMULATE = 1,
+	RET_PF_INVALID = 2,
+};
+
 struct pte_list_desc {
 	u64 *sptes[PTE_LIST_EXT];
 	struct pte_list_desc *more;
···
 
 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 {
-	return __shadow_walk_next(iterator, *iterator->sptep);
+	__shadow_walk_next(iterator, *iterator->sptep);
 }
 
 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
···
 	return ret;
 }
 
-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
-			 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
-			 bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+			int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+			bool speculative, bool host_writable)
 {
 	int was_rmapped = 0;
 	int rmap_count;
-	bool emulate = false;
+	int ret = RET_PF_RETRY;
 
 	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
 		 *sptep, write_fault, gfn);
···
 	if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
 	      true, host_writable)) {
 		if (write_fault)
-			emulate = true;
+			ret = RET_PF_EMULATE;
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 	}
 
 	if (unlikely(is_mmio_spte(*sptep)))
-		emulate = true;
+		ret = RET_PF_EMULATE;
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
···
 
 	kvm_release_pfn_clean(pfn);
 
-	return emulate;
+	return ret;
 }
 
 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
···
 	 * Do not cache the mmio info caused by writing the readonly gfn
 	 * into the spte otherwise read access on readonly gfn also can
 	 * caused mmio page fault and treat it as mmio access.
-	 * Return 1 to tell kvm to emulate it.
 	 */
 	if (pfn == KVM_PFN_ERR_RO_FAULT)
-		return 1;
+		return RET_PF_EMULATE;
 
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
 		kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
-		return 0;
+		return RET_PF_RETRY;
 	}
 
 	return -EFAULT;
···
 	}
 
 	if (fast_page_fault(vcpu, v, level, error_code))
-		return 0;
+		return RET_PF_RETRY;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
 	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
-		return 0;
+		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
 		return r;
···
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return 0;
+	return RET_PF_RETRY;
 }
 
···
 	return reserved;
 }
 
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- *			directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- *			fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
-	RET_MMIO_PF_EMULATE = 1,
-	RET_MMIO_PF_INVALID = 2,
-	RET_MMIO_PF_RETRY = 0,
-	RET_MMIO_PF_BUG = -1
-};
-
 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
 	u64 spte;
 	bool reserved;
 
 	if (mmio_info_in_cache(vcpu, addr, direct))
-		return RET_MMIO_PF_EMULATE;
+		return RET_PF_EMULATE;
 
 	reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
 	if (WARN_ON(reserved))
-		return RET_MMIO_PF_BUG;
+		return -EINVAL;
 
 	if (is_mmio_spte(spte)) {
 		gfn_t gfn = get_mmio_spte_gfn(spte);
 		unsigned access = get_mmio_spte_access(spte);
 
 		if (!check_mmio_spte(vcpu, spte))
-			return RET_MMIO_PF_INVALID;
+			return RET_PF_INVALID;
 
 		if (direct)
 			addr = 0;
 
 		trace_handle_mmio_page_fault(addr, gfn, access);
 		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-		return RET_MMIO_PF_EMULATE;
+		return RET_PF_EMULATE;
 	}
 
 	/*
 	 * If the page table is zapped by other cpus, let CPU fault again on
 	 * the address.
 	 */
-	return RET_MMIO_PF_RETRY;
+	return RET_PF_RETRY;
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
···
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
-		return 1;
+		return RET_PF_EMULATE;
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
···
 }
 
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
-		u64 fault_address, char *insn, int insn_len,
-		bool need_unprotect)
+		u64 fault_address, char *insn, int insn_len)
 {
 	int r = 1;
 
···
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
 
-		if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+		if (kvm_event_needs_reinjection(vcpu))
 			kvm_mmu_unprotect_page_virt(vcpu, fault_address);
 		r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
 				insn_len);
···
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
-		return 1;
+		return RET_PF_EMULATE;
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
···
 	}
 
 	if (fast_page_fault(vcpu, gpa, level, error_code))
-		return 0;
+		return RET_PF_RETRY;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
 	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
-		return 0;
+		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
 		return r;
···
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return 0;
+	return RET_PF_RETRY;
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
···
 		vcpu->arch.gpa_val = cr2;
 	}
 
+	r = RET_PF_INVALID;
 	if (unlikely(error_code & PFERR_RSVD_MASK)) {
 		r = handle_mmio_page_fault(vcpu, cr2, direct);
-		if (r == RET_MMIO_PF_EMULATE) {
+		if (r == RET_PF_EMULATE) {
 			emulation_type = 0;
 			goto emulate;
 		}
-		if (r == RET_MMIO_PF_RETRY)
-			return 1;
-		if (r < 0)
-			return r;
-		/* Must be RET_MMIO_PF_INVALID. */
 	}
 
-	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
-				      false);
+	if (r == RET_PF_INVALID) {
+		r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+					      false);
+		WARN_ON(r == RET_PF_INVALID);
+	}
+
+	if (r == RET_PF_RETRY)
+		return 1;
 	if (r < 0)
 		return r;
-	if (!r)
-		return 1;
 
 	/*
 	 * Before emulating the instruction, check if the error code
···
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	free_page((unsigned long)vcpu->arch.mmu.pae_root);
-	if (vcpu->arch.mmu.lm_root != NULL)
-		free_page((unsigned long)vcpu->arch.mmu.lm_root);
+	free_page((unsigned long)vcpu->arch.mmu.lm_root);
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
···
 
 static void mmu_destroy_caches(void)
 {
-	if (pte_list_desc_cache)
-		kmem_cache_destroy(pte_list_desc_cache);
-	if (mmu_page_header_cache)
-		kmem_cache_destroy(mmu_page_header_cache);
+	kmem_cache_destroy(pte_list_desc_cache);
+	kmem_cache_destroy(mmu_page_header_cache);
 }
 
 int kvm_mmu_module_init(void)
···
 
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
-					    0, 0, NULL);
+					    0, SLAB_ACCOUNT, NULL);
 	if (!pte_list_desc_cache)
 		goto nomem;
 
 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
 					    sizeof(struct kvm_mmu_page),
-					    0, 0, NULL);
+					    0, SLAB_ACCOUNT, NULL);
 	if (!mmu_page_header_cache)
 		goto nomem;
 
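After the RET_PF_* unification, kvm_mmu_page_fault() translates the MMU's return code into the familiar KVM convention: negative means error, 1 means resume the guest, and RET_PF_EMULATE falls through to the emulator. A hedged sketch of that dispatch (the helper name is an illustrative assumption; the mapping itself is from the hunk above):

```c
enum {
	RET_PF_RETRY = 0,   /* let the vCPU fault again on the address */
	RET_PF_EMULATE = 1, /* MMIO fault: emulate the instruction */
	RET_PF_INVALID = 2, /* stale MMIO spte: take the real fault path */
};

/* Illustrative only: mirrors the tail of kvm_mmu_page_fault(). */
static int resolve_page_fault_result(int r)
{
	if (r == RET_PF_RETRY)
		return 1;   /* resume the guest, re-execute the instruction */
	if (r < 0)
		return r;   /* real error (e.g. -EINVAL, -EFAULT), propagate */
	return 0;           /* RET_PF_EMULATE: continue into the emulator */
}
```

This removes the old RET_MMIO_PF_BUG value: a detected bug is now just a plain negative errno (-EINVAL), handled by the same `r < 0` branch as every other error.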
+1 -2
arch/x86/kvm/mmu.h
···
 		     bool accessed_dirty);
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
-		u64 fault_address, char *insn, int insn_len,
-		bool need_unprotect);
+		u64 fault_address, char *insn, int insn_len);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
+9 -9
arch/x86/kvm/paging_tmpl.h
···
 	struct kvm_mmu_page *sp = NULL;
 	struct kvm_shadow_walk_iterator it;
 	unsigned direct_access, access = gw->pt_access;
-	int top_level, emulate;
+	int top_level, ret;
 
 	direct_access = gw->pte_access;
 
···
 	}
 
 	clear_sp_write_flooding_count(it.sptep);
-	emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
-			       it.level, gw->gfn, pfn, prefault, map_writable);
+	ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+			   it.level, gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
-	return emulate;
+	return ret;
 
 out_gpte_changed:
 	kvm_release_pfn_clean(pfn);
-	return 0;
+	return RET_PF_RETRY;
 }
 
 /*
···
 		if (!prefault)
 			inject_page_fault(vcpu, &walker.fault);
 
-		return 0;
+		return RET_PF_RETRY;
 	}
 
 	if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
 		shadow_page_table_clear_flood(vcpu, addr);
-		return 1;
+		return RET_PF_EMULATE;
 	}
 
 	vcpu->arch.write_fault_to_shadow_pgtable = false;
···
 
 	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
 			 &map_writable))
-		return 0;
+		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
 		return r;
···
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return 0;
+	return RET_PF_RETRY;
 }
 
 static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+171 -70
arch/x86/kvm/svm.c
···
 	}
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 
-	if (!vcpu)
-		return 0;
-
 	/* Note:
 	 * At this point, the IOMMU should have already set the pending
 	 * bit in the vAPIC backing page. So, we just need to schedule
 	 * in the vcpu.
 	 */
-	if (vcpu->mode == OUTSIDE_GUEST_MODE)
+	if (vcpu)
 		kvm_vcpu_wake_up(vcpu);
 
 	return 0;
···
 
 	return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
 			svm->vmcb->control.insn_bytes,
-			svm->vmcb->control.insn_len, !npt_enabled);
+			svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct vcpu_svm *svm)
+{
+	u64 fault_address = svm->vmcb->control.exit_info_2;
+	u64 error_code = svm->vmcb->control.exit_info_1;
+
+	trace_kvm_page_fault(fault_address, error_code);
+	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+			svm->vmcb->control.insn_bytes,
+			svm->vmcb->control.insn_len);
 }
 
 static int db_interception(struct vcpu_svm *svm)
···
 	return true;
 }
 
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
+static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+				 struct vmcb *nested_vmcb, struct page *page)
 {
-	struct vmcb *nested_vmcb;
-	struct vmcb *hsave = svm->nested.hsave;
-	struct vmcb *vmcb = svm->vmcb;
-	struct page *page;
-	u64 vmcb_gpa;
-
-	vmcb_gpa = svm->vmcb->save.rax;
-
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
-	if (!nested_vmcb)
-		return false;
-
-	if (!nested_vmcb_checks(nested_vmcb)) {
-		nested_vmcb->control.exit_code = SVM_EXIT_ERR;
-		nested_vmcb->control.exit_code_hi = 0;
-		nested_vmcb->control.exit_info_1 = 0;
-		nested_vmcb->control.exit_info_2 = 0;
-
-		nested_svm_unmap(page);
-
-		return false;
-	}
-
-	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
-			       nested_vmcb->save.rip,
-			       nested_vmcb->control.int_ctl,
-			       nested_vmcb->control.event_inj,
-			       nested_vmcb->control.nested_ctl);
-
-	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
-				    nested_vmcb->control.intercept_cr >> 16,
-				    nested_vmcb->control.intercept_exceptions,
-				    nested_vmcb->control.intercept);
-
-	/* Clear internal status */
-	kvm_clear_exception_queue(&svm->vcpu);
-	kvm_clear_interrupt_queue(&svm->vcpu);
-
-	/*
-	 * Save the old vmcb, so we don't need to pick what we save, but can
-	 * restore everything when a VMEXIT occurs
-	 */
-	hsave->save.es = vmcb->save.es;
-	hsave->save.cs = vmcb->save.cs;
-	hsave->save.ss = vmcb->save.ss;
-	hsave->save.ds = vmcb->save.ds;
-	hsave->save.gdtr = vmcb->save.gdtr;
-	hsave->save.idtr = vmcb->save.idtr;
-	hsave->save.efer = svm->vcpu.arch.efer;
-	hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
-	hsave->save.cr4 = svm->vcpu.arch.cr4;
-	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-	hsave->save.rip = kvm_rip_read(&svm->vcpu);
-	hsave->save.rsp = vmcb->save.rsp;
-	hsave->save.rax = vmcb->save.rax;
-	if (npt_enabled)
-		hsave->save.cr3 = vmcb->save.cr3;
-	else
-		hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
-	copy_vmcb_control_area(hsave, vmcb);
-
 	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
 		svm->vcpu.arch.hflags |= HF_HIF_MASK;
 	else
···
 	enable_gif(svm);
 
 	mark_all_dirty(svm->vmcb);
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+	struct vmcb *nested_vmcb;
+	struct vmcb *hsave = svm->nested.hsave;
+	struct vmcb *vmcb = svm->vmcb;
+	struct page *page;
+	u64 vmcb_gpa;
+
+	vmcb_gpa = svm->vmcb->save.rax;
+
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+	if (!nested_vmcb)
+		return false;
+
+	if (!nested_vmcb_checks(nested_vmcb)) {
+		nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+		nested_vmcb->control.exit_code_hi = 0;
+		nested_vmcb->control.exit_info_1 = 0;
+		nested_vmcb->control.exit_info_2 = 0;
+
+		nested_svm_unmap(page);
+
+		return false;
+	}
+
+	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+			       nested_vmcb->save.rip,
+			       nested_vmcb->control.int_ctl,
+			       nested_vmcb->control.event_inj,
+			       nested_vmcb->control.nested_ctl);
+
+	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+				    nested_vmcb->control.intercept_cr >> 16,
+				    nested_vmcb->control.intercept_exceptions,
+				    nested_vmcb->control.intercept);
+
+	/* Clear internal status */
+	kvm_clear_exception_queue(&svm->vcpu);
+	kvm_clear_interrupt_queue(&svm->vcpu);
+
+	/*
+	 * Save the old vmcb, so we don't need to pick what we save, but can
+	 * restore everything when a VMEXIT occurs
+	 */
+	hsave->save.es = vmcb->save.es;
+	hsave->save.cs = vmcb->save.cs;
+	hsave->save.ss = vmcb->save.ss;
+	hsave->save.ds = vmcb->save.ds;
+	hsave->save.gdtr = vmcb->save.gdtr;
+	hsave->save.idtr = vmcb->save.idtr;
+	hsave->save.efer = svm->vcpu.arch.efer;
+	hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+	hsave->save.cr4 = svm->vcpu.arch.cr4;
+	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+	hsave->save.rip = kvm_rip_read(&svm->vcpu);
+	hsave->save.rsp = vmcb->save.rsp;
+	hsave->save.rax = vmcb->save.rax;
+	if (npt_enabled)
+		hsave->save.cr3 = vmcb->save.cr3;
+	else
+		hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+	copy_vmcb_control_area(hsave, vmcb);
+
+	enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
 
 	return true;
 }
···
 
 	/*
 	 * If VGIF is enabled, the STGI intercept is only added to
-	 * detect the opening of the NMI window; remove it now.
+	 * detect the opening of the SMI/NMI window; remove it now.
 	 */
 	if (vgif_enabled(svm))
 		clr_intercept(svm, INTERCEPT_STGI);
···
 	[SVM_EXIT_MONITOR] = monitor_interception,
 	[SVM_EXIT_MWAIT] = mwait_interception,
 	[SVM_EXIT_XSETBV] = xsetbv_interception,
-	[SVM_EXIT_NPF] = pf_interception,
+	[SVM_EXIT_NPF] = npf_interception,
 	[SVM_EXIT_RSM] = emulate_on_interception,
 	[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
 	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
···
 	vcpu->arch.mcg_cap &= 0x1ff;
 }
 
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/* Per APM Vol.2 15.22.2 "Response to SMI" */
+	if (!gif_set(svm))
+		return 0;
+
+	if (is_guest_mode(&svm->vcpu) &&
+	    svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+		/* TODO: Might need to set exit_info_1 and exit_info_2 here */
+		svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+		svm->nested.exit_required = true;
+		return 0;
+	}
+
+	return 1;
+}
+
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int ret;
+
+	if (is_guest_mode(vcpu)) {
+		/* FED8h - SVM Guest */
+		put_smstate(u64, smstate, 0x7ed8, 1);
+		/* FEE0h - SVM Guest VMCB Physical Address */
+		put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+
+		svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+		svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+		svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+		ret = nested_svm_vmexit(svm);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *nested_vmcb;
+	struct page *page;
+	struct {
+		u64 guest;
+		u64 vmcb;
+	} svm_state_save;
+	int ret;
+
+	ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
+				  sizeof(svm_state_save));
+	if (ret)
+		return ret;
+
+	if (svm_state_save.guest) {
+		vcpu->arch.hflags &= ~HF_SMM_MASK;
+		nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
+		if (nested_vmcb)
+			enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
+		else
+			ret = 1;
+		vcpu->arch.hflags |= HF_SMM_MASK;
+	}
+	return ret;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!gif_set(svm)) {
+		if (vgif_enabled(svm))
+			set_intercept(svm, INTERCEPT_STGI);
+		/* STGI will cause a vm exit */
+		return 1;
+	}
+	return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
···
 	.deliver_posted_interrupt = svm_deliver_avic_intr,
 	.update_pi_irte = svm_update_pi_irte,
 	.setup_mce = svm_setup_mce,
+
+	.smi_allowed = svm_smi_allowed,
+	.pre_enter_smm = svm_pre_enter_smm,
+	.pre_leave_smm = svm_pre_leave_smm,
+	.enable_smi_window = enable_smi_window,
 };
 
 static int __init svm_init(void)
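svm_pre_leave_smm() reads back, in one kvm_vcpu_read_guest() call at smbase + 0xfed8, the two u64 fields that svm_pre_enter_smm() stored via put_smstate() at FED8h and FEE0h. That single read is only correct because the anonymous struct's layout matches the 8-byte spacing of the two architectural offsets. A hedged sketch of that layout check (struct and helper names are illustrative; the offsets are from the hunks above):

```c
#include <stdint.h>
#include <stddef.h>

/* Mirrors the anonymous struct in svm_pre_leave_smm(). */
struct svm_state_save {
	uint64_t guest; /* FED8h - SVM Guest flag */
	uint64_t vmcb;  /* FEE0h - SVM Guest VMCB physical address */
};

/* Illustrative only: the gap between the two put_smstate() offsets must
 * equal the offset of .vmcb within the struct, or the single
 * kvm_vcpu_read_guest() would deserialize the fields incorrectly. */
static size_t svm_state_save_vmcb_offset(void)
{
	return offsetof(struct svm_state_save, vmcb);
}
```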
+131 -77
arch/x86/kvm/vmx.c
···
 	u64 nested_vmx_cr4_fixed1;
 	u64 nested_vmx_vmcs_enum;
 	u64 nested_vmx_vmfunc_controls;
+
+	/* SMM related state */
+	struct {
+		/* in VMX operation on SMM entry? */
+		bool vmxon;
+		/* in guest mode on SMM entry? */
+		bool guest_mode;
+	} smm;
 };
 
 #define POSTED_INTR_ON 0
···
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 static bool vmx_xsaves_supported(void);
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static int alloc_identity_pagetable(struct kvm *kvm);
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
···
 
 static inline void ept_sync_global(void)
 {
-	if (cpu_has_vmx_invept_global())
-		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+	__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
 }
 
 static inline void ept_sync_context(u64 eptp)
 {
-	if (enable_ept) {
-		if (cpu_has_vmx_invept_context())
-			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
-		else
-			ept_sync_global();
-	}
+	if (cpu_has_vmx_invept_context())
+		__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+	else
+		ept_sync_global();
 }
 
 static __always_inline void vmcs_check16(unsigned long field)
···
 				SECONDARY_EXEC_ENABLE_PML;
 			vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
 		}
-	} else
-		vmx->nested.nested_vmx_ept_caps = 0;
+	}
 
 	if (cpu_has_vmx_vmfunc()) {
 		vmx->nested.nested_vmx_secondary_ctls_high |=
···
 		 * Advertise EPTP switching unconditionally
 		 * since we emulate it
 		 */
-		vmx->nested.nested_vmx_vmfunc_controls =
-			VMX_VMFUNC_EPTP_SWITCHING;
+		if (enable_ept)
+			vmx->nested.nested_vmx_vmfunc_controls =
+				VMX_VMFUNC_EPTP_SWITCHING;
 	}
 
 	/*
···
 			SECONDARY_EXEC_ENABLE_VPID;
 		vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
 			VMX_VPID_EXTENT_SUPPORTED_MASK;
-	} else
-		vmx->nested.nested_vmx_vpid_caps = 0;
+	}
 
 	if (enable_unrestricted_guest)
 		vmx->nested.nested_vmx_secondary_ctls_high |=
···
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 	}
 	kvm_cpu_vmxon(phys_addr);
-	ept_sync_global();
+	if (enable_ept)
+		ept_sync_global();
 
 	return 0;
 }
···
 			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 			SECONDARY_EXEC_SHADOW_VMCS |
 			SECONDARY_EXEC_XSAVES |
-			SECONDARY_EXEC_RDSEED |
-			SECONDARY_EXEC_RDRAND |
+			SECONDARY_EXEC_RDSEED_EXITING |
+			SECONDARY_EXEC_RDRAND_EXITING |
 			SECONDARY_EXEC_ENABLE_PML |
 			SECONDARY_EXEC_TSC_SCALING |
 			SECONDARY_EXEC_ENABLE_VMFUNC;
···
 				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 
+	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+		   &vmx_capability.ept, &vmx_capability.vpid);
+
 	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
 		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
 		   enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 3688 3689 CPU_BASED_CR3_STORE_EXITING | 3689 3690 CPU_BASED_INVLPG_EXITING); 3690 - rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 3691 - vmx_capability.ept, vmx_capability.vpid); 3691 + } else if (vmx_capability.ept) { 3692 + vmx_capability.ept = 0; 3693 + pr_warn_once("EPT CAP should not exist if not support " 3694 + "1-setting enable EPT VM-execution control\n"); 3695 + } 3696 + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 3697 + vmx_capability.vpid) { 3698 + vmx_capability.vpid = 0; 3699 + pr_warn_once("VPID CAP should not exist if not support " 3700 + "1-setting enable VPID VM-execution control\n"); 3692 3701 } 3693 3702 3694 3703 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; ··· 4794 4781 kvm_pfn_t identity_map_pfn; 4795 4782 u32 tmp; 4796 4783 4797 - if (!enable_ept) 4798 - return 0; 4799 - 4800 4784 /* Protect kvm->arch.ept_identity_pagetable_done. */ 4801 4785 mutex_lock(&kvm->slots_lock); 4802 4786 4803 4787 if (likely(kvm->arch.ept_identity_pagetable_done)) 4804 4788 goto out2; 4805 4789 4790 + if (!kvm->arch.ept_identity_map_addr) 4791 + kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4806 4792 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 4807 4793 4808 - r = alloc_identity_pagetable(kvm); 4794 + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 4795 + kvm->arch.ept_identity_map_addr, PAGE_SIZE); 4809 4796 if (r < 0) 4810 4797 goto out2; 4811 4798 ··· 4874 4861 kvm->arch.apic_access_page_done = true; 4875 4862 out: 4876 4863 mutex_unlock(&kvm->slots_lock); 4877 - return r; 4878 - } 4879 - 4880 - static int alloc_identity_pagetable(struct kvm *kvm) 4881 - { 4882 - /* Called with kvm->slots_lock held. 
*/ 4883 - 4884 - int r = 0; 4885 - 4886 - BUG_ON(kvm->arch.ept_identity_pagetable_done); 4887 - 4888 - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 4889 - kvm->arch.ept_identity_map_addr, PAGE_SIZE); 4890 - 4891 4864 return r; 4892 4865 } 4893 4866 ··· 5281 5282 static bool vmx_rdrand_supported(void) 5282 5283 { 5283 5284 return vmcs_config.cpu_based_2nd_exec_ctrl & 5284 - SECONDARY_EXEC_RDRAND; 5285 + SECONDARY_EXEC_RDRAND_EXITING; 5285 5286 } 5286 5287 5287 5288 static bool vmx_rdseed_supported(void) 5288 5289 { 5289 5290 return vmcs_config.cpu_based_2nd_exec_ctrl & 5290 - SECONDARY_EXEC_RDSEED; 5291 + SECONDARY_EXEC_RDSEED_EXITING; 5291 5292 } 5292 5293 5293 5294 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) ··· 5381 5382 if (vmx_rdrand_supported()) { 5382 5383 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); 5383 5384 if (rdrand_enabled) 5384 - exec_control &= ~SECONDARY_EXEC_RDRAND; 5385 + exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; 5385 5386 5386 5387 if (nested) { 5387 5388 if (rdrand_enabled) 5388 5389 vmx->nested.nested_vmx_secondary_ctls_high |= 5389 - SECONDARY_EXEC_RDRAND; 5390 + SECONDARY_EXEC_RDRAND_EXITING; 5390 5391 else 5391 5392 vmx->nested.nested_vmx_secondary_ctls_high &= 5392 - ~SECONDARY_EXEC_RDRAND; 5393 + ~SECONDARY_EXEC_RDRAND_EXITING; 5393 5394 } 5394 5395 } 5395 5396 5396 5397 if (vmx_rdseed_supported()) { 5397 5398 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); 5398 5399 if (rdseed_enabled) 5399 - exec_control &= ~SECONDARY_EXEC_RDSEED; 5400 + exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; 5400 5401 5401 5402 if (nested) { 5402 5403 if (rdseed_enabled) 5403 5404 vmx->nested.nested_vmx_secondary_ctls_high |= 5404 - SECONDARY_EXEC_RDSEED; 5405 + SECONDARY_EXEC_RDSEED_EXITING; 5405 5406 else 5406 5407 vmx->nested.nested_vmx_secondary_ctls_high &= 5407 - ~SECONDARY_EXEC_RDSEED; 5408 + ~SECONDARY_EXEC_RDSEED_EXITING; 5408 5409 } 5409 5410 } 5410 5411 ··· 
5425 5426 /* 5426 5427 * Sets up the vmcs for emulated real mode. 5427 5428 */ 5428 - static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 5429 + static void vmx_vcpu_setup(struct vcpu_vmx *vmx) 5429 5430 { 5430 5431 #ifdef CONFIG_X86_64 5431 5432 unsigned long a; ··· 5538 5539 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 5539 5540 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 5540 5541 } 5541 - 5542 - return 0; 5543 5542 } 5544 5543 5545 5544 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) ··· 5601 5604 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 5602 5605 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 5603 5606 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 5607 + if (kvm_mpx_supported()) 5608 + vmcs_write64(GUEST_BNDCFGS, 0); 5604 5609 5605 5610 setup_msrs(vmx); 5606 5611 ··· 5911 5912 cr2 = vmcs_readl(EXIT_QUALIFICATION); 5912 5913 /* EPT won't cause page fault directly */ 5913 5914 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); 5914 - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, 5915 - true); 5915 + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5916 5916 } 5917 5917 5918 5918 ex_no = intr_info & INTR_INFO_VECTOR_MASK; ··· 6745 6747 6746 6748 if (!cpu_has_vmx_ept() || 6747 6749 !cpu_has_vmx_ept_4levels() || 6748 - !cpu_has_vmx_ept_mt_wb()) { 6750 + !cpu_has_vmx_ept_mt_wb() || 6751 + !cpu_has_vmx_invept_global()) 6749 6752 enable_ept = 0; 6750 - enable_unrestricted_guest = 0; 6751 - enable_ept_ad_bits = 0; 6752 - } 6753 6753 6754 6754 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 6755 6755 enable_ept_ad_bits = 0; 6756 6756 6757 - if (!cpu_has_vmx_unrestricted_guest()) 6757 + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 6758 6758 enable_unrestricted_guest = 0; 6759 6759 6760 6760 if (!cpu_has_vmx_flexpriority()) ··· 6772 6776 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 6773 6777 kvm_disable_largepages(); 6774 6778 6775 - if (!cpu_has_vmx_ple()) 6779 + if 
(!cpu_has_vmx_ple()) { 6776 6780 ple_gap = 0; 6781 + ple_window = 0; 6782 + ple_window_grow = 0; 6783 + ple_window_max = 0; 6784 + ple_window_shrink = 0; 6785 + } 6777 6786 6778 6787 if (!cpu_has_vmx_apicv()) { 6779 6788 enable_apicv = 0; ··· 8416 8415 case EXIT_REASON_RDPMC: 8417 8416 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 8418 8417 case EXIT_REASON_RDRAND: 8419 - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); 8418 + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 8420 8419 case EXIT_REASON_RDSEED: 8421 - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); 8420 + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 8422 8421 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 8423 8422 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 8424 8423 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: ··· 9476 9475 vmx->loaded_vmcs = vmcs; 9477 9476 vmx_vcpu_put(vcpu); 9478 9477 vmx_vcpu_load(vcpu, cpu); 9479 - vcpu->cpu = cpu; 9480 9478 put_cpu(); 9481 9479 } 9482 9480 ··· 9556 9556 cpu = get_cpu(); 9557 9557 vmx_vcpu_load(&vmx->vcpu, cpu); 9558 9558 vmx->vcpu.cpu = cpu; 9559 - err = vmx_vcpu_setup(vmx); 9559 + vmx_vcpu_setup(vmx); 9560 9560 vmx_vcpu_put(&vmx->vcpu); 9561 9561 put_cpu(); 9562 - if (err) 9563 - goto free_vmcs; 9564 9562 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { 9565 9563 err = alloc_apic_access_page(kvm); 9566 9564 if (err) ··· 9566 9568 } 9567 9569 9568 9570 if (enable_ept) { 9569 - if (!kvm->arch.ept_identity_map_addr) 9570 - kvm->arch.ept_identity_map_addr = 9571 - VMX_EPT_IDENTITY_PAGETABLE_ADDR; 9572 9571 err = init_rmode_identity_map(kvm); 9573 9572 if (err) 9574 9573 goto free_vmcs; ··· 11320 11325 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 11321 11326 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 11322 11327 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 11328 + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 11329 + vmcs_write32(GUEST_GDTR_LIMIT, 
0xFFFF); 11323 11330 11324 11331 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 11325 11332 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) ··· 11418 11421 leave_guest_mode(vcpu); 11419 11422 11420 11423 if (likely(!vmx->fail)) { 11421 - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 11422 - exit_qualification); 11424 + if (exit_reason == -1) 11425 + sync_vmcs12(vcpu, vmcs12); 11426 + else 11427 + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 11428 + exit_qualification); 11423 11429 11424 11430 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, 11425 11431 vmcs12->vm_exit_msr_store_count)) ··· 11486 11486 */ 11487 11487 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 11488 11488 11489 - if (enable_shadow_vmcs) 11489 + if (enable_shadow_vmcs && exit_reason != -1) 11490 11490 vmx->nested.sync_shadow_vmcs = true; 11491 11491 11492 11492 /* in case we halted in L2 */ ··· 11510 11510 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 11511 11511 } 11512 11512 11513 - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 11514 - vmcs12->exit_qualification, 11515 - vmcs12->idt_vectoring_info_field, 11516 - vmcs12->vm_exit_intr_info, 11517 - vmcs12->vm_exit_intr_error_code, 11518 - KVM_ISA_VMX); 11513 + if (exit_reason != -1) 11514 + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 11515 + vmcs12->exit_qualification, 11516 + vmcs12->idt_vectoring_info_field, 11517 + vmcs12->vm_exit_intr_info, 11518 + vmcs12->vm_exit_intr_error_code, 11519 + KVM_ISA_VMX); 11519 11520 11520 11521 load_vmcs12_host_state(vcpu, vmcs12); 11521 11522 ··· 11939 11938 ~FEATURE_CONTROL_LMCE; 11940 11939 } 11941 11940 11941 + static int vmx_smi_allowed(struct kvm_vcpu *vcpu) 11942 + { 11943 + /* we need a nested vmexit to enter SMM, postpone if run is pending */ 11944 + if (to_vmx(vcpu)->nested.nested_run_pending) 11945 + return 0; 11946 + return 1; 11947 + } 11948 + 11949 + static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char 
*smstate) 11950 + { 11951 + struct vcpu_vmx *vmx = to_vmx(vcpu); 11952 + 11953 + vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 11954 + if (vmx->nested.smm.guest_mode) 11955 + nested_vmx_vmexit(vcpu, -1, 0, 0); 11956 + 11957 + vmx->nested.smm.vmxon = vmx->nested.vmxon; 11958 + vmx->nested.vmxon = false; 11959 + return 0; 11960 + } 11961 + 11962 + static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) 11963 + { 11964 + struct vcpu_vmx *vmx = to_vmx(vcpu); 11965 + int ret; 11966 + 11967 + if (vmx->nested.smm.vmxon) { 11968 + vmx->nested.vmxon = true; 11969 + vmx->nested.smm.vmxon = false; 11970 + } 11971 + 11972 + if (vmx->nested.smm.guest_mode) { 11973 + vcpu->arch.hflags &= ~HF_SMM_MASK; 11974 + ret = enter_vmx_non_root_mode(vcpu, false); 11975 + vcpu->arch.hflags |= HF_SMM_MASK; 11976 + if (ret) 11977 + return ret; 11978 + 11979 + vmx->nested.smm.guest_mode = false; 11980 + } 11981 + return 0; 11982 + } 11983 + 11984 + static int enable_smi_window(struct kvm_vcpu *vcpu) 11985 + { 11986 + return 0; 11987 + } 11988 + 11942 11989 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { 11943 11990 .cpu_has_kvm_support = cpu_has_kvm_support, 11944 11991 .disabled_by_bios = vmx_disabled_by_bios, ··· 12112 12063 #endif 12113 12064 12114 12065 .setup_mce = vmx_setup_mce, 12066 + 12067 + .smi_allowed = vmx_smi_allowed, 12068 + .pre_enter_smm = vmx_pre_enter_smm, 12069 + .pre_leave_smm = vmx_pre_leave_smm, 12070 + .enable_smi_window = enable_smi_window, 12115 12071 }; 12116 12072 12117 12073 static int __init vmx_init(void)
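The core of the nested-SMI work in this file is the bookkeeping added by `vmx_pre_enter_smm()` and `vmx_pre_leave_smm()`: VMX operation and guest mode are stashed in the new `smm` sub-struct on SMM entry and restored on RSM. A minimal user-space sketch of that state machine (the struct and function names below are simplified stand-ins for illustration, not the kernel's `struct vcpu_vmx` fields or the real vmexit/vmentry paths):

```c
#include <assert.h>
#include <stdbool.h>

/* Simplified stand-in for the nested-VMX fields touched on SMM entry/exit. */
struct nested_state {
	bool vmxon;       /* vCPU is in VMX operation */
	bool guest_mode;  /* vCPU is running L2 */
	struct {
		bool vmxon;
		bool guest_mode;
	} smm;            /* state stashed across SMM */
};

/* Mirrors the shape of vmx_pre_enter_smm(): stash VMX state, then clear it. */
static void pre_enter_smm(struct nested_state *n)
{
	n->smm.guest_mode = n->guest_mode;
	n->guest_mode = false;     /* the real code performs a nested vmexit here */

	n->smm.vmxon = n->vmxon;
	n->vmxon = false;
}

/* Mirrors the shape of vmx_pre_leave_smm() on RSM: restore the stashed state. */
static void pre_leave_smm(struct nested_state *n)
{
	if (n->smm.vmxon) {
		n->vmxon = true;
		n->smm.vmxon = false;
	}
	if (n->smm.guest_mode) {
		n->guest_mode = true;  /* the real code re-enters VMX non-root mode */
		n->smm.guest_mode = false;
	}
}
```

The point of the two-field stash is that SMM handlers run outside VMX non-root mode, so both flags must be dropped for the duration of SMM and reinstated only when RSM succeeds.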
+67 -27
arch/x86/kvm/x86.c
··· 2006 2006 KVMCLOCK_SYNC_PERIOD); 2007 2007 } 2008 2008 2009 - static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 2009 + static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2010 2010 { 2011 2011 u64 mcg_cap = vcpu->arch.mcg_cap; 2012 2012 unsigned bank_num = mcg_cap & 0xff; 2013 + u32 msr = msr_info->index; 2014 + u64 data = msr_info->data; 2013 2015 2014 2016 switch (msr) { 2015 2017 case MSR_IA32_MCG_STATUS: ··· 2035 2033 */ 2036 2034 if ((offset & 0x3) == 0 && 2037 2035 data != 0 && (data | (1 << 10)) != ~(u64)0) 2036 + return -1; 2037 + if (!msr_info->host_initiated && 2038 + (offset & 0x3) == 1 && data != 0) 2038 2039 return -1; 2039 2040 vcpu->arch.mce_banks[offset] = data; 2040 2041 break; ··· 2288 2283 case MSR_IA32_MCG_CTL: 2289 2284 case MSR_IA32_MCG_STATUS: 2290 2285 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2291 - return set_msr_mce(vcpu, msr, data); 2286 + return set_msr_mce(vcpu, msr_info); 2292 2287 2293 2288 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 2294 2289 case MSR_P6_PERFCTR0 ... 
MSR_P6_PERFCTR1: ··· 4039 4034 case KVM_SET_IDENTITY_MAP_ADDR: { 4040 4035 u64 ident_addr; 4041 4036 4037 + mutex_lock(&kvm->lock); 4038 + r = -EINVAL; 4039 + if (kvm->created_vcpus) 4040 + goto set_identity_unlock; 4042 4041 r = -EFAULT; 4043 4042 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 4044 - goto out; 4043 + goto set_identity_unlock; 4045 4044 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 4045 + set_identity_unlock: 4046 + mutex_unlock(&kvm->lock); 4046 4047 break; 4047 4048 } 4048 4049 case KVM_SET_NR_MMU_PAGES: ··· 5286 5275 kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); 5287 5276 } 5288 5277 5278 + static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) 5279 + { 5280 + return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); 5281 + } 5282 + 5289 5283 static const struct x86_emulate_ops emulate_ops = { 5290 5284 .read_gpr = emulator_read_gpr, 5291 5285 .write_gpr = emulator_write_gpr, ··· 5332 5316 .set_nmi_mask = emulator_set_nmi_mask, 5333 5317 .get_hflags = emulator_get_hflags, 5334 5318 .set_hflags = emulator_set_hflags, 5319 + .pre_leave_smm = emulator_pre_leave_smm, 5335 5320 }; 5336 5321 5337 5322 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) ··· 6443 6426 } 6444 6427 6445 6428 kvm_x86_ops->queue_exception(vcpu); 6446 - } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 6429 + } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { 6447 6430 vcpu->arch.smi_pending = false; 6448 6431 enter_smm(vcpu); 6449 6432 } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { ··· 6489 6472 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); 6490 6473 kvm_make_request(KVM_REQ_EVENT, vcpu); 6491 6474 } 6492 - 6493 - #define put_smstate(type, buf, offset, val) \ 6494 - *(type *)((buf) + (offset) - 0x7e00) = val 6495 6475 6496 6476 static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) 6497 6477 { ··· 6655 6641 u32 cr0; 
6656 6642 6657 6643 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); 6658 - vcpu->arch.hflags |= HF_SMM_MASK; 6659 6644 memset(buf, 0, 512); 6660 6645 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 6661 6646 enter_smm_save_state_64(vcpu, buf); 6662 6647 else 6663 6648 enter_smm_save_state_32(vcpu, buf); 6664 6649 6650 + /* 6651 + * Give pre_enter_smm() a chance to make ISA-specific changes to the 6652 + * vCPU state (e.g. leave guest mode) after we've saved the state into 6653 + * the SMM state-save area. 6654 + */ 6655 + kvm_x86_ops->pre_enter_smm(vcpu, buf); 6656 + 6657 + vcpu->arch.hflags |= HF_SMM_MASK; 6665 6658 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); 6666 6659 6667 6660 if (kvm_x86_ops->get_nmi_mask(vcpu)) ··· 6897 6876 if (inject_pending_event(vcpu, req_int_win) != 0) 6898 6877 req_immediate_exit = true; 6899 6878 else { 6900 - /* Enable NMI/IRQ window open exits if needed. 6879 + /* Enable SMI/NMI/IRQ window open exits if needed. 6901 6880 * 6902 - * SMIs have two cases: 1) they can be nested, and 6903 - * then there is nothing to do here because RSM will 6904 - * cause a vmexit anyway; 2) or the SMI can be pending 6905 - * because inject_pending_event has completed the 6906 - * injection of an IRQ or NMI from the previous vmexit, 6907 - * and then we request an immediate exit to inject the SMI. 6881 + * SMIs have three cases: 6882 + * 1) They can be nested, and then there is nothing to 6883 + * do here because RSM will cause a vmexit anyway. 6884 + * 2) There is an ISA-specific reason why SMI cannot be 6885 + * injected, and the moment when this changes can be 6886 + * intercepted. 6887 + * 3) Or the SMI can be pending because 6888 + * inject_pending_event has completed the injection 6889 + * of an IRQ or NMI from the previous vmexit, and 6890 + * then we request an immediate exit to inject the 6891 + * SMI. 
6908 6892 */ 6909 6893 if (vcpu->arch.smi_pending && !is_smm(vcpu)) 6910 - req_immediate_exit = true; 6894 + if (!kvm_x86_ops->enable_smi_window(vcpu)) 6895 + req_immediate_exit = true; 6911 6896 if (vcpu->arch.nmi_pending) 6912 6897 kvm_x86_ops->enable_nmi_window(vcpu); 6913 6898 if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) ··· 7825 7798 kvm_async_pf_hash_reset(vcpu); 7826 7799 vcpu->arch.apf.halted = false; 7827 7800 7801 + if (kvm_mpx_supported()) { 7802 + void *mpx_state_buffer; 7803 + 7804 + /* 7805 + * To avoid have the INIT path from kvm_apic_has_events() that be 7806 + * called with loaded FPU and does not let userspace fix the state. 7807 + */ 7808 + kvm_put_guest_fpu(vcpu); 7809 + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, 7810 + XFEATURE_MASK_BNDREGS); 7811 + if (mpx_state_buffer) 7812 + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); 7813 + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, 7814 + XFEATURE_MASK_BNDCSR); 7815 + if (mpx_state_buffer) 7816 + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); 7817 + } 7818 + 7828 7819 if (!init_event) { 7829 7820 kvm_pmu_reset(vcpu); 7830 7821 vcpu->arch.smbase = 0x30000; 7831 7822 7832 7823 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; 7833 7824 vcpu->arch.msr_misc_features_enables = 0; 7825 + 7826 + vcpu->arch.xcr0 = XFEATURE_MASK_FP; 7834 7827 } 7835 7828 7836 7829 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 7837 7830 vcpu->arch.regs_avail = ~0; 7838 7831 vcpu->arch.regs_dirty = ~0; 7832 + 7833 + vcpu->arch.ia32_xss = 0; 7839 7834 7840 7835 kvm_x86_ops->vcpu_reset(vcpu, init_event); 7841 7836 } ··· 8023 7974 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 8024 7975 { 8025 7976 struct page *page; 8026 - struct kvm *kvm; 8027 7977 int r; 8028 7978 8029 - BUG_ON(vcpu->kvm == NULL); 8030 - kvm = vcpu->kvm; 8031 - 8032 7979 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); 8033 - 
vcpu->arch.pv.pv_unhalted = false; 8034 7980 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 8035 - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 7981 + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 8036 7982 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 8037 7983 else 8038 7984 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; ··· 8045 8001 if (r < 0) 8046 8002 goto fail_free_pio_data; 8047 8003 8048 - if (irqchip_in_kernel(kvm)) { 8004 + if (irqchip_in_kernel(vcpu->kvm)) { 8049 8005 r = kvm_create_lapic(vcpu); 8050 8006 if (r < 0) 8051 8007 goto fail_mmu_destroy; ··· 8067 8023 8068 8024 fx_init(vcpu); 8069 8025 8070 - vcpu->arch.ia32_tsc_adjust_msr = 0x0; 8071 - vcpu->arch.pv_time_enabled = false; 8072 - 8073 - vcpu->arch.guest_supported_xcr0 = 0; 8074 8026 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; 8075 8027 8076 8028 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
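Among the x86.c changes, `set_msr_mce()` now takes the full `msr_data` so it can see `host_initiated`: a guest may only clear an MCi_STATUS register, while userspace (e.g. during migration) may still write an arbitrary value. The bank-MSR validation can be modelled in isolation as below (`check_mce_bank_write()` is an illustrative helper, not a kernel function; within a bank, offset 0 is MCi_CTL and offset 1 is MCi_STATUS):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the MCi bank MSR write checks from set_msr_mce().
 * Returns 0 on success, -1 if the write must be rejected.
 */
static int check_mce_bank_write(unsigned int offset, uint64_t data,
				bool host_initiated)
{
	/* MCi_CTL: only 0 or "all ones" is legal (bit 10 may read as 0). */
	if ((offset & 0x3) == 0 &&
	    data != 0 && (data | (1ull << 10)) != ~(uint64_t)0)
		return -1;

	/* New in this merge: the guest may only clear MCi_STATUS. */
	if (!host_initiated && (offset & 0x3) == 1 && data != 0)
		return -1;

	return 0;
}
```

This matches the documented architectural behavior: software other than the machine-check handler writes MCi_STATUS only to clear it.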
+31 -4
drivers/clocksource/arm_arch_timer.c
··· 159 159 * if we don't have the cp15 accessors we won't have a problem. 160 160 */ 161 161 u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct; 162 + EXPORT_SYMBOL_GPL(arch_timer_read_counter); 162 163 163 164 static u64 arch_counter_read(struct clocksource *cs) 164 165 { ··· 219 218 return __fsl_a008585_read_reg(cntv_tval_el0); 220 219 } 221 220 221 + static u64 notrace fsl_a008585_read_cntpct_el0(void) 222 + { 223 + return __fsl_a008585_read_reg(cntpct_el0); 224 + } 225 + 222 226 static u64 notrace fsl_a008585_read_cntvct_el0(void) 223 227 { 224 228 return __fsl_a008585_read_reg(cntvct_el0); ··· 265 259 return __hisi_161010101_read_reg(cntv_tval_el0); 266 260 } 267 261 262 + static u64 notrace hisi_161010101_read_cntpct_el0(void) 263 + { 264 + return __hisi_161010101_read_reg(cntpct_el0); 265 + } 266 + 268 267 static u64 notrace hisi_161010101_read_cntvct_el0(void) 269 268 { 270 269 return __hisi_161010101_read_reg(cntvct_el0); ··· 300 289 #endif 301 290 302 291 #ifdef CONFIG_ARM64_ERRATUM_858921 292 + static u64 notrace arm64_858921_read_cntpct_el0(void) 293 + { 294 + u64 old, new; 295 + 296 + old = read_sysreg(cntpct_el0); 297 + new = read_sysreg(cntpct_el0); 298 + return (((old ^ new) >> 32) & 1) ? 
old : new; 299 + } 300 + 303 301 static u64 notrace arm64_858921_read_cntvct_el0(void) 304 302 { 305 303 u64 old, new; ··· 330 310 struct clock_event_device *clk) 331 311 { 332 312 unsigned long ctrl; 333 - u64 cval = evt + arch_counter_get_cntvct(); 313 + u64 cval; 334 314 335 315 ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); 336 316 ctrl |= ARCH_TIMER_CTRL_ENABLE; 337 317 ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; 338 318 339 - if (access == ARCH_TIMER_PHYS_ACCESS) 319 + if (access == ARCH_TIMER_PHYS_ACCESS) { 320 + cval = evt + arch_counter_get_cntpct(); 340 321 write_sysreg(cval, cntp_cval_el0); 341 - else 322 + } else { 323 + cval = evt + arch_counter_get_cntvct(); 342 324 write_sysreg(cval, cntv_cval_el0); 325 + } 343 326 344 327 arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); 345 328 } ··· 369 346 .desc = "Freescale erratum a005858", 370 347 .read_cntp_tval_el0 = fsl_a008585_read_cntp_tval_el0, 371 348 .read_cntv_tval_el0 = fsl_a008585_read_cntv_tval_el0, 349 + .read_cntpct_el0 = fsl_a008585_read_cntpct_el0, 372 350 .read_cntvct_el0 = fsl_a008585_read_cntvct_el0, 373 351 .set_next_event_phys = erratum_set_next_event_tval_phys, 374 352 .set_next_event_virt = erratum_set_next_event_tval_virt, ··· 382 358 .desc = "HiSilicon erratum 161010101", 383 359 .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0, 384 360 .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0, 361 + .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, 385 362 .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, 386 363 .set_next_event_phys = erratum_set_next_event_tval_phys, 387 364 .set_next_event_virt = erratum_set_next_event_tval_virt, ··· 393 368 .desc = "HiSilicon erratum 161010101", 394 369 .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0, 395 370 .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0, 371 + .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, 396 372 .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, 397 373 .set_next_event_phys = 
erratum_set_next_event_tval_phys, 398 374 .set_next_event_virt = erratum_set_next_event_tval_virt, ··· 404 378 .match_type = ate_match_local_cap_id, 405 379 .id = (void *)ARM64_WORKAROUND_858921, 406 380 .desc = "ARM erratum 858921", 381 + .read_cntpct_el0 = arm64_858921_read_cntpct_el0, 407 382 .read_cntvct_el0 = arm64_858921_read_cntvct_el0, 408 383 }, 409 384 #endif ··· 928 901 929 902 /* Register the CP15 based counter if we have one */ 930 903 if (type & ARCH_TIMER_TYPE_CP15) { 931 - if (IS_ENABLED(CONFIG_ARM64) || 904 + if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || 932 905 arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) 933 906 arch_timer_read_counter = arch_counter_get_cntvct; 934 907 else
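The new `read_cntpct_el0` hooks reuse each erratum's existing read sequence. For ARM erratum 858921 that sequence reads the counter twice and keeps the second value unless bit 32 changed between the reads, in which case the first read is used. The selection logic, lifted out of `arm64_858921_read_cntpct_el0()` with the two sysreg reads modelled as explicit arguments (a sketch for illustration only):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Erratum 858921 workaround: given two back-to-back counter reads,
 * return the first one if bit 32 toggled between them (a rollover of
 * the low word happened in the window), otherwise the second.
 */
static uint64_t erratum_858921_pick(uint64_t old, uint64_t new)
{
	return (((old ^ new) >> 32) & 1) ? old : new;
}
```

In the kernel the two arguments are consecutive `read_sysreg(cntpct_el0)` (or `cntvct_el0`) results; only the comparison is shown here.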
+6 -2
drivers/irqchip/irq-gic-v3.c
··· 1260 1260 goto out_unmap_rdist; 1261 1261 1262 1262 gic_populate_ppi_partitions(node); 1263 - gic_of_setup_kvm_info(node); 1263 + 1264 + if (static_key_true(&supports_deactivate)) 1265 + gic_of_setup_kvm_info(node); 1264 1266 return 0; 1265 1267 1266 1268 out_unmap_rdist: ··· 1551 1549 goto out_fwhandle_free; 1552 1550 1553 1551 acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); 1554 - gic_acpi_setup_kvm_info(); 1552 + 1553 + if (static_key_true(&supports_deactivate)) 1554 + gic_acpi_setup_kvm_info(); 1555 1555 1556 1556 return 0; 1557 1557
+4 -2
drivers/irqchip/irq-gic.c
··· 1420 1420 if (ret) 1421 1421 return; 1422 1422 1423 - gic_set_kvm_info(&gic_v2_kvm_info); 1423 + if (static_key_true(&supports_deactivate)) 1424 + gic_set_kvm_info(&gic_v2_kvm_info); 1424 1425 } 1425 1426 1426 1427 int __init ··· 1653 1652 if (IS_ENABLED(CONFIG_ARM_GIC_V2M)) 1654 1653 gicv2m_init(NULL, gic_data[0].domain); 1655 1654 1656 - gic_acpi_setup_kvm_info(); 1655 + if (static_key_true(&supports_deactivate)) 1656 + gic_acpi_setup_kvm_info(); 1657 1657 1658 1658 return 0; 1659 1659 }
+19 -7
include/kvm/arm_arch_timer.h
··· 31 31 /* Timer IRQ */ 32 32 struct kvm_irq_level irq; 33 33 34 - /* Active IRQ state caching */ 35 - bool active_cleared_last; 34 + /* 35 + * We have multiple paths which can save/restore the timer state 36 + * onto the hardware, so we need some way of keeping track of 37 + * where the latest state is. 38 + * 39 + * loaded == true: State is loaded on the hardware registers. 40 + * loaded == false: State is stored in memory. 41 + */ 42 + bool loaded; 36 43 37 44 /* Virtual offset */ 38 45 u64 cntvoff; ··· 50 43 struct arch_timer_context ptimer; 51 44 52 45 /* Background timer used when the guest is not running */ 53 - struct hrtimer timer; 46 + struct hrtimer bg_timer; 54 47 55 48 /* Work queued with the above timer expires */ 56 49 struct work_struct expired; 57 50 58 - /* Background timer active */ 59 - bool armed; 51 + /* Physical timer emulation */ 52 + struct hrtimer phys_timer; 60 53 61 54 /* Is the timer enabled */ 62 55 bool enabled; ··· 66 59 int kvm_timer_enable(struct kvm_vcpu *vcpu); 67 60 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); 68 61 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); 69 - void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); 70 62 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); 71 63 bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); 72 64 void kvm_timer_update_run(struct kvm_vcpu *vcpu); ··· 78 72 int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); 79 73 int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); 80 74 81 - bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); 75 + bool kvm_timer_is_pending(struct kvm_vcpu *vcpu); 76 + 82 77 void kvm_timer_schedule(struct kvm_vcpu *vcpu); 83 78 void kvm_timer_unschedule(struct kvm_vcpu *vcpu); 84 79 85 80 u64 kvm_phys_timer_read(void); 86 81 82 + void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); 87 83 void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); 88 84 89 85 void kvm_timer_init_vhe(void); 90 86 91 87 
#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) 92 88 #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) 89 + 90 + void enable_el1_phys_timer_access(void); 91 + void disable_el1_phys_timer_access(void); 92 + 93 93 #endif
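The new `loaded` flag documented in the header encodes a single invariant: timer state lives either in the hardware registers (`loaded == true`) or in memory (`loaded == false`), and readers must consult the right copy. A toy model of that contract, with all names illustrative rather than the kernel's:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Toy model: one timer value, mirrored between memory and a "register". */
struct toy_timer {
	bool loaded;
	uint64_t mem_cval;  /* saved in-memory copy */
	uint64_t hw_cval;   /* stand-in for the hardware register */
};

static void toy_timer_vcpu_load(struct toy_timer *t)
{
	t->hw_cval = t->mem_cval;   /* restore state onto the hardware */
	t->loaded = true;
}

static void toy_timer_vcpu_put(struct toy_timer *t)
{
	t->mem_cval = t->hw_cval;   /* save state back to memory */
	t->loaded = false;
}

static uint64_t toy_timer_read_cval(const struct toy_timer *t)
{
	/* The invariant: always read from wherever the latest state is. */
	return t->loaded ? t->hw_cval : t->mem_cval;
}
```

This is why the flag replaces the old per-field caching: once multiple paths can save/restore timer state, a single "where is it now" bit is easier to keep correct than tracking individual registers.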
+1
include/linux/kvm_host.h
··· 667 667 bool *writable); 668 668 669 669 void kvm_release_pfn_clean(kvm_pfn_t pfn); 670 + void kvm_release_pfn_dirty(kvm_pfn_t pfn); 670 671 void kvm_set_pfn_dirty(kvm_pfn_t pfn); 671 672 void kvm_set_pfn_accessed(kvm_pfn_t pfn); 672 673 void kvm_get_pfn(kvm_pfn_t pfn);
+1
include/uapi/linux/kvm.h
··· 931 931 #define KVM_CAP_PPC_SMT_POSSIBLE 147 932 932 #define KVM_CAP_HYPERV_SYNIC2 148 933 933 #define KVM_CAP_HYPERV_VP_INDEX 149 934 + #define KVM_CAP_S390_AIS_MIGRATION 150 934 935 935 936 #ifdef KVM_CAP_IRQ_ROUTING 936 937
+17 -13
tools/kvm/kvm_stat/kvm_stat
··· 19 19 20 20 The data is sampled from the KVM's debugfs entries and its perf events. 21 21 """ 22 + from __future__ import print_function 22 23 23 24 import curses 24 25 import sys 26 + import locale 25 27 import os 26 28 import time 27 29 import optparse ··· 226 224 'DISABLE': 0x00002401, 227 225 'RESET': 0x00002403, 228 226 } 227 + 228 + ENCODING = locale.getpreferredencoding(False) 229 229 230 230 231 231 class Arch(object): ··· 670 666 """Returns 'event name: current value' for all enabled events.""" 671 667 ret = defaultdict(int) 672 668 for group in self.group_leaders: 673 - for name, val in group.read().iteritems(): 669 + for name, val in group.read().items(): 674 670 if name in self._fields: 675 671 ret[name] += val 676 672 return ret ··· 959 955 except: 960 956 raise Exception 961 957 for line in child.stdout: 962 - line = line.lstrip().split(' ', 1) 958 + line = line.decode(ENCODING).lstrip().split(' ', 1) 963 959 # perform a sanity check before calling the more expensive 964 960 # function to possibly extract the guest name 965 961 if ' -name ' in line[1]: ··· 1009 1005 name = '' 1010 1006 try: 1011 1007 line = open('/proc/{}/cmdline' 1012 - .format(pid), 'rb').read().split('\0') 1008 + .format(pid), 'r').read().split('\0') 1013 1009 parms = line[line.index('-name') + 1].split(',') 1014 1010 while '' in parms: 1015 1011 # commas are escaped (i.e. ',,'), hence e.g. 
'foo,bar' results ··· 1174 1170 .format(self.stats.fields_filter)) 1175 1171 self.screen.addstr(3, 0, "New regex: ") 1176 1172 curses.echo() 1177 - regex = self.screen.getstr() 1173 + regex = self.screen.getstr().decode(ENCODING) 1178 1174 curses.noecho() 1179 1175 if len(regex) == 0: 1180 1176 self.stats.fields_filter = DEFAULT_REGEX ··· 1208 1204 1209 1205 curses.echo() 1210 1206 self.screen.addstr(3, 0, "Pid [0 or pid]: ") 1211 - pid = self.screen.getstr() 1207 + pid = self.screen.getstr().decode(ENCODING) 1212 1208 curses.noecho() 1213 1209 1214 1210 try: ··· 1237 1233 self.screen.addstr(2, 0, 'Change delay from %.1fs to ' % 1238 1234 self._delay_regular) 1239 1235 curses.echo() 1240 - val = self.screen.getstr() 1236 + val = self.screen.getstr().decode(ENCODING) 1241 1237 curses.noecho() 1242 1238 1243 1239 try: ··· 1277 1273 self.print_all_gnames(7) 1278 1274 curses.echo() 1279 1275 self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") 1280 - gname = self.screen.getstr() 1276 + gname = self.screen.getstr().decode(ENCODING) 1281 1277 curses.noecho() 1282 1278 1283 1279 if not gname: ··· 1373 1369 s = stats.get() 1374 1370 for key in sorted(s.keys()): 1375 1371 values = s[key] 1376 - print '%-42s%10d%10d' % (key, values[0], values[1]) 1372 + print('%-42s%10d%10d' % (key, values[0], values[1])) 1377 1373 except KeyboardInterrupt: 1378 1374 pass 1379 1375 1380 1376 1381 1377 def log(stats): 1382 1378 """Prints statistics as reiterating key block, multiple value blocks.""" 1383 - keys = sorted(stats.get().iterkeys()) 1379 + keys = sorted(stats.get().keys()) 1384 1380 1385 1381 def banner(): 1386 1382 for k in keys: 1387 - print '%s' % k, 1388 - print 1383 + print(k, end=' ') 1384 + print() 1389 1385 1390 1386 def statline(): 1391 1387 s = stats.get() 1392 1388 for k in keys: 1393 - print ' %9d' % s[k][1], 1394 - print 1389 + print(' %9d' % s[k][1], end=' ') 1390 + print() 1395 1391 line = 0 1396 1392 banner_repeat = 20 1397 1393 while True:
+92 -5
virt/kvm/arm/aarch32.c
··· 25 25 #include <asm/kvm_emulate.h> 26 26 #include <asm/kvm_hyp.h> 27 27 28 - #ifndef CONFIG_ARM64 29 - #define COMPAT_PSR_T_BIT PSR_T_BIT 30 - #define COMPAT_PSR_IT_MASK PSR_IT_MASK 31 - #endif 32 - 33 28 /* 34 29 * stolen from arch/arm/kernel/opcodes.c 35 30 * ··· 144 149 else 145 150 *vcpu_pc(vcpu) += 4; 146 151 kvm_adjust_itstate(vcpu); 152 + } 153 + 154 + /* 155 + * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. 156 + */ 157 + static const u8 return_offsets[8][2] = { 158 + [0] = { 0, 0 }, /* Reset, unused */ 159 + [1] = { 4, 2 }, /* Undefined */ 160 + [2] = { 0, 0 }, /* SVC, unused */ 161 + [3] = { 4, 4 }, /* Prefetch abort */ 162 + [4] = { 8, 8 }, /* Data abort */ 163 + [5] = { 0, 0 }, /* HVC, unused */ 164 + [6] = { 4, 4 }, /* IRQ, unused */ 165 + [7] = { 4, 4 }, /* FIQ, unused */ 166 + }; 167 + 168 + static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) 169 + { 170 + unsigned long cpsr; 171 + unsigned long new_spsr_value = *vcpu_cpsr(vcpu); 172 + bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT); 173 + u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; 174 + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); 175 + 176 + cpsr = mode | COMPAT_PSR_I_BIT; 177 + 178 + if (sctlr & (1 << 30)) 179 + cpsr |= COMPAT_PSR_T_BIT; 180 + if (sctlr & (1 << 25)) 181 + cpsr |= COMPAT_PSR_E_BIT; 182 + 183 + *vcpu_cpsr(vcpu) = cpsr; 184 + 185 + /* Note: These now point to the banked copies */ 186 + *vcpu_spsr(vcpu) = new_spsr_value; 187 + *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; 188 + 189 + /* Branch to exception vector */ 190 + if (sctlr & (1 << 13)) 191 + vect_offset += 0xffff0000; 192 + else /* always have security exceptions */ 193 + vect_offset += vcpu_cp15(vcpu, c12_VBAR); 194 + 195 + *vcpu_pc(vcpu) = vect_offset; 196 + } 197 + 198 + void kvm_inject_undef32(struct kvm_vcpu *vcpu) 199 + { 200 + prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4); 201 + } 202 + 203 + /* 204 + * Modelled after TakeDataAbortException() and 
TakePrefetchAbortException 205 + * pseudocode. 206 + */ 207 + static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, 208 + unsigned long addr) 209 + { 210 + u32 vect_offset; 211 + u32 *far, *fsr; 212 + bool is_lpae; 213 + 214 + if (is_pabt) { 215 + vect_offset = 12; 216 + far = &vcpu_cp15(vcpu, c6_IFAR); 217 + fsr = &vcpu_cp15(vcpu, c5_IFSR); 218 + } else { /* !iabt */ 219 + vect_offset = 16; 220 + far = &vcpu_cp15(vcpu, c6_DFAR); 221 + fsr = &vcpu_cp15(vcpu, c5_DFSR); 222 + } 223 + 224 + prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset); 225 + 226 + *far = addr; 227 + 228 + /* Give the guest an IMPLEMENTATION DEFINED exception */ 229 + is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); 230 + if (is_lpae) 231 + *fsr = 1 << 9 | 0x34; 232 + else 233 + *fsr = 0x14; 234 + } 235 + 236 + void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) 237 + { 238 + inject_abt32(vcpu, false, addr); 239 + } 240 + 241 + void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr) 242 + { 243 + inject_abt32(vcpu, true, addr); 147 244 }
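The unified 32-bit fault injection above computes the banked LR from a small table: the row is selected by the exception vector offset divided by four, the column by whether the guest was executing Thumb code. A sketch of that lookup, with the table copied from the patch's `return_offsets[][]` (itself taken from ARMv8 ARM table G1-10); `banked_lr` is a hypothetical helper name modeling the two assignments in `prepare_fault32()`:

```python
# Row = vect_offset >> 2, column = ARM (0) or Thumb (1) state.
RETURN_OFFSETS = [
    (0, 0),  # 0x00: Reset, unused
    (4, 2),  # 0x04: Undefined instruction
    (0, 0),  # 0x08: SVC, unused
    (4, 4),  # 0x0c: Prefetch abort
    (8, 8),  # 0x10: Data abort
    (0, 0),  # 0x14: HVC, unused
    (4, 4),  # 0x18: IRQ, unused
    (4, 4),  # 0x1c: FIQ, unused
]


def banked_lr(pc, vect_offset, is_thumb):
    """Return address written to the banked LR in prepare_fault32():
    the faulting PC plus the per-exception, per-instruction-set
    offset."""
    return pc + RETURN_OFFSETS[vect_offset >> 2][1 if is_thumb else 0]
```

For example, a data abort (`vect_offset` 16) banks PC + 8 regardless of instruction set, while an undefined-instruction exception banks PC + 4 in ARM state but PC + 2 in Thumb state.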
+316 -142
virt/kvm/arm/arch_timer.c
··· 46 46 .level = 1, 47 47 }; 48 48 49 - void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) 50 - { 51 - vcpu_vtimer(vcpu)->active_cleared_last = false; 52 - } 49 + static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); 50 + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, 51 + struct arch_timer_context *timer_ctx); 52 + static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); 53 53 54 54 u64 kvm_phys_timer_read(void) 55 55 { 56 56 return timecounter->cc->read(timecounter->cc); 57 57 } 58 58 59 - static bool timer_is_armed(struct arch_timer_cpu *timer) 59 + static void soft_timer_start(struct hrtimer *hrt, u64 ns) 60 60 { 61 - return timer->armed; 62 - } 63 - 64 - /* timer_arm: as in "arm the timer", not as in ARM the company */ 65 - static void timer_arm(struct arch_timer_cpu *timer, u64 ns) 66 - { 67 - timer->armed = true; 68 - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), 61 + hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), 69 62 HRTIMER_MODE_ABS); 70 63 } 71 64 72 - static void timer_disarm(struct arch_timer_cpu *timer) 65 + static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) 73 66 { 74 - if (timer_is_armed(timer)) { 75 - hrtimer_cancel(&timer->timer); 76 - cancel_work_sync(&timer->expired); 77 - timer->armed = false; 78 - } 67 + hrtimer_cancel(hrt); 68 + if (work) 69 + cancel_work_sync(work); 70 + } 71 + 72 + static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu) 73 + { 74 + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 75 + 76 + /* 77 + * When using a userspace irqchip with the architected timers, we must 78 + * prevent continuously exiting from the guest, and therefore mask the 79 + * physical interrupt by disabling it on the host interrupt controller 80 + * when the virtual level is high, such that the guest can make 81 + * forward progress. 
Once we detect the output level being 82 + * de-asserted, we unmask the interrupt again so that we exit from the 83 + * guest when the timer fires. 84 + */ 85 + if (vtimer->irq.level) 86 + disable_percpu_irq(host_vtimer_irq); 87 + else 88 + enable_percpu_irq(host_vtimer_irq, 0); 79 89 } 80 90 81 91 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 82 92 { 83 93 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; 94 + struct arch_timer_context *vtimer; 84 95 85 - /* 86 - * We disable the timer in the world switch and let it be 87 - * handled by kvm_timer_sync_hwstate(). Getting a timer 88 - * interrupt at this point is a sure sign of some major 89 - * breakage. 90 - */ 91 - pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu); 96 + if (!vcpu) { 97 + pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n"); 98 + return IRQ_NONE; 99 + } 100 + vtimer = vcpu_vtimer(vcpu); 101 + 102 + if (!vtimer->irq.level) { 103 + vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); 104 + if (kvm_timer_irq_can_fire(vtimer)) 105 + kvm_timer_update_irq(vcpu, true, vtimer); 106 + } 107 + 108 + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) 109 + kvm_vtimer_update_mask_user(vcpu); 110 + 92 111 return IRQ_HANDLED; 93 112 } 94 113 ··· 177 158 return min(min_virt, min_phys); 178 159 } 179 160 180 - static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) 161 + static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) 181 162 { 182 163 struct arch_timer_cpu *timer; 183 164 struct kvm_vcpu *vcpu; 184 165 u64 ns; 185 166 186 - timer = container_of(hrt, struct arch_timer_cpu, timer); 167 + timer = container_of(hrt, struct arch_timer_cpu, bg_timer); 187 168 vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); 188 169 189 170 /* ··· 201 182 return HRTIMER_NORESTART; 202 183 } 203 184 204 - bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) 185 + static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) 186 + { 187 + struct 
arch_timer_context *ptimer; 188 + struct arch_timer_cpu *timer; 189 + struct kvm_vcpu *vcpu; 190 + u64 ns; 191 + 192 + timer = container_of(hrt, struct arch_timer_cpu, phys_timer); 193 + vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); 194 + ptimer = vcpu_ptimer(vcpu); 195 + 196 + /* 197 + * Check that the timer has really expired from the guest's 198 + * PoV (NTP on the host may have forced it to expire 199 + * early). If not ready, schedule for a later time. 200 + */ 201 + ns = kvm_timer_compute_delta(ptimer); 202 + if (unlikely(ns)) { 203 + hrtimer_forward_now(hrt, ns_to_ktime(ns)); 204 + return HRTIMER_RESTART; 205 + } 206 + 207 + kvm_timer_update_irq(vcpu, true, ptimer); 208 + return HRTIMER_NORESTART; 209 + } 210 + 211 + static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) 205 212 { 206 213 u64 cval, now; 207 214 ··· 238 193 now = kvm_phys_timer_read() - timer_ctx->cntvoff; 239 194 240 195 return cval <= now; 196 + } 197 + 198 + bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) 199 + { 200 + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 201 + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 202 + 203 + if (vtimer->irq.level || ptimer->irq.level) 204 + return true; 205 + 206 + /* 207 + * When this is called from withing the wait loop of kvm_vcpu_block(), 208 + * the software view of the timer state is up to date (timer->loaded 209 + * is false), and so we can simply check if the timer should fire now. 210 + */ 211 + if (!vtimer->loaded && kvm_timer_should_fire(vtimer)) 212 + return true; 213 + 214 + return kvm_timer_should_fire(ptimer); 241 215 } 242 216 243 217 /* ··· 282 218 { 283 219 int ret; 284 220 285 - timer_ctx->active_cleared_last = false; 286 221 timer_ctx->irq.level = new_level; 287 222 trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, 288 223 timer_ctx->irq.level); ··· 295 232 } 296 233 } 297 234 235 + /* Schedule the background timer for the emulated timer. 
*/ 236 + static void phys_timer_emulate(struct kvm_vcpu *vcpu) 237 + { 238 + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 239 + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 240 + 241 + /* 242 + * If the timer can fire now we have just raised the IRQ line and we 243 + * don't need to have a soft timer scheduled for the future. If the 244 + * timer cannot fire at all, then we also don't need a soft timer. 245 + */ 246 + if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { 247 + soft_timer_cancel(&timer->phys_timer, NULL); 248 + return; 249 + } 250 + 251 + soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer)); 252 + } 253 + 298 254 /* 299 - * Check if there was a change in the timer state (should we raise or lower 300 - * the line level to the GIC). 255 + * Check if there was a change in the timer state, so that we should either 256 + * raise or lower the line level to the GIC or schedule a background timer to 257 + * emulate the physical timer. 301 258 */ 302 259 static void kvm_timer_update_state(struct kvm_vcpu *vcpu) 303 260 { ··· 325 242 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 326 243 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 327 244 328 - /* 329 - * If userspace modified the timer registers via SET_ONE_REG before 330 - * the vgic was initialized, we mustn't set the vtimer->irq.level value 331 - * because the guest would never see the interrupt. Instead wait 332 - * until we call this function from kvm_timer_flush_hwstate. 333 - */ 334 245 if (unlikely(!timer->enabled)) 335 246 return; 336 247 ··· 333 256 334 257 if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) 335 258 kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); 259 + 260 + phys_timer_emulate(vcpu); 336 261 } 337 262 338 - /* Schedule the background timer for the emulated timer. 
*/ 339 - static void kvm_timer_emulate(struct kvm_vcpu *vcpu, 340 - struct arch_timer_context *timer_ctx) 263 + static void vtimer_save_state(struct kvm_vcpu *vcpu) 341 264 { 342 265 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 266 + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 267 + unsigned long flags; 343 268 344 - if (kvm_timer_should_fire(timer_ctx)) 345 - return; 269 + local_irq_save(flags); 346 270 347 - if (!kvm_timer_irq_can_fire(timer_ctx)) 348 - return; 271 + if (!vtimer->loaded) 272 + goto out; 349 273 350 - /* The timer has not yet expired, schedule a background timer */ 351 - timer_arm(timer, kvm_timer_compute_delta(timer_ctx)); 274 + if (timer->enabled) { 275 + vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); 276 + vtimer->cnt_cval = read_sysreg_el0(cntv_cval); 277 + } 278 + 279 + /* Disable the virtual timer */ 280 + write_sysreg_el0(0, cntv_ctl); 281 + 282 + vtimer->loaded = false; 283 + out: 284 + local_irq_restore(flags); 352 285 } 353 286 354 287 /* ··· 372 285 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 373 286 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 374 287 375 - BUG_ON(timer_is_armed(timer)); 288 + vtimer_save_state(vcpu); 376 289 377 290 /* 378 291 * No need to schedule a background timer if any guest timer has ··· 393 306 * The guest timers have not yet expired, schedule a background timer. 394 307 * Set the earliest expiration time among the guest timers. 
395 308 */ 396 - timer_arm(timer, kvm_timer_earliest_exp(vcpu)); 309 + soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); 310 + } 311 + 312 + static void vtimer_restore_state(struct kvm_vcpu *vcpu) 313 + { 314 + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 315 + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 316 + unsigned long flags; 317 + 318 + local_irq_save(flags); 319 + 320 + if (vtimer->loaded) 321 + goto out; 322 + 323 + if (timer->enabled) { 324 + write_sysreg_el0(vtimer->cnt_cval, cntv_cval); 325 + isb(); 326 + write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); 327 + } 328 + 329 + vtimer->loaded = true; 330 + out: 331 + local_irq_restore(flags); 397 332 } 398 333 399 334 void kvm_timer_unschedule(struct kvm_vcpu *vcpu) 400 335 { 401 336 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 402 - timer_disarm(timer); 337 + 338 + vtimer_restore_state(vcpu); 339 + 340 + soft_timer_cancel(&timer->bg_timer, &timer->expired); 403 341 } 404 342 405 - static void kvm_timer_flush_hwstate_vgic(struct kvm_vcpu *vcpu) 343 + static void set_cntvoff(u64 cntvoff) 344 + { 345 + u32 low = lower_32_bits(cntvoff); 346 + u32 high = upper_32_bits(cntvoff); 347 + 348 + /* 349 + * Since kvm_call_hyp doesn't fully support the ARM PCS especially on 350 + * 32-bit systems, but rather passes register by register shifted one 351 + * place (we put the function address in r0/x0), we cannot simply pass 352 + * a 64-bit value as an argument, but have to split the value in two 353 + * 32-bit halves. 
354 + */ 355 + kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); 356 + } 357 + 358 + static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu) 406 359 { 407 360 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 408 361 bool phys_active; 409 362 int ret; 410 363 411 - /* 412 - * If we enter the guest with the virtual input level to the VGIC 413 - * asserted, then we have already told the VGIC what we need to, and 414 - * we don't need to exit from the guest until the guest deactivates 415 - * the already injected interrupt, so therefore we should set the 416 - * hardware active state to prevent unnecessary exits from the guest. 417 - * 418 - * Also, if we enter the guest with the virtual timer interrupt active, 419 - * then it must be active on the physical distributor, because we set 420 - * the HW bit and the guest must be able to deactivate the virtual and 421 - * physical interrupt at the same time. 422 - * 423 - * Conversely, if the virtual input level is deasserted and the virtual 424 - * interrupt is not active, then always clear the hardware active state 425 - * to ensure that hardware interrupts from the timer triggers a guest 426 - * exit. 427 - */ 428 364 phys_active = vtimer->irq.level || 429 - kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); 430 - 431 - /* 432 - * We want to avoid hitting the (re)distributor as much as 433 - * possible, as this is a potentially expensive MMIO access 434 - * (not to mention locks in the irq layer), and a solution for 435 - * this is to cache the "active" state in memory. 436 - * 437 - * Things to consider: we cannot cache an "active set" state, 438 - * because the HW can change this behind our back (it becomes 439 - * "clear" in the HW). We must then restrict the caching to 440 - * the "clear" state. 
441 - * 442 - * The cache is invalidated on: 443 - * - vcpu put, indicating that the HW cannot be trusted to be 444 - * in a sane state on the next vcpu load, 445 - * - any change in the interrupt state 446 - * 447 - * Usage conditions: 448 - * - cached value is "active clear" 449 - * - value to be programmed is "active clear" 450 - */ 451 - if (vtimer->active_cleared_last && !phys_active) 452 - return; 365 + kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); 453 366 454 367 ret = irq_set_irqchip_state(host_vtimer_irq, 455 368 IRQCHIP_STATE_ACTIVE, 456 369 phys_active); 457 370 WARN_ON(ret); 371 + } 458 372 459 - vtimer->active_cleared_last = !phys_active; 373 + static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu) 374 + { 375 + kvm_vtimer_update_mask_user(vcpu); 376 + } 377 + 378 + void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) 379 + { 380 + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 381 + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 382 + 383 + if (unlikely(!timer->enabled)) 384 + return; 385 + 386 + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) 387 + kvm_timer_vcpu_load_user(vcpu); 388 + else 389 + kvm_timer_vcpu_load_vgic(vcpu); 390 + 391 + set_cntvoff(vtimer->cntvoff); 392 + 393 + vtimer_restore_state(vcpu); 394 + 395 + if (has_vhe()) 396 + disable_el1_phys_timer_access(); 397 + 398 + /* Set the background timer for the physical timer emulation. */ 399 + phys_timer_emulate(vcpu); 460 400 } 461 401 462 402 bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) ··· 503 389 ptimer->irq.level != plevel; 504 390 } 505 391 506 - static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu) 507 - { 508 - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 509 - 510 - /* 511 - * To prevent continuously exiting from the guest, we mask the 512 - * physical interrupt such that the guest can make forward progress. 
513 - * Once we detect the output level being deasserted, we unmask the 514 - * interrupt again so that we exit from the guest when the timer 515 - * fires. 516 - */ 517 - if (vtimer->irq.level) 518 - disable_percpu_irq(host_vtimer_irq); 519 - else 520 - enable_percpu_irq(host_vtimer_irq, 0); 521 - } 522 - 523 - /** 524 - * kvm_timer_flush_hwstate - prepare timers before running the vcpu 525 - * @vcpu: The vcpu pointer 526 - * 527 - * Check if the virtual timer has expired while we were running in the host, 528 - * and inject an interrupt if that was the case, making sure the timer is 529 - * masked or disabled on the host so that we keep executing. Also schedule a 530 - * software timer for the physical timer if it is enabled. 531 - */ 532 - void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) 392 + void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) 533 393 { 534 394 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 535 395 536 396 if (unlikely(!timer->enabled)) 537 397 return; 538 398 539 - kvm_timer_update_state(vcpu); 399 + if (has_vhe()) 400 + enable_el1_phys_timer_access(); 540 401 541 - /* Set the background timer for the physical timer emulation. */ 542 - kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); 402 + vtimer_save_state(vcpu); 543 403 544 - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) 545 - kvm_timer_flush_hwstate_user(vcpu); 546 - else 547 - kvm_timer_flush_hwstate_vgic(vcpu); 404 + /* 405 + * Cancel the physical timer emulation, because the only case where we 406 + * need it after a vcpu_put is in the context of a sleeping VCPU, and 407 + * in that case we already factor in the deadline for the physical 408 + * timer when scheduling the bg_timer. 409 + * 410 + * In any case, we re-schedule the hrtimer for the physical timer when 411 + * coming back to the VCPU thread in kvm_timer_vcpu_load(). 
412 + */ 413 + soft_timer_cancel(&timer->phys_timer, NULL); 414 + 415 + /* 416 + * The kernel may decide to run userspace after calling vcpu_put, so 417 + * we reset cntvoff to 0 to ensure a consistent read between user 418 + * accesses to the virtual counter and kernel access to the physical 419 + * counter. 420 + */ 421 + set_cntvoff(0); 422 + } 423 + 424 + static void unmask_vtimer_irq(struct kvm_vcpu *vcpu) 425 + { 426 + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 427 + 428 + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { 429 + kvm_vtimer_update_mask_user(vcpu); 430 + return; 431 + } 432 + 433 + /* 434 + * If the guest disabled the timer without acking the interrupt, then 435 + * we must make sure the physical and virtual active states are in 436 + * sync by deactivating the physical interrupt, because otherwise we 437 + * wouldn't see the next timer interrupt in the host. 438 + */ 439 + if (!kvm_vgic_map_is_active(vcpu, vtimer->irq.irq)) { 440 + int ret; 441 + ret = irq_set_irqchip_state(host_vtimer_irq, 442 + IRQCHIP_STATE_ACTIVE, 443 + false); 444 + WARN_ON(ret); 445 + } 548 446 } 549 447 550 448 /** ··· 568 442 */ 569 443 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) 570 444 { 571 - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 445 + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 572 446 573 447 /* 574 - * This is to cancel the background timer for the physical timer 575 - * emulation if it is set. 448 + * If we entered the guest with the vtimer output asserted we have to 449 + * check if the guest has modified the timer so that we should lower 450 + * the line at this point. 576 451 */ 577 - timer_disarm(timer); 578 - 579 - /* 580 - * The guest could have modified the timer registers or the timer 581 - * could have expired, update the timer state. 
582 - */ 583 - kvm_timer_update_state(vcpu); 452 + if (vtimer->irq.level) { 453 + vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); 454 + vtimer->cnt_cval = read_sysreg_el0(cntv_cval); 455 + if (!kvm_timer_should_fire(vtimer)) { 456 + kvm_timer_update_irq(vcpu, false, vtimer); 457 + unmask_vtimer_irq(vcpu); 458 + } 459 + } 584 460 } 585 461 586 462 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) ··· 633 505 vcpu_ptimer(vcpu)->cntvoff = 0; 634 506 635 507 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); 636 - hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 637 - timer->timer.function = kvm_timer_expire; 508 + hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 509 + timer->bg_timer.function = kvm_bg_timer_expire; 510 + 511 + hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 512 + timer->phys_timer.function = kvm_phys_timer_expire; 638 513 639 514 vtimer->irq.irq = default_vtimer_irq.irq; 640 515 ptimer->irq.irq = default_ptimer_irq.irq; ··· 651 520 int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) 652 521 { 653 522 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 523 + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 654 524 655 525 switch (regid) { 656 526 case KVM_REG_ARM_TIMER_CTL: 657 - vtimer->cnt_ctl = value; 527 + vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; 658 528 break; 659 529 case KVM_REG_ARM_TIMER_CNT: 660 530 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); ··· 663 531 case KVM_REG_ARM_TIMER_CVAL: 664 532 vtimer->cnt_cval = value; 665 533 break; 534 + case KVM_REG_ARM_PTIMER_CTL: 535 + ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; 536 + break; 537 + case KVM_REG_ARM_PTIMER_CVAL: 538 + ptimer->cnt_cval = value; 539 + break; 540 + 666 541 default: 667 542 return -1; 668 543 } ··· 678 539 return 0; 679 540 } 680 541 542 + static u64 read_timer_ctl(struct arch_timer_context *timer) 543 + { 544 + /* 545 + * Set ISTATUS bit if it's expired. 
546 + * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is 547 + * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit 548 + * regardless of ENABLE bit for our implementation convenience. 549 + */ 550 + if (!kvm_timer_compute_delta(timer)) 551 + return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT; 552 + else 553 + return timer->cnt_ctl; 554 + } 555 + 681 556 u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) 682 557 { 558 + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 683 559 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 684 560 685 561 switch (regid) { 686 562 case KVM_REG_ARM_TIMER_CTL: 687 - return vtimer->cnt_ctl; 563 + return read_timer_ctl(vtimer); 688 564 case KVM_REG_ARM_TIMER_CNT: 689 565 return kvm_phys_timer_read() - vtimer->cntvoff; 690 566 case KVM_REG_ARM_TIMER_CVAL: 691 567 return vtimer->cnt_cval; 568 + case KVM_REG_ARM_PTIMER_CTL: 569 + return read_timer_ctl(ptimer); 570 + case KVM_REG_ARM_PTIMER_CVAL: 571 + return ptimer->cnt_cval; 572 + case KVM_REG_ARM_PTIMER_CNT: 573 + return kvm_phys_timer_read(); 692 574 } 693 575 return (u64)-1; 694 576 } ··· 762 602 return err; 763 603 } 764 604 605 + err = irq_set_vcpu_affinity(host_vtimer_irq, kvm_get_running_vcpus()); 606 + if (err) { 607 + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); 608 + goto out_free_irq; 609 + } 610 + 765 611 kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); 766 612 767 613 cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, 768 614 "kvm/arm/timer:starting", kvm_timer_starting_cpu, 769 615 kvm_timer_dying_cpu); 616 + return 0; 617 + out_free_irq: 618 + free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); 770 619 return err; 771 620 } 772 621 ··· 784 615 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 785 616 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 786 617 787 - timer_disarm(timer); 618 + soft_timer_cancel(&timer->bg_timer, &timer->expired); 619 + soft_timer_cancel(&timer->phys_timer, NULL); 788 620 
kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); 789 621 } 790 622 ··· 861 691 return ret; 862 692 863 693 no_vgic: 694 + preempt_disable(); 864 695 timer->enabled = 1; 696 + kvm_timer_vcpu_load_vgic(vcpu); 697 + preempt_enable(); 698 + 865 699 return 0; 866 700 } 867 701
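Among the arch_timer.c changes, the ONE_REG accessors get two fixes for the CTL register: reads report ISTATUS whenever the timer has expired (the architecture leaves the bit UNKNOWN when ENABLE is clear, so always reporting it is the implementation convenience the comment describes), and writes mask ISTATUS out because it is a read-only condition flag. A minimal model of both paths, using the standard generic-timer bit positions; the boolean `expired` stands in for `kvm_timer_compute_delta()` returning zero:

```python
ARCH_TIMER_CTRL_ENABLE = 1 << 0
ARCH_TIMER_CTRL_IT_MASK = 1 << 1
ARCH_TIMER_CTRL_IT_STAT = 1 << 2  # ISTATUS: read-only condition flag


def read_timer_ctl(cnt_ctl, expired):
    """Model of read_timer_ctl(): set ISTATUS once the timer has
    expired, regardless of the ENABLE bit."""
    return cnt_ctl | ARCH_TIMER_CTRL_IT_STAT if expired else cnt_ctl


def set_timer_ctl(value):
    """Model of the KVM_REG_ARM_TIMER_CTL write path: never latch the
    read-only ISTATUS bit into the saved control value."""
    return value & ~ARCH_TIMER_CTRL_IT_STAT
```

A userspace write of ENABLE|ISTATUS thus stores only ENABLE, and a later read reintroduces ISTATUS only if the expiry condition actually holds.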
+27 -18
virt/kvm/arm/arm.c
··· 307 307 308 308 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 309 309 { 310 - return kvm_timer_should_fire(vcpu_vtimer(vcpu)) || 311 - kvm_timer_should_fire(vcpu_ptimer(vcpu)); 310 + return kvm_timer_is_pending(vcpu); 312 311 } 313 312 314 313 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) ··· 353 354 vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); 354 355 355 356 kvm_arm_set_running_vcpu(vcpu); 356 - 357 357 kvm_vgic_load(vcpu); 358 + kvm_timer_vcpu_load(vcpu); 358 359 } 359 360 360 361 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 361 362 { 363 + kvm_timer_vcpu_put(vcpu); 362 364 kvm_vgic_put(vcpu); 363 365 364 366 vcpu->cpu = -1; 365 367 366 368 kvm_arm_set_running_vcpu(NULL); 367 - kvm_timer_vcpu_put(vcpu); 368 369 } 369 370 370 371 static void vcpu_power_off(struct kvm_vcpu *vcpu) ··· 656 657 657 658 kvm_pmu_flush_hwstate(vcpu); 658 659 659 - kvm_timer_flush_hwstate(vcpu); 660 - kvm_vgic_flush_hwstate(vcpu); 661 - 662 660 local_irq_disable(); 661 + 662 + kvm_vgic_flush_hwstate(vcpu); 663 663 664 664 /* 665 665 * If we have a singal pending, or need to notify a userspace ··· 684 686 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || 685 687 kvm_request_pending(vcpu)) { 686 688 vcpu->mode = OUTSIDE_GUEST_MODE; 687 - local_irq_enable(); 688 689 kvm_pmu_sync_hwstate(vcpu); 689 690 kvm_timer_sync_hwstate(vcpu); 690 691 kvm_vgic_sync_hwstate(vcpu); 692 + local_irq_enable(); 691 693 preempt_enable(); 692 694 continue; 693 695 } ··· 711 713 kvm_arm_clear_debug(vcpu); 712 714 713 715 /* 716 + * We must sync the PMU state before the vgic state so 717 + * that the vgic can properly sample the updated state of the 718 + * interrupt line. 719 + */ 720 + kvm_pmu_sync_hwstate(vcpu); 721 + 722 + /* 723 + * Sync the vgic state before syncing the timer state because 724 + * the timer code needs to know if the virtual timer 725 + * interrupts are active. 
726 + */ 727 + kvm_vgic_sync_hwstate(vcpu); 728 + 729 + /* 730 + * Sync the timer hardware state before enabling interrupts as 731 + * we don't want vtimer interrupts to race with syncing the 732 + * timer virtual interrupt state. 733 + */ 734 + kvm_timer_sync_hwstate(vcpu); 735 + 736 + /* 714 737 * We may have taken a host interrupt in HYP mode (ie 715 738 * while executing the guest). This interrupt is still 716 739 * pending, as we haven't serviced it yet! ··· 753 734 */ 754 735 guest_exit(); 755 736 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); 756 - 757 - /* 758 - * We must sync the PMU and timer state before the vgic state so 759 - * that the vgic can properly sample the updated state of the 760 - * interrupt line. 761 - */ 762 - kvm_pmu_sync_hwstate(vcpu); 763 - kvm_timer_sync_hwstate(vcpu); 764 - 765 - kvm_vgic_sync_hwstate(vcpu); 766 737 767 738 preempt_enable(); 768 739
+32 -42
virt/kvm/arm/hyp/timer-sr.c
··· 21 21 22 22 #include <asm/kvm_hyp.h> 23 23 24 - /* vcpu is already in the HYP VA space */ 25 - void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) 24 + void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high) 26 25 { 27 - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 28 - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 26 + u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low; 27 + write_sysreg(cntvoff, cntvoff_el2); 28 + } 29 + 30 + void __hyp_text enable_el1_phys_timer_access(void) 31 + { 29 32 u64 val; 30 33 31 - if (timer->enabled) { 32 - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); 33 - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); 34 - } 34 + /* Allow physical timer/counter access for the host */ 35 + val = read_sysreg(cnthctl_el2); 36 + val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; 37 + write_sysreg(val, cnthctl_el2); 38 + } 35 39 36 - /* Disable the virtual timer */ 37 - write_sysreg_el0(0, cntv_ctl); 40 + void __hyp_text disable_el1_phys_timer_access(void) 41 + { 42 + u64 val; 38 43 44 + /* 45 + * Disallow physical timer access for the guest 46 + * Physical counter access is allowed 47 + */ 48 + val = read_sysreg(cnthctl_el2); 49 + val &= ~CNTHCTL_EL1PCEN; 50 + val |= CNTHCTL_EL1PCTEN; 51 + write_sysreg(val, cnthctl_el2); 52 + } 53 + 54 + void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu) 55 + { 39 56 /* 40 57 * We don't need to do this for VHE since the host kernel runs in EL2 41 58 * with HCR_EL2.TGE ==1, which makes those bits have no impact. 
42 59 */ 43 - if (!has_vhe()) { 44 - /* Allow physical timer/counter access for the host */ 45 - val = read_sysreg(cnthctl_el2); 46 - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; 47 - write_sysreg(val, cnthctl_el2); 48 - } 49 - 50 - /* Clear cntvoff for the host */ 51 - write_sysreg(0, cntvoff_el2); 60 + if (!has_vhe()) 61 + enable_el1_phys_timer_access(); 52 62 } 53 63 54 - void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) 64 + void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu) 55 65 { 56 - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 57 - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 58 - u64 val; 59 - 60 - /* Those bits are already configured at boot on VHE-system */ 61 - if (!has_vhe()) { 62 - /* 63 - * Disallow physical timer access for the guest 64 - * Physical counter access is allowed 65 - */ 66 - val = read_sysreg(cnthctl_el2); 67 - val &= ~CNTHCTL_EL1PCEN; 68 - val |= CNTHCTL_EL1PCTEN; 69 - write_sysreg(val, cnthctl_el2); 70 - } 71 - 72 - if (timer->enabled) { 73 - write_sysreg(vtimer->cntvoff, cntvoff_el2); 74 - write_sysreg_el0(vtimer->cnt_cval, cntv_cval); 75 - isb(); 76 - write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); 77 - } 66 + if (!has_vhe()) 67 + disable_el1_phys_timer_access(); 78 68 }
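The timer-sr.c rewrite reduces the hyp code to trap management: the new helpers flip the CNTHCTL_EL2 access bits in each direction, and `__kvm_timer_set_cntvoff()` reassembles the 64-bit offset that `set_cntvoff()` had to split into two 32-bit halves for `kvm_call_hyp()`. A sketch of the bit and word manipulation (CNTHCTL bit positions as used by the non-VHE code; pure functions stand in for the sysreg read-modify-write):

```python
CNTHCTL_EL1PCTEN = 1 << 0  # EL1 physical counter access
CNTHCTL_EL1PCEN = 1 << 1   # EL1 physical timer access


def enable_el1_phys_timer_access(cnthctl):
    """Host direction: allow both physical counter and timer at EL1."""
    return cnthctl | CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN


def disable_el1_phys_timer_access(cnthctl):
    """Guest direction: trap the physical timer but keep physical
    counter access, as the patched helper's comment says."""
    return (cnthctl & ~CNTHCTL_EL1PCEN) | CNTHCTL_EL1PCTEN


def kvm_timer_set_cntvoff(low, high):
    """Model of __kvm_timer_set_cntvoff(): rebuild the 64-bit cntvoff
    from the two 32-bit halves passed register by register."""
    return ((high & 0xFFFFFFFF) << 32) | (low & 0xFFFFFFFF)
```

The split-then-reassemble dance exists only because, as the arch_timer.c comment explains, `kvm_call_hyp()` cannot pass a 64-bit argument on 32-bit systems.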
+106 -93
virt/kvm/arm/vgic/vgic-its.c
···
 	u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
 	u8 prop;
 	int ret;
+	unsigned long flags;
 
 	ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
 			     &prop, 1);
···
 	if (ret)
 		return ret;
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
 		irq->priority = LPI_PROP_PRIORITY(prop);
 		irq->enabled = LPI_PROP_ENABLE_BIT(prop);
 
-		vgic_queue_irq_unlock(kvm, irq);
+		vgic_queue_irq_unlock(kvm, irq, flags);
 	} else {
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 	}
 
 	return 0;
···
 	int ret = 0;
 	u32 *intids;
 	int nr_irqs, i;
+	unsigned long flags;
 
 	nr_irqs = vgic_copy_lpi_list(vcpu, &intids);
 	if (nr_irqs < 0)
···
 		}
 
 		irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->pending_latch = pendmask & (1U << bit_nr);
-		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 
···
 {
 	struct kvm_vcpu *vcpu;
 	struct its_ite *ite;
+	unsigned long flags;
 
 	if (!its->enabled)
 		return -EBUSY;
···
 	if (!vcpu->arch.vgic_cpu.lpis_enabled)
 		return -EBUSY;
 
-	spin_lock(&ite->irq->irq_lock);
+	spin_lock_irqsave(&ite->irq->irq_lock, flags);
 	ite->irq->pending_latch = true;
-	vgic_queue_irq_unlock(kvm, ite->irq);
+	vgic_queue_irq_unlock(kvm, ite->irq, flags);
 
 	return 0;
 }
···
 }
 
 /* Requires the its_lock to be held. */
-static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
 {
 	struct its_ite *ite, *temp;
···
 
 	list_del(&device->dev_list);
 	kfree(device);
+}
+
+/* its lock must be held */
+static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
+{
+	struct its_device *cur, *temp;
+
+	list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
+		vgic_its_free_device(kvm, cur);
+}
+
+/* its lock must be held */
+static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
+{
+	struct its_collection *cur, *temp;
+
+	list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
+		vgic_its_free_collection(its, cur->collection_id);
 }
 
 /* Must be called with its_lock mutex held */
···
 	 * by removing the mapping and re-establishing it.
 	 */
 	if (device)
-		vgic_its_unmap_device(kvm, device);
+		vgic_its_free_device(kvm, device);
 
 	/*
 	 * The spec does not say whether unmapping a not-mapped device
···
 				      unsigned long val)
 {
 	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 entry_size, device_type;
+	u64 entry_size, table_type;
 	u64 reg, *regptr, clearbits = 0;
 
 	/* When GITS_CTLR.Enable is 1, we ignore write accesses. */
···
 	case 0:
 		regptr = &its->baser_device_table;
 		entry_size = abi->dte_esz;
-		device_type = GITS_BASER_TYPE_DEVICE;
+		table_type = GITS_BASER_TYPE_DEVICE;
 		break;
 	case 1:
 		regptr = &its->baser_coll_table;
 		entry_size = abi->cte_esz;
-		device_type = GITS_BASER_TYPE_COLLECTION;
+		table_type = GITS_BASER_TYPE_COLLECTION;
 		clearbits = GITS_BASER_INDIRECT;
 		break;
 	default:
···
 	reg &= ~clearbits;
 
 	reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
-	reg |= device_type << GITS_BASER_TYPE_SHIFT;
+	reg |= table_type << GITS_BASER_TYPE_SHIFT;
 	reg = vgic_sanitise_its_baser(reg);
 
 	*regptr = reg;
+
+	if (!(reg & GITS_BASER_VALID)) {
+		/* Take the its_lock to prevent a race with a save/restore */
+		mutex_lock(&its->its_lock);
+		switch (table_type) {
+		case GITS_BASER_TYPE_DEVICE:
+			vgic_its_free_device_list(kvm, its);
+			break;
+		case GITS_BASER_TYPE_COLLECTION:
+			vgic_its_free_collection_list(kvm, its);
+			break;
+		}
+		mutex_unlock(&its->its_lock);
+	}
 }
 
 static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
···
 	return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
 }
 
-static void vgic_its_free_device(struct kvm *kvm, struct its_device *dev)
-{
-	struct its_ite *ite, *tmp;
-
-	list_for_each_entry_safe(ite, tmp, &dev->itt_head, ite_list)
-		its_free_ite(kvm, ite);
-	list_del(&dev->dev_list);
-	kfree(dev);
-}
-
 static void vgic_its_destroy(struct kvm_device *kvm_dev)
 {
 	struct kvm *kvm = kvm_dev->kvm;
 	struct vgic_its *its = kvm_dev->private;
-	struct list_head *cur, *temp;
-
-	/*
-	 * We may end up here without the lists ever having been initialized.
-	 * Check this and bail out early to avoid dereferencing a NULL pointer.
-	 */
-	if (!its->device_list.next)
-		return;
 
 	mutex_lock(&its->its_lock);
-	list_for_each_safe(cur, temp, &its->device_list) {
-		struct its_device *dev;
 
-		dev = list_entry(cur, struct its_device, dev_list);
-		vgic_its_free_device(kvm, dev);
-	}
+	vgic_its_free_device_list(kvm, its);
+	vgic_its_free_collection_list(kvm, its);
 
-	list_for_each_safe(cur, temp, &its->collection_list) {
-		struct its_collection *coll;
-
-		coll = list_entry(cur, struct its_collection, coll_list);
-		list_del(cur);
-		kfree(coll);
-	}
 	mutex_unlock(&its->its_lock);
-
 	kfree(its);
 }
···
  */
 static int vgic_its_save_tables_v0(struct vgic_its *its)
 {
-	struct kvm *kvm = its->dev->kvm;
 	int ret;
-
-	mutex_lock(&kvm->lock);
-	mutex_lock(&its->its_lock);
-
-	if (!lock_all_vcpus(kvm)) {
-		mutex_unlock(&its->its_lock);
-		mutex_unlock(&kvm->lock);
-		return -EBUSY;
-	}
 
 	ret = vgic_its_save_device_tables(its);
 	if (ret)
-		goto out;
+		return ret;
 
-	ret = vgic_its_save_collection_table(its);
-
-out:
-	unlock_all_vcpus(kvm);
-	mutex_unlock(&its->its_lock);
-	mutex_unlock(&kvm->lock);
-	return ret;
+	return vgic_its_save_collection_table(its);
 }
 
 /**
···
  */
 static int vgic_its_restore_tables_v0(struct vgic_its *its)
 {
-	struct kvm *kvm = its->dev->kvm;
 	int ret;
-
-	mutex_lock(&kvm->lock);
-	mutex_lock(&its->its_lock);
-
-	if (!lock_all_vcpus(kvm)) {
-		mutex_unlock(&its->its_lock);
-		mutex_unlock(&kvm->lock);
-		return -EBUSY;
-	}
 
 	ret = vgic_its_restore_collection_table(its);
 	if (ret)
-		goto out;
+		return ret;
 
-	ret = vgic_its_restore_device_tables(its);
-out:
-	unlock_all_vcpus(kvm);
-	mutex_unlock(&its->its_lock);
-	mutex_unlock(&kvm->lock);
-
-	return ret;
+	return vgic_its_restore_device_tables(its);
 }
 
 static int vgic_its_commit_v0(struct vgic_its *its)
···
 	return 0;
 }
 
+static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
+{
+	/* We need to keep the ABI specific field values */
+	its->baser_coll_table &= ~GITS_BASER_VALID;
+	its->baser_device_table &= ~GITS_BASER_VALID;
+	its->cbaser = 0;
+	its->creadr = 0;
+	its->cwriter = 0;
+	its->enabled = 0;
+	vgic_its_free_device_list(kvm, its);
+	vgic_its_free_collection_list(kvm, its);
+}
+
 static int vgic_its_has_attr(struct kvm_device *dev,
 			     struct kvm_device_attr *attr)
 {
···
 	switch (attr->attr) {
 	case KVM_DEV_ARM_VGIC_CTRL_INIT:
 		return 0;
+	case KVM_DEV_ARM_ITS_CTRL_RESET:
+		return 0;
 	case KVM_DEV_ARM_ITS_SAVE_TABLES:
 		return 0;
 	case KVM_DEV_ARM_ITS_RESTORE_TABLES:
···
 		return vgic_its_has_attr_regs(dev, attr);
 	}
 	return -ENXIO;
+}
+
+static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	int ret = 0;
+
+	if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
+		return 0;
+
+	mutex_lock(&kvm->lock);
+	mutex_lock(&its->its_lock);
+
+	if (!lock_all_vcpus(kvm)) {
+		mutex_unlock(&its->its_lock);
+		mutex_unlock(&kvm->lock);
+		return -EBUSY;
+	}
+
+	switch (attr) {
+	case KVM_DEV_ARM_ITS_CTRL_RESET:
+		vgic_its_reset(kvm, its);
+		break;
+	case KVM_DEV_ARM_ITS_SAVE_TABLES:
+		ret = abi->save_tables(its);
+		break;
+	case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+		ret = abi->restore_tables(its);
+		break;
+	}
+
+	unlock_all_vcpus(kvm);
+	mutex_unlock(&its->its_lock);
+	mutex_unlock(&kvm->lock);
+	return ret;
 }
 
 static int vgic_its_set_attr(struct kvm_device *dev,
···
 
 		return vgic_register_its_iodev(dev->kvm, its, addr);
 	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-		const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			/* Nothing to do */
-			return 0;
-		case KVM_DEV_ARM_ITS_SAVE_TABLES:
-			return abi->save_tables(its);
-		case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-			return abi->restore_tables(its);
-		}
-	}
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		return vgic_its_ctrl(dev->kvm, its, attr->attr);
 	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
 		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
 		u64 reg;
virt/kvm/arm/vgic/vgic-mmio-v2.c (+13 -9)
···
 	int mode = (val >> 24) & 0x03;
 	int c;
 	struct kvm_vcpu *vcpu;
+	unsigned long flags;
 
 	switch (mode) {
 	case 0x0:		/* as specified by targets */
···
 
 		irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->pending_latch = true;
 		irq->source |= 1U << source_vcpu->vcpu_id;
 
-		vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+		vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
 		vgic_put_irq(source_vcpu->kvm, irq);
 	}
 }
···
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
 	u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
 	int i;
+	unsigned long flags;
 
 	/* GICD_ITARGETSR[0-7] are read-only */
 	if (intid < VGIC_NR_PRIVATE_IRQS)
···
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
 		int target;
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		irq->targets = (val >> (i * 8)) & cpu_mask;
 		target = irq->targets ? __ffs(irq->targets) : 0;
 		irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
 
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
···
 {
 	u32 intid = addr & 0x0f;
 	int i;
+	unsigned long flags;
 
 	for (i = 0; i < len; i++) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		irq->source &= ~((val >> (i * 8)) & 0xff);
 		if (!irq->source)
 			irq->pending_latch = false;
 
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
···
 {
 	u32 intid = addr & 0x0f;
 	int i;
+	unsigned long flags;
 
 	for (i = 0; i < len; i++) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		irq->source |= (val >> (i * 8)) & 0xff;
 
 		if (irq->source) {
 			irq->pending_latch = true;
-			vgic_queue_irq_unlock(vcpu->kvm, irq);
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		} else {
-			spin_unlock(&irq->irq_lock);
+			spin_unlock_irqrestore(&irq->irq_lock, flags);
 		}
 		vgic_put_irq(vcpu->kvm, irq);
 	}
virt/kvm/arm/vgic/vgic-mmio-v3.c (+10 -7)
···
 {
 	int intid = VGIC_ADDR_TO_INTID(addr, 64);
 	struct vgic_irq *irq;
+	unsigned long flags;
 
 	/* The upper word is WI for us since we don't implement Aff3. */
 	if (addr & 4)
···
 	if (!irq)
 		return;
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	/* We only care about and preserve Aff0, Aff1 and Aff2. */
 	irq->mpidr = val & GENMASK(23, 0);
 	irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
 
-	spin_unlock(&irq->irq_lock);
+	spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(vcpu->kvm, irq);
 }
···
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
+	unsigned long flags;
 
 	for (i = 0; i < len * 8; i++) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		if (test_bit(i, &val)) {
 			/*
 			 * pending_latch is set irrespective of irq type
···
 			 * restore irq config before pending info.
 			 */
 			irq->pending_latch = true;
-			vgic_queue_irq_unlock(vcpu->kvm, irq);
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		} else {
 			irq->pending_latch = false;
-			spin_unlock(&irq->irq_lock);
+			spin_unlock_irqrestore(&irq->irq_lock, flags);
 		}
 
 		vgic_put_irq(vcpu->kvm, irq);
···
 	int sgi, c;
 	int vcpu_id = vcpu->vcpu_id;
 	bool broadcast;
+	unsigned long flags;
 
 	sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
 	broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
···
 
 		irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->pending_latch = true;
 
-		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
virt/kvm/arm/vgic/vgic-mmio.c (+26 -18)
···
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
+	unsigned long flags;
 
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->enabled = true;
-		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 
 		vgic_put_irq(vcpu->kvm, irq);
 	}
···
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
+	unsigned long flags;
 
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		irq->enabled = false;
 
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
···
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
+	unsigned long flags;
 
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->pending_latch = true;
 
-		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
···
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
+	unsigned long flags;
 
 	for_each_set_bit(i, &val, len * 8) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		irq->pending_latch = false;
 
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
···
 				    bool new_active_state)
 {
 	struct kvm_vcpu *requester_vcpu;
-	spin_lock(&irq->irq_lock);
+	unsigned long flags;
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	/*
 	 * The vcpu parameter here can mean multiple things depending on how
···
 
 	irq->active = new_active_state;
 	if (new_active_state)
-		vgic_queue_irq_unlock(vcpu->kvm, irq);
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 	else
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 }
 
 /*
···
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
 	int i;
+	unsigned long flags;
 
 	for (i = 0; i < len; i++) {
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		/* Narrow the priority range to what we actually support */
 		irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 
 		vgic_put_irq(vcpu->kvm, irq);
 	}
···
 {
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
 	int i;
+	unsigned long flags;
 
 	for (i = 0; i < len * 4; i++) {
 		struct vgic_irq *irq;
···
 			continue;
 
 		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		if (test_bit(i * 2 + 1, &val))
 			irq->config = VGIC_CONFIG_EDGE;
 		else
 			irq->config = VGIC_CONFIG_LEVEL;
 
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
···
 {
 	int i;
 	int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+	unsigned long flags;
 
 	for (i = 0; i < 32; i++) {
 		struct vgic_irq *irq;
···
 		 * restore irq config before line level.
 		 */
 		new_level = !!(val & (1U << i));
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->line_level = new_level;
 		if (new_level)
-			vgic_queue_irq_unlock(vcpu->kvm, irq);
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		else
-			spin_unlock(&irq->irq_lock);
+			spin_unlock_irqrestore(&irq->irq_lock, flags);
 
 		vgic_put_irq(vcpu->kvm, irq);
 	}
virt/kvm/arm/vgic/vgic-v2.c (+3 -2)
···
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
 	int lr;
+	unsigned long flags;
 
 	cpuif->vgic_hcr &= ~GICH_HCR_UIE;
 
···
 
 		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		/* Always preserve the active bit */
 		irq->active = !!(val & GICH_LR_ACTIVE_BIT);
···
 			irq->pending_latch = false;
 		}
 
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 
virt/kvm/arm/vgic/vgic-v3.c (+7 -5)
···
 	struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
 	u32 model = vcpu->kvm->arch.vgic.vgic_model;
 	int lr;
+	unsigned long flags;
 
 	cpuif->vgic_hcr &= ~ICH_HCR_UIE;
 
···
 		if (!irq)	/* An LPI could have been unmapped. */
 			continue;
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 
 		/* Always preserve the active bit */
 		irq->active = !!(val & ICH_LR_ACTIVE_BIT);
···
 			irq->pending_latch = false;
 		}
 
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 
···
 	bool status;
 	u8 val;
 	int ret;
+	unsigned long flags;
 
 retry:
 	vcpu = irq->target_vcpu;
···
 
 	status = val & (1 << bit_nr);
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 	if (irq->target_vcpu != vcpu) {
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		goto retry;
 	}
 	irq->pending_latch = status;
-	vgic_queue_irq_unlock(vcpu->kvm, irq);
+	vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 
 	if (status) {
 		/* clear consumed data */
virt/kvm/arm/vgic/vgic.c (+39 -23)
···
 *   vcpuX->vcpu_id < vcpuY->vcpu_id:
 *     spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
 *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ *
+ * Since the VGIC must support injecting virtual interrupts from ISRs, we have
+ * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer
+ * spinlocks for any lock that may be taken while injecting an interrupt.
 */
 
 /*
···
 * Needs to be entered with the IRQ lock already held, but will return
 * with all locks dropped.
 */
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq)
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+			   unsigned long flags)
 {
 	struct kvm_vcpu *vcpu;
 
···
 		 * not need to be inserted into an ap_list and there is also
 		 * no more work for us to do.
 		 */
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 
 		/*
 		 * We have to kick the VCPU here, because we could be
···
 	 * We must unlock the irq lock to take the ap_list_lock where
 	 * we are going to insert this new pending interrupt.
 	 */
-	spin_unlock(&irq->irq_lock);
+	spin_unlock_irqrestore(&irq->irq_lock, flags);
 
 	/* someone can do stuff here, which we re-check below */
 
-	spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
 	spin_lock(&irq->irq_lock);
 
 	/*
···
 
 	if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
 		spin_unlock(&irq->irq_lock);
-		spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+		spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
 
-		spin_lock(&irq->irq_lock);
+		spin_lock_irqsave(&irq->irq_lock, flags);
 		goto retry;
 	}
 
···
 	irq->vcpu = vcpu;
 
 	spin_unlock(&irq->irq_lock);
-	spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
 
 	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
 	kvm_vcpu_kick(vcpu);
···
 {
 	struct kvm_vcpu *vcpu;
 	struct vgic_irq *irq;
+	unsigned long flags;
 	int ret;
 
 	trace_vgic_update_irq_pending(cpuid, intid, level);
···
 	if (!irq)
 		return -EINVAL;
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	if (!vgic_validate_injection(irq, level, owner)) {
 		/* Nothing to see here, move along... */
-		spin_unlock(&irq->irq_lock);
+		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(kvm, irq);
 		return 0;
 	}
···
 	else
 		irq->pending_latch = true;
 
-	vgic_queue_irq_unlock(kvm, irq);
+	vgic_queue_irq_unlock(kvm, irq, flags);
 	vgic_put_irq(kvm, irq);
 
 	return 0;
···
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
 {
 	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+	unsigned long flags;
 
 	BUG_ON(!irq);
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	irq->hw = true;
 	irq->hwintid = phys_irq;
 
-	spin_unlock(&irq->irq_lock);
+	spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(vcpu->kvm, irq);
 
 	return 0;
···
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
 	struct vgic_irq *irq;
+	unsigned long flags;
 
 	if (!vgic_initialized(vcpu->kvm))
 		return -EAGAIN;
···
 	irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
 	BUG_ON(!irq);
 
-	spin_lock(&irq->irq_lock);
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	irq->hw = false;
 	irq->hwintid = 0;
 
-	spin_unlock(&irq->irq_lock);
+	spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(vcpu->kvm, irq);
 
 	return 0;
···
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_irq *irq, *tmp;
+	unsigned long flags;
 
 retry:
-	spin_lock(&vgic_cpu->ap_list_lock);
+	spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
 
 	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
 		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
···
 		/* This interrupt looks like it has to be migrated. */
 
 		spin_unlock(&irq->irq_lock);
-		spin_unlock(&vgic_cpu->ap_list_lock);
+		spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
 
 		/*
 		 * Ensure locking order by always locking the smallest
···
 			vcpuB = vcpu;
 		}
 
-		spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+		spin_lock_irqsave(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
 		spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
 				 SINGLE_DEPTH_NESTING);
 		spin_lock(&irq->irq_lock);
···
 
 		spin_unlock(&irq->irq_lock);
 		spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
-		spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+		spin_unlock_irqrestore(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
 		goto retry;
 	}
 
-	spin_unlock(&vgic_cpu->ap_list_lock);
+	spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
 }
 
 static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
···
 	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
 		return;
 
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
 	spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
 	vgic_flush_lr_state(vcpu);
 	spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
···
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_irq *irq;
 	bool pending = false;
+	unsigned long flags;
 
 	if (!vcpu->kvm->arch.vgic.enabled)
 		return false;
 
-	spin_lock(&vgic_cpu->ap_list_lock);
+	spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
 
 	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
 		spin_lock(&irq->irq_lock);
···
 		break;
 	}
 
-	spin_unlock(&vgic_cpu->ap_list_lock);
+	spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
 
 	return pending;
 }
···
 {
 	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
 	bool map_is_active;
+	unsigned long flags;
 
-	spin_lock(&irq->irq_lock);
+	if (!vgic_initialized(vcpu->kvm))
+		return false;
+
+	spin_lock_irqsave(&irq->irq_lock, flags);
 	map_is_active = irq->hw && irq->active;
-	spin_unlock(&irq->irq_lock);
+	spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(vcpu->kvm, irq);
 
 	return map_is_active;
virt/kvm/arm/vgic/vgic.h (+2 -1)
···
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+			   unsigned long flags);
 void vgic_kick_vcpus(struct kvm *kvm);
 
 int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
virt/kvm/kvm_main.c (+3 -3)
···
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
 
 __visible bool kvm_rebooting;
···
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
 
-static void kvm_release_pfn_dirty(kvm_pfn_t pfn)
+void kvm_release_pfn_dirty(kvm_pfn_t pfn)
 {
 	kvm_set_pfn_dirty(pfn);
 	kvm_release_pfn_clean(pfn);
 }
+EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
···
 	if (!vcpu_align)
 		vcpu_align = __alignof__(struct kvm_vcpu);
 	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
-					   0, NULL);
+					   SLAB_ACCOUNT, NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
 		goto out_free_3;