
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Paolo Bonzini:
"S390:

- Changes to FPU handling came in via the main s390 pull request

- Only deliver to the guest the SCLP events that userspace has
requested

- More virtual vs physical address fixes (only a cleanup since
virtual and physical address spaces are currently the same)

- Fix selftests undefined behavior

x86:

- Fix a restriction that the guest can't program a PMU event whose
encoding matches an architectural event that isn't included in the
guest CPUID. The enumeration of an architectural event only says
that if a CPU supports an architectural event, then the event can
be programmed *using the architectural encoding*. The enumeration
does NOT say anything about the encoding when the CPU doesn't
report support for the event *in general*. It might support it, and it
might support it using the same encoding that made it into the
architectural PMU spec

- Fix a variety of bugs in KVM's emulation of RDPMC (more details on
individual commits) and add a selftest to verify KVM correctly
emulates RDPMC, counter availability, and a variety of other
PMC-related behaviors that depend on guest CPUID and therefore are
easier to validate with selftests than with custom guests (aka
kvm-unit-tests)

- Zero out PMU state on AMD if the virtual PMU is disabled; the stale
state does not cause any bug, but it wastes time in various cases
where KVM would check if a PMC event needs to be synthesized

- Optimize triggering of emulated events, with a nice ~10%
performance improvement in VM-Exit microbenchmarks when a vPMU is
exposed to the guest

- Tighten the check for "PMI in guest" to reduce false positives if
an NMI arrives in the host while KVM is handling an IRQ VM-Exit

- Fix a bug where KVM would report stale/bogus exit qualification
information when exiting to userspace with an internal error exit
code

- Add a VMX flag in /proc/cpuinfo to report 5-level EPT support

- Rework TDP MMU root unload, free, and alloc to run with mmu_lock
held for read, e.g. to avoid serializing vCPUs when userspace
deletes a memslot

- Tear down TDP MMU page tables at 4KiB granularity (used to be
1GiB). KVM doesn't support yielding in the middle of processing a
zap, and 1GiB granularity resulted in multi-millisecond lags that
are quite impolite for CONFIG_PREEMPT kernels

- Allocate write-tracking metadata on-demand to avoid the memory
overhead when a kernel is built with i915 virtualization support
but the workloads use neither shadow paging nor i915 virtualization

- Explicitly initialize a variety of on-stack variables in the
emulator that triggered KMSAN false positives

- Fix the debugregs ABI for 32-bit KVM

- Rework the "force immediate exit" code so that vendor code
ultimately decides how and when to force the exit, which allowed
some optimization for both Intel and AMD

- Fix a long-standing bug where kvm_has_noapic_vcpu could be left
elevated if vCPU creation ultimately failed, causing extra
unnecessary work

- Clean up the logic for checking if the currently loaded vCPU is
in-kernel

- Harden against underflowing the active mmu_notifier invalidation
count, so that "bad" invalidations (usually due to bugs elsewhere
in the kernel) are detected earlier and are less likely to hang the
kernel

x86 Xen emulation:

- Overlay pages can now be cached based on host virtual address,
instead of guest physical addresses. This removes the need to
reconfigure and invalidate the cache if the guest changes the gpa
but the underlying host virtual address remains the same

- When possible, use a single host TSC value when computing the
deadline for Xen timers in order to improve the accuracy of the
timer emulation

- Inject pending upcall events when the vCPU software-enables its
APIC to fix a bug where an upcall can be lost (and to follow Xen's
behavior)

- Fall back to the slow path instead of warning if "fast" IRQ
delivery of Xen events fails, e.g. if the guest has aliased xAPIC
IDs

RISC-V:

- Support exception and interrupt handling in selftests

- New self test for RISC-V architectural timer (Sstc extension)

- New extension support (Ztso, Zacas)

- Support userspace emulation of random number seed CSRs

ARM:

- Infrastructure for building KVM's trap configuration based on the
architectural features (or lack thereof) advertised in the VM's ID
registers

- Support for mapping vfio-pci BARs as Normal-NC (vaguely similar to
x86's WC) at stage-2, improving the performance of interacting with
assigned devices that can tolerate it

- Conversion of KVM's representation of LPIs to an xarray, utilized
to address some of the serialization on the LPI injection path

- Support for _architectural_ VHE-only systems, advertised through
the absence of FEAT_E2H0 in the CPU's ID register

- Miscellaneous cleanups, fixes, and spelling corrections to KVM and
selftests

LoongArch:

- Set reserved bits as zero in CPUCFG

- Start SW timer only when vcpu is blocking

- Do not restart SW timer when it is expired

- Remove unnecessary CSR register saving during enter guest

- Misc cleanups and fixes as usual

Generic:

- Clean up Kconfig by removing CONFIG_HAVE_KVM, which was basically
always true on all architectures except MIPS (where Kconfig
determines availability depending on CPU capabilities). It is
replaced by an architecture-dependent symbol for MIPS, and by
IS_ENABLED(CONFIG_KVM) everywhere else

- Factor out common "select" statements into common code instead of
requiring each architecture to specify them

- Remove thoroughly obsolete APIs from the uapi headers

- Move architecture-dependent stuff to uapi/asm/kvm.h

- Always flush the async page fault workqueue when a work item is
being removed, especially during vCPU destruction, to ensure that
there are no workers running in KVM code when all references to
KVM-the-module are gone, i.e. to prevent a very unlikely
use-after-free if kvm.ko is unloaded

- Grab a reference to the VM's mm_struct in the async #PF worker
itself instead of gifting the worker a reference, so that there's
no need to remember to *conditionally* clean up after the worker

Selftests:

- Reduce boilerplate, especially when utilizing the selftest TAP
infrastructure

- Add basic smoke tests for SEV and SEV-ES, along with a pile of
library support for handling private/encrypted/protected memory

- Fix benign bugs where tests neglect to close() guest_memfd files"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (246 commits)
selftests: kvm: remove meaningless assignments in Makefiles
KVM: riscv: selftests: Add Zacas extension to get-reg-list test
RISC-V: KVM: Allow Zacas extension for Guest/VM
KVM: riscv: selftests: Add Ztso extension to get-reg-list test
RISC-V: KVM: Allow Ztso extension for Guest/VM
RISC-V: KVM: Forward SEED CSR access to user space
KVM: riscv: selftests: Add sstc timer test
KVM: riscv: selftests: Change vcpu_has_ext to a common function
KVM: riscv: selftests: Add guest helper to get vcpu id
KVM: riscv: selftests: Add exception handling support
LoongArch: KVM: Remove unnecessary CSR register saving during enter guest
LoongArch: KVM: Do not restart SW timer when it is expired
LoongArch: KVM: Start SW timer only when vcpu is blocking
LoongArch: KVM: Set reserved bits as zero in CPUCFG
KVM: selftests: Explicitly close guest_memfd files in some gmem tests
KVM: x86/xen: fix recursive deadlock in timer injection
KVM: pfncache: simplify locking and make more self-contained
KVM: x86/xen: remove WARN_ON_ONCE() with false positives in evtchn delivery
KVM: x86/xen: inject vCPU upcall vector when local APIC is enabled
KVM: x86/xen: improve accuracy of Xen timers
...

+6665 -2720
+40 -11
Documentation/virt/kvm/api.rst
···
  KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information,
  see the description of the capability.
 
- Note that the Xen shared info page, if configured, shall always be assumed
+ Note that the Xen shared_info page, if configured, shall always be assumed
  to be dirty. KVM will not explicitly mark it such.
···
         __u8 long_mode;
         __u8 vector;
         __u8 runstate_update_flag;
-        struct {
+        union {
                 __u64 gfn;
+                __u64 hva;
         } shared_info;
         struct {
                 __u32 send_port;
···
 KVM_XEN_ATTR_TYPE_LONG_MODE
   Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This
-  determines the layout of the shared info pages exposed to the VM.
+  determines the layout of the shared_info page exposed to the VM.
 
 KVM_XEN_ATTR_TYPE_SHARED_INFO
-  Sets the guest physical frame number at which the Xen "shared info"
+  Sets the guest physical frame number at which the Xen shared_info
   page resides. Note that although Xen places vcpu_info for the first
   32 vCPUs in the shared_info page, KVM does not automatically do so
-  and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used
-  explicitly even when the vcpu_info for a given vCPU resides at the
-  "default" location in the shared_info page. This is because KVM may
-  not be aware of the Xen CPU id which is used as the index into the
-  vcpu_info[] array, so may not know the correct default location.
+  and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO or
+  KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA be used explicitly even when
+  the vcpu_info for a given vCPU resides at the "default" location
+  in the shared_info page. This is because KVM may not be aware of
+  the Xen CPU id which is used as the index into the vcpu_info[]
+  array, so may not know the correct default location.
 
-  Note that the shared info page may be constantly written to by KVM;
+  Note that the shared_info page may be constantly written to by KVM;
   it contains the event channel bitmap used to deliver interrupts to
   a Xen guest, amongst other things. It is exempt from dirty tracking
   mechanisms — KVM will not explicitly mark the page as dirty each
···
   any vCPU has been running or any event channel interrupts can be
   routed to the guest.
 
-  Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared info
+  Setting the gfn to KVM_XEN_INVALID_GFN will disable the shared_info
   page.
+
+KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA
+  If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the
+  Xen capabilities, then this attribute may be used to set the
+  userspace address at which the shared_info page resides, which
+  will always be fixed in the VMM regardless of where it is mapped
+  in guest physical address space. This attribute should be used in
+  preference to KVM_XEN_ATTR_TYPE_SHARED_INFO as it avoids
+  unnecessary invalidation of an internal cache when the page is
+  re-mapped in guest physical address space.
+
+  Setting the hva to zero will disable the shared_info page.
 
 KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
   Sets the exception vector used to deliver Xen event channel upcalls.
···
   userspace should always assume that the page is dirty without relying
   on dirty logging. Setting the gpa to KVM_XEN_INVALID_GPA will disable
   the vcpu_info.
+
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA
+  If the KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA flag is also set in the
+  Xen capabilities, then this attribute may be used to set the
+  userspace address of the vcpu_info for a given vCPU. It should
+  only be used when the vcpu_info resides at the "default" location
+  in the shared_info page. In this case it is safe to assume the
+  userspace address will not change, because the shared_info page is
+  an overlay on guest memory and remains at a fixed host address
+  regardless of where it is mapped in guest physical address space
+  and hence unnecessary invalidation of an internal cache may be
+  avoided if the guest memory layout is modified.
+  If the vcpu_info does not reside at the "default" location then
+  it is not guaranteed to remain at the same host address and
+  hence the aforementioned cache invalidation is required.
 
 KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
   Sets the guest physical address of an additional pvclock structure
-1
arch/arm64/Kconfig
···
         select HAVE_HW_BREAKPOINT if PERF_EVENTS
         select HAVE_IOREMAP_PROT
         select HAVE_IRQ_TIME_ACCOUNTING
-        select HAVE_KVM
         select HAVE_MOD_ARCH_SPECIFIC
         select HAVE_NMI
         select HAVE_PERF_EVENTS
+1
arch/arm64/include/asm/cpu.h
···
         u64 reg_id_aa64mmfr1;
         u64 reg_id_aa64mmfr2;
         u64 reg_id_aa64mmfr3;
+        u64 reg_id_aa64mmfr4;
         u64 reg_id_aa64pfr0;
         u64 reg_id_aa64pfr1;
         u64 reg_id_aa64pfr2;
+1
arch/arm64/include/asm/cpufeature.h
···
         u8 field_pos;
         u8 field_width;
         u8 min_field_value;
+        u8 max_field_value;
         u8 hwcap_type;
         bool sign;
         unsigned long hwcap;
+1 -3
arch/arm64/include/asm/kvm_arm.h
···
 #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
 #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
 
-#define HCRX_GUEST_FLAGS \
-        (HCRX_EL2_SMPME | HCRX_EL2_TCR2En | \
-         (cpus_have_final_cap(ARM64_HAS_MOPS) ? (HCRX_EL2_MSCEn | HCRX_EL2_MCE2) : 0))
+#define HCRX_GUEST_FLAGS (HCRX_EL2_SMPME | HCRX_EL2_TCR2En)
 #define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM)
 
 /* TCR_EL2 Registers bits */
+2 -1
arch/arm64/include/asm/kvm_emulate.h
···
 
 static inline bool __vcpu_el2_e2h_is_set(const struct kvm_cpu_context *ctxt)
 {
-        return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H;
+        return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) ||
+                (ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H));
 }
 
 static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu)
+98 -1
arch/arm64/include/asm/kvm_host.h
···
         return index;
 }
 
+struct kvm_sysreg_masks;
+
+enum fgt_group_id {
+        __NO_FGT_GROUP__,
+        HFGxTR_GROUP,
+        HDFGRTR_GROUP,
+        HDFGWTR_GROUP = HDFGRTR_GROUP,
+        HFGITR_GROUP,
+        HAFGRTR_GROUP,
+
+        /* Must be last */
+        __NR_FGT_GROUP_IDS__
+};
+
 struct kvm_arch {
         struct kvm_s2_mmu mmu;
+
+        /*
+         * Fine-Grained UNDEF, mimicking the FGT layout defined by the
+         * architecture. We track them globally, as we present the
+         * same feature-set to all vcpus.
+         *
+         * Index 0 is currently spare.
+         */
+        u64 fgu[__NR_FGT_GROUP_IDS__];
 
         /* Interrupt controller */
         struct vgic_dist vgic;
···
 #define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE      6
         /* Initial ID reg values loaded */
 #define KVM_ARCH_FLAG_ID_REGS_INITIALIZED       7
+        /* Fine-Grained UNDEF initialised */
+#define KVM_ARCH_FLAG_FGU_INITIALIZED           8
         unsigned long flags;
 
         /* VM-wide vCPU feature set */
···
         /* PMCR_EL0.N value for the guest */
         u8 pmcr_n;
 
+        /* Iterator for idreg debugfs */
+        u8 idreg_debugfs_iter;
+
         /* Hypercall features firmware registers' descriptor */
         struct kvm_smccc_features smccc_feat;
         struct maple_tree smccc_filter;
···
 #define IDREG(kvm, id)  ((kvm)->arch.id_regs[IDREG_IDX(id)])
 #define KVM_ARM_ID_REG_NUM      (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1)
         u64 id_regs[KVM_ARM_ID_REG_NUM];
+
+        /* Masks for VNCR-baked sysregs */
+        struct kvm_sysreg_masks *sysreg_masks;
 
         /*
          * For an untrusted host VM, 'pkvm.handle' is used to lookup
···
         NR_SYS_REGS     /* Nothing after this line! */
 };
 
+struct kvm_sysreg_masks {
+        struct {
+                u64 res0;
+                u64 res1;
+        } mask[NR_SYS_REGS - __VNCR_START__];
+};
+
 struct kvm_cpu_context {
         struct user_pt_regs regs;       /* sp = sp_el0 */
···
         /* Values of trap registers for the guest. */
         u64 hcr_el2;
+        u64 hcrx_el2;
         u64 mdcr_el2;
         u64 cptr_el2;
···
 #define ctxt_sys_reg(c,r)       (*__ctxt_sys_reg(c,r))
 
-#define __vcpu_sys_reg(v,r)     (ctxt_sys_reg(&(v)->arch.ctxt, (r)))
+u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *, enum vcpu_sysreg);
+#define __vcpu_sys_reg(v,r)                                             \
+        (*({                                                            \
+                const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;   \
+                u64 *__r = __ctxt_sys_reg(ctxt, (r));                   \
+                if (vcpu_has_nv((v)) && (r) >= __VNCR_START__)          \
+                        *__r = kvm_vcpu_sanitise_vncr_reg((v), (r));    \
+                __r;                                                    \
+        }))
 
 u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
 void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
···
 int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
 int kvm_handle_cp10_id(struct kvm_vcpu *vcpu);
 
+void kvm_sys_regs_create_debugfs(struct kvm *kvm);
 void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
 
 int __init kvm_sys_reg_table_init(void);
+struct sys_reg_desc;
+int __init populate_sysreg_config(const struct sys_reg_desc *sr,
+                                  unsigned int idx);
 int __init populate_nv_trap_config(void);
 
 bool lock_all_vcpus(struct kvm *kvm);
 void unlock_all_vcpus(struct kvm *kvm);
+
+void kvm_init_sysreg(struct kvm_vcpu *);
 
 /* MMIO helpers */
 void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
···
 void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
 bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
+
+#define __expand_field_sign_unsigned(id, fld, val)                      \
+        ((u64)SYS_FIELD_VALUE(id, fld, val))
+
+#define __expand_field_sign_signed(id, fld, val)                        \
+        ({                                                              \
+                u64 __val = SYS_FIELD_VALUE(id, fld, val);              \
+                sign_extend64(__val, id##_##fld##_WIDTH - 1);           \
+        })
+
+#define expand_field_sign(id, fld, val)                                 \
+        (id##_##fld##_SIGNED ?                                          \
+         __expand_field_sign_signed(id, fld, val) :                     \
+         __expand_field_sign_unsigned(id, fld, val))
+
+#define get_idreg_field_unsigned(kvm, id, fld)                          \
+        ({                                                              \
+                u64 __val = IDREG((kvm), SYS_##id);                     \
+                FIELD_GET(id##_##fld##_MASK, __val);                    \
+        })
+
+#define get_idreg_field_signed(kvm, id, fld)                            \
+        ({                                                              \
+                u64 __val = get_idreg_field_unsigned(kvm, id, fld);     \
+                sign_extend64(__val, id##_##fld##_WIDTH - 1);           \
+        })
+
+#define get_idreg_field_enum(kvm, id, fld)                              \
+        get_idreg_field_unsigned(kvm, id, fld)
+
+#define get_idreg_field(kvm, id, fld)                                   \
+        (id##_##fld##_SIGNED ?                                          \
+         get_idreg_field_signed(kvm, id, fld) :                         \
+         get_idreg_field_unsigned(kvm, id, fld))
+
+#define kvm_has_feat(kvm, id, fld, limit)                               \
+        (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, limit))
+
+#define kvm_has_feat_enum(kvm, id, fld, val)                            \
+        (get_idreg_field_unsigned((kvm), id, fld) == __expand_field_sign_unsigned(id, fld, val))
+
+#define kvm_has_feat_range(kvm, id, fld, min, max)                      \
+        (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, min) && \
+         get_idreg_field((kvm), id, fld) <= expand_field_sign(id, fld, max))
 
 #endif /* __ARM64_KVM_HOST_H__ */
+1 -1
arch/arm64/include/asm/kvm_hyp.h
···
 /*
  * Without an __arch_swab32(), we fall back to ___constant_swab32(), but the
  * static inline can allow the compiler to out-of-line this. KVM always wants
- * the macro version as its always inlined.
+ * the macro version as it's always inlined.
  */
 #define __kvm_swab32(x) ___constant_swab32(x)
 
+20 -26
arch/arm64/include/asm/kvm_mmu.h
···
 #include <asm/alternative.h>
 
 /*
- * Convert a kernel VA into a HYP VA.
- * reg: VA to be converted.
- *
- * The actual code generation takes place in kvm_update_va_mask, and
- * the instructions below are only there to reserve the space and
- * perform the register allocation (kvm_update_va_mask uses the
- * specific registers encoded in the instructions).
- */
-.macro kern_hyp_va reg
-#ifndef __KVM_VHE_HYPERVISOR__
-alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
-        and     \reg, \reg, #1          /* mask with va_mask */
-        ror     \reg, \reg, #1          /* rotate to the first tag bit */
-        add     \reg, \reg, #0          /* insert the low 12 bits of the tag */
-        add     \reg, \reg, #0, lsl 12  /* insert the top 12 bits of the tag */
-        ror     \reg, \reg, #63         /* rotate back */
-alternative_cb_end
-#endif
-.endm
-
-/*
  * Convert a hypervisor VA to a PA
  * reg: hypervisor address to be converted in place
  * tmp: temporary register
···
 #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset)
 
+/*
+ * Convert a kernel VA into a HYP VA.
+ *
+ * Can be called from hyp or non-hyp context.
+ *
+ * The actual code generation takes place in kvm_update_va_mask(), and
+ * the instructions below are only there to reserve the space and
+ * perform the register allocation (kvm_update_va_mask() uses the
+ * specific registers encoded in the instructions).
+ */
 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
+        /*
+         * This #ifndef is an optimisation for when this is called from VHE hyp
+         * context. When called from a VHE non-hyp context, kvm_update_va_mask()
+         * will replace the instructions with `nop`s.
+         */
 #ifndef __KVM_VHE_HYPERVISOR__
-        asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
-                                    "ror %0, %0, #1\n"
-                                    "add %0, %0, #0\n"
-                                    "add %0, %0, #0, lsl 12\n"
-                                    "ror %0, %0, #63\n",
+        asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"          /* mask with va_mask */
+                                    "ror %0, %0, #1\n"          /* rotate to the first tag bit */
+                                    "add %0, %0, #0\n"          /* insert the low 12 bits of the tag */
+                                    "add %0, %0, #0, lsl 12\n"  /* insert the top 12 bits of the tag */
+                                    "ror %0, %0, #63\n",        /* rotate back */
                                     ARM64_ALWAYS_SYSTEM,
                                     kvm_update_va_mask)
                      : "+r" (v));
-1
arch/arm64/include/asm/kvm_nested.h
···
         return ttbr0 & ~GENMASK_ULL(63, 48);
 }
 
-extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu);
 
 int kvm_init_nv_sysregs(struct kvm *kvm);
 
+2
arch/arm64/include/asm/kvm_pgtable.h
···
  * @KVM_PGTABLE_PROT_W:         Write permission.
  * @KVM_PGTABLE_PROT_R:         Read permission.
  * @KVM_PGTABLE_PROT_DEVICE:    Device attributes.
+ * @KVM_PGTABLE_PROT_NORMAL_NC: Normal noncacheable attributes.
  * @KVM_PGTABLE_PROT_SW0:       Software bit 0.
  * @KVM_PGTABLE_PROT_SW1:       Software bit 1.
  * @KVM_PGTABLE_PROT_SW2:       Software bit 2.
···
         KVM_PGTABLE_PROT_R      = BIT(2),
 
         KVM_PGTABLE_PROT_DEVICE = BIT(3),
+        KVM_PGTABLE_PROT_NORMAL_NC = BIT(4),
 
         KVM_PGTABLE_PROT_SW0    = BIT(55),
         KVM_PGTABLE_PROT_SW1    = BIT(56),
+2
arch/arm64/include/asm/memory.h
···
  * Memory types for Stage-2 translation
  */
 #define MT_S2_NORMAL            0xf
+#define MT_S2_NORMAL_NC         0x5
 #define MT_S2_DEVICE_nGnRE      0x1
 
 /*
···
  * Stage-2 enforces Normal-WB and Device-nGnRE
  */
 #define MT_S2_FWB_NORMAL        6
+#define MT_S2_FWB_NORMAL_NC     5
 #define MT_S2_FWB_DEVICE_nGnRE  1
 
 #ifdef CONFIG_ARM64_4K_PAGES
+4 -1
arch/arm64/include/asm/sysreg.h
···
                 par;                                            \
         })
 
+#define SYS_FIELD_VALUE(reg, field, val)        reg##_##field##_##val
+
 #define SYS_FIELD_GET(reg, field, val)          \
         FIELD_GET(reg##_##field##_MASK, val)
 
···
         FIELD_PREP(reg##_##field##_MASK, val)
 
 #define SYS_FIELD_PREP_ENUM(reg, field, val)            \
-        FIELD_PREP(reg##_##field##_MASK, reg##_##field##_##val)
+        FIELD_PREP(reg##_##field##_MASK,                \
+                   SYS_FIELD_VALUE(reg, field, val))
 
 #endif
 
+9 -6
arch/arm64/include/uapi/asm/kvm.h
···
 #include <asm/ptrace.h>
 #include <asm/sve_context.h>
 
-#define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_IRQ_LINE
-#define __KVM_HAVE_READONLY_MEM
 #define __KVM_HAVE_VCPU_EVENTS
 
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
···
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT       0
-#define KVM_ARM_DEVICE_TYPE_MASK        GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \
-                                                KVM_ARM_DEVICE_TYPE_SHIFT)
+#define KVM_ARM_DEVICE_TYPE_MASK        __GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \
+                                                  KVM_ARM_DEVICE_TYPE_SHIFT)
 #define KVM_ARM_DEVICE_ID_SHIFT         16
-#define KVM_ARM_DEVICE_ID_MASK          GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \
-                                                KVM_ARM_DEVICE_ID_SHIFT)
+#define KVM_ARM_DEVICE_ID_MASK          __GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \
+                                                  KVM_ARM_DEVICE_ID_SHIFT)
 
 /* Supported device IDs */
 #define KVM_ARM_DEVICE_VGIC_V2          0
···
         /* Used with KVM_CAP_ARM_USER_IRQ */
         __u64 device_irq_level;
 };
+
+/* Bits for run->s.regs.device_irq_level */
+#define KVM_ARM_DEV_EL1_VTIMER          (1 << 0)
+#define KVM_ARM_DEV_EL1_PTIMER          (1 << 1)
+#define KVM_ARM_DEV_PMU                 (1 << 2)
 
 /*
  * PMU filter structure. Describe a range of events with a particular
+95 -10
arch/arm64/kernel/cpufeature.c
···
         pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps);
 }
 
+#define __ARM64_MAX_POSITIVE(reg, field)                                \
+        ((reg##_##field##_SIGNED ?                                      \
+          BIT(reg##_##field##_WIDTH - 1) :                              \
+          BIT(reg##_##field##_WIDTH)) - 1)
+
+#define __ARM64_MIN_NEGATIVE(reg, field)  BIT(reg##_##field##_WIDTH - 1)
+
+#define __ARM64_CPUID_FIELDS(reg, field, min_value, max_value)          \
+        .sys_reg = SYS_##reg,                                           \
+        .field_pos = reg##_##field##_SHIFT,                             \
+        .field_width = reg##_##field##_WIDTH,                           \
+        .sign = reg##_##field##_SIGNED,                                 \
+        .min_field_value = min_value,                                   \
+        .max_field_value = max_value,
+
+/*
+ * ARM64_CPUID_FIELDS() encodes a field with a range from min_value to
+ * an implicit maximum that depends on the sign-ess of the field.
+ *
+ * An unsigned field will be capped at all ones, while a signed field
+ * will be limited to the positive half only.
+ */
 #define ARM64_CPUID_FIELDS(reg, field, min_value)                       \
-        .sys_reg = SYS_##reg,                                           \
-        .field_pos = reg##_##field##_SHIFT,                             \
-        .field_width = reg##_##field##_WIDTH,                           \
-        .sign = reg##_##field##_SIGNED,                                 \
-        .min_field_value = reg##_##field##_##min_value,
+        __ARM64_CPUID_FIELDS(reg, field,                                \
+                             SYS_FIELD_VALUE(reg, field, min_value),    \
+                             __ARM64_MAX_POSITIVE(reg, field))
+
+/*
+ * ARM64_CPUID_FIELDS_NEG() encodes a field with a range from an
+ * implicit minimal value to max_value. This should be used when
+ * matching a non-implemented property.
+ */
+#define ARM64_CPUID_FIELDS_NEG(reg, field, max_value)                   \
+        __ARM64_CPUID_FIELDS(reg, field,                                \
+                             __ARM64_MIN_NEGATIVE(reg, field),          \
+                             SYS_FIELD_VALUE(reg, field, max_value))
 
 #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \
         {                                               \
···
 static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = {
         ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0),
         ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0),
         ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = {
+        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0),
+        ARM64_FTR_END,
+};
···
         ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2,
                                &id_aa64mmfr2_override),
         ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3),
+        ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4),
 
         /* Op1 = 1, CRn = 0, CRm = 0 */
         ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid),
···
                         pr_warn("%s[%d:%d]: %s to %llx\n",
                                 reg->name,
                                 ftrp->shift + ftrp->width - 1,
-                                ftrp->shift, str, tmp);
+                                ftrp->shift, str,
+                                tmp & (BIT(ftrp->width) - 1));
                 } else if ((ftr_mask & reg->override->val) == ftr_mask) {
                         reg->override->val &= ~ftr_mask;
                         pr_warn("%s[%d:%d]: impossible override, ignored\n",
···
         init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1);
         init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
         init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3);
+        init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4);
         init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
         init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
         init_cpu_ftr_reg(SYS_ID_AA64PFR2_EL1, info->reg_id_aa64pfr2);
···
         read_sysreg_case(SYS_ID_AA64MMFR1_EL1);
         read_sysreg_case(SYS_ID_AA64MMFR2_EL1);
         read_sysreg_case(SYS_ID_AA64MMFR3_EL1);
+        read_sysreg_case(SYS_ID_AA64MMFR4_EL1);
         read_sysreg_case(SYS_ID_AA64ISAR0_EL1);
         read_sysreg_case(SYS_ID_AA64ISAR1_EL1);
         read_sysreg_case(SYS_ID_AA64ISAR2_EL1);
···
 static bool
 feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry)
 {
-        int val = cpuid_feature_extract_field_width(reg, entry->field_pos,
-                                                    entry->field_width,
-                                                    entry->sign);
+        int val, min, max;
+        u64 tmp;
 
-        return val >= entry->min_field_value;
+        val = cpuid_feature_extract_field_width(reg, entry->field_pos,
+                                                entry->field_width,
+                                                entry->sign);
+
+        tmp = entry->min_field_value;
+        tmp <<= entry->field_pos;
+
+        min = cpuid_feature_extract_field_width(tmp, entry->field_pos,
+                                                entry->field_width,
+                                                entry->sign);
+
+        tmp = entry->max_field_value;
+        tmp <<= entry->field_pos;
+
+        max = cpuid_feature_extract_field_width(tmp, entry->field_pos,
+                                                entry->field_width,
+                                                entry->sign);
+
+        return val >= min && val <= max;
 }
 
 static u64
···
         }
 
         return !meltdown_safe;
+}
+
+static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope)
+{
+        /*
+         * Although the Apple M2 family appears to support NV1, the
+         * PTW barfs on the nVHE EL2 S1 page table format. Pretend
+         * that it doesn't support NV1 at all.
+         */
+        static const struct midr_range nv1_ni_list[] = {
+                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
+                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
+                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
+                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
+                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
+                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
+                {}
+        };
+
+        return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) &&
+                !(has_cpuid_feature(entry, scope) ||
+                  is_midr_in_range_list(read_cpuid_id(), nv1_ni_list)));
 }
 
 #if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2)
···
 #endif
         },
 #endif
+        {
+                .desc = "NV1",
+                .capability = ARM64_HAS_HCR_NV1,
+                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+                .matches = has_nv1,
+                ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1)
+        },
         {},
 };
 
+1
arch/arm64/kernel/cpuinfo.c
··· 463 463 info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); 464 464 info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); 465 465 info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); 466 + info->reg_id_aa64mmfr4 = read_cpuid(ID_AA64MMFR4_EL1); 466 467 info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); 467 468 info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); 468 469 info->reg_id_aa64pfr2 = read_cpuid(ID_AA64PFR2_EL1);
+15 -8
arch/arm64/kernel/head.S
··· 304 304 mov_q x1, INIT_SCTLR_EL1_MMU_OFF 305 305 306 306 /* 307 - * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, 308 - * making it impossible to start in nVHE mode. Is that 309 - * compliant with the architecture? Absolutely not! 307 + * Compliant CPUs advertise their VHE-onlyness with 308 + * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be 309 + * RES1 in that case. 310 + * 311 + * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, but 312 + * don't advertise it (they predate this relaxation). 310 313 */ 314 + mrs_s x0, SYS_ID_AA64MMFR4_EL1 315 + ubfx x0, x0, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH 316 + tbnz x0, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f 317 + 311 318 mrs x0, hcr_el2 312 319 and x0, x0, #HCR_E2H 313 - cbz x0, 1f 314 - 320 + cbz x0, 2f 321 + 1: 315 322 /* Set a sane SCTLR_EL1, the VHE way */ 316 323 pre_disable_mmu_workaround 317 324 msr_s SYS_SCTLR_EL12, x1 318 325 mov x2, #BOOT_CPU_FLAG_E2H 319 - b 2f 326 + b 3f 320 327 321 - 1: 328 + 2: 322 329 pre_disable_mmu_workaround 323 330 msr sctlr_el1, x1 324 331 mov x2, xzr 325 - 2: 332 + 3: 326 333 __init_el2_nvhe_prepare_eret 327 334 328 335 mov w0, #BOOT_CPU_MODE_EL2
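The new head.S sequence reads ID_AA64MMFR4_EL1, extracts E2H0 with ubfx, and uses tbnz on the field's sign bit to detect a VHE-only CPU (a negative E2H0 means HCR_EL2.E2H may be RES1). A C rendering of that bit test; the shift/width values in the assertions are illustrative placeholders, not the architectural constants:

```c
#include <assert.h>
#include <stdint.h>

/* ubfx xd, xn, #shift, #width: unsigned bitfield extract. */
static uint64_t ubfx(uint64_t reg, unsigned int shift, unsigned int width)
{
	return (reg >> shift) & ((1ULL << width) - 1);
}

/* A signed ID field is negative exactly when its top bit is set,
 * which is what the tbnz in the boot path is checking for. */
static int field_is_negative(uint64_t reg, unsigned int shift,
			     unsigned int width)
{
	return (ubfx(reg, shift, width) >> (width - 1)) & 1;
}
```

A field value of 0xF reads back as negative (feature encoded as "NI"), while 0x1 does not.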
+12 -3
arch/arm64/kvm/Kconfig
··· 19 19 20 20 menuconfig KVM 21 21 bool "Kernel-based Virtual Machine (KVM) support" 22 - depends on HAVE_KVM 23 22 select KVM_COMMON 24 23 select KVM_GENERIC_HARDWARE_ENABLING 25 24 select KVM_GENERIC_MMU_NOTIFIER ··· 32 33 select HAVE_KVM_MSI 33 34 select HAVE_KVM_IRQCHIP 34 35 select HAVE_KVM_IRQ_ROUTING 35 - select IRQ_BYPASS_MANAGER 36 36 select HAVE_KVM_IRQ_BYPASS 37 + select HAVE_KVM_READONLY_MEM 37 38 select HAVE_KVM_VCPU_RUN_PID_CHANGE 38 39 select SCHED_INFO 39 40 select GUEST_PERF_EVENTS if PERF_EVENTS 40 - select XARRAY_MULTI 41 41 help 42 42 Support hosting virtualized guest machines. 43 43 ··· 64 66 say N. 65 67 66 68 If unsure, or not using protected nVHE (pKVM), say N. 69 + 70 + config KVM_ARM64_RES_BITS_PARANOIA 71 + bool "Build-time check of RES0/RES1 bits" 72 + depends on KVM 73 + default n 74 + help 75 + Say Y here to validate that KVM's knowledge of most system 76 + registers' RES0/RES1 bits matches what the rest of the kernel 77 + defines. Expect the build to fail badly if you enable this. 78 + 79 + Just say N. 67 80 68 81 endif # VIRTUALIZATION
+1 -1
arch/arm64/kvm/arch_timer.c
··· 745 745 WARN_ON_ONCE(ret); 746 746 747 747 /* 748 - * The virtual offset behaviour is "interresting", as it 748 + * The virtual offset behaviour is "interesting", as it 749 749 * always applies when HCR_EL2.E2H==0, but only when 750 750 * accessed from EL1 when HCR_EL2.E2H==1. So make sure we 751 751 * track E2H when putting the HV timer in "direct" mode.
+13 -1
arch/arm64/kvm/arm.c
··· 190 190 return VM_FAULT_SIGBUS; 191 191 } 192 192 193 + void kvm_arch_create_vm_debugfs(struct kvm *kvm) 194 + { 195 + kvm_sys_regs_create_debugfs(kvm); 196 + } 193 197 194 198 /** 195 199 * kvm_arch_destroy_vm - destroy the VM data structure ··· 210 206 pkvm_destroy_hyp_vm(kvm); 211 207 212 208 kfree(kvm->arch.mpidr_data); 209 + kfree(kvm->arch.sysreg_masks); 213 210 kvm_destroy_vcpus(kvm); 214 211 215 212 kvm_unshare_hyp(kvm, kvm + 1); ··· 678 673 if (ret) 679 674 return ret; 680 675 } 676 + 677 + /* 678 + * This needs to happen after NV has imposed its own restrictions on 679 + * the feature set 680 + */ 681 + kvm_init_sysreg(vcpu); 681 682 682 683 ret = kvm_timer_enable(vcpu); 683 684 if (ret) ··· 2602 2591 } else if (in_hyp_mode) { 2603 2592 kvm_info("VHE mode initialized successfully\n"); 2604 2593 } else { 2605 - kvm_info("Hyp mode initialized successfully\n"); 2594 + char mode = cpus_have_final_cap(ARM64_KVM_HVHE) ? 'h' : 'n'; 2595 + kvm_info("Hyp mode (%cVHE) initialized successfully\n", mode); 2606 2596 } 2607 2597 2608 2598 /*
+125
arch/arm64/kvm/check-res-bits.h
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2024 - Google LLC 4 + * Author: Marc Zyngier <maz@kernel.org> 5 + */ 6 + 7 + #include <asm/sysreg-defs.h> 8 + 9 + /* 10 + * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING 11 + * 12 + * If any of these BUILD_BUG_ON() fails, that's because some bits that 13 + * were reserved have gained some other meaning, and KVM needs to know 14 + * about those. 15 + * 16 + * In such case, do *NOT* blindly change the assertion so that it 17 + * passes, but also teach the rest of the code about the actual 18 + * change. 19 + * 20 + * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING 21 + */ 22 + static inline void check_res_bits(void) 23 + { 24 + #ifdef CONFIG_KVM_ARM64_RES_BITS_PARANOIA 25 + 26 + BUILD_BUG_ON(OSDTRRX_EL1_RES0 != (GENMASK_ULL(63, 32))); 27 + BUILD_BUG_ON(MDCCINT_EL1_RES0 != (GENMASK_ULL(63, 31) | GENMASK_ULL(28, 0))); 28 + BUILD_BUG_ON(MDSCR_EL1_RES0 != (GENMASK_ULL(63, 36) | GENMASK_ULL(28, 28) | GENMASK_ULL(25, 24) | GENMASK_ULL(20, 20) | GENMASK_ULL(18, 16) | GENMASK_ULL(11, 7) | GENMASK_ULL(5, 1))); 29 + BUILD_BUG_ON(OSDTRTX_EL1_RES0 != (GENMASK_ULL(63, 32))); 30 + BUILD_BUG_ON(OSECCR_EL1_RES0 != (GENMASK_ULL(63, 32))); 31 + BUILD_BUG_ON(OSLAR_EL1_RES0 != (GENMASK_ULL(63, 1))); 32 + BUILD_BUG_ON(ID_PFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); 33 + BUILD_BUG_ON(ID_PFR1_EL1_RES0 != (GENMASK_ULL(63, 32))); 34 + BUILD_BUG_ON(ID_DFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); 35 + BUILD_BUG_ON(ID_AFR0_EL1_RES0 != (GENMASK_ULL(63, 16))); 36 + BUILD_BUG_ON(ID_MMFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); 37 + BUILD_BUG_ON(ID_MMFR1_EL1_RES0 != (GENMASK_ULL(63, 32))); 38 + BUILD_BUG_ON(ID_MMFR2_EL1_RES0 != (GENMASK_ULL(63, 32))); 39 + BUILD_BUG_ON(ID_MMFR3_EL1_RES0 != (GENMASK_ULL(63, 32))); 40 + BUILD_BUG_ON(ID_ISAR0_EL1_RES0 != (GENMASK_ULL(63, 28))); 41 + BUILD_BUG_ON(ID_ISAR1_EL1_RES0 != (GENMASK_ULL(63, 32))); 42 + BUILD_BUG_ON(ID_ISAR2_EL1_RES0 != (GENMASK_ULL(63, 32))); 43 
+ BUILD_BUG_ON(ID_ISAR3_EL1_RES0 != (GENMASK_ULL(63, 32))); 44 + BUILD_BUG_ON(ID_ISAR4_EL1_RES0 != (GENMASK_ULL(63, 32))); 45 + BUILD_BUG_ON(ID_ISAR5_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(23, 20))); 46 + BUILD_BUG_ON(ID_ISAR6_EL1_RES0 != (GENMASK_ULL(63, 28))); 47 + BUILD_BUG_ON(ID_MMFR4_EL1_RES0 != (GENMASK_ULL(63, 32))); 48 + BUILD_BUG_ON(MVFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); 49 + BUILD_BUG_ON(MVFR1_EL1_RES0 != (GENMASK_ULL(63, 32))); 50 + BUILD_BUG_ON(MVFR2_EL1_RES0 != (GENMASK_ULL(63, 8))); 51 + BUILD_BUG_ON(ID_PFR2_EL1_RES0 != (GENMASK_ULL(63, 12))); 52 + BUILD_BUG_ON(ID_DFR1_EL1_RES0 != (GENMASK_ULL(63, 8))); 53 + BUILD_BUG_ON(ID_MMFR5_EL1_RES0 != (GENMASK_ULL(63, 8))); 54 + BUILD_BUG_ON(ID_AA64PFR1_EL1_RES0 != (GENMASK_ULL(23, 20))); 55 + BUILD_BUG_ON(ID_AA64PFR2_EL1_RES0 != (GENMASK_ULL(63, 36) | GENMASK_ULL(31, 12))); 56 + BUILD_BUG_ON(ID_AA64ZFR0_EL1_RES0 != (GENMASK_ULL(63, 60) | GENMASK_ULL(51, 48) | GENMASK_ULL(39, 36) | GENMASK_ULL(31, 28) | GENMASK_ULL(15, 8))); 57 + BUILD_BUG_ON(ID_AA64SMFR0_EL1_RES0 != (GENMASK_ULL(62, 61) | GENMASK_ULL(51, 49) | GENMASK_ULL(31, 31) | GENMASK_ULL(27, 0))); 58 + BUILD_BUG_ON(ID_AA64FPFR0_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 2))); 59 + BUILD_BUG_ON(ID_AA64DFR0_EL1_RES0 != (GENMASK_ULL(27, 24) | GENMASK_ULL(19, 16))); 60 + BUILD_BUG_ON(ID_AA64DFR1_EL1_RES0 != (GENMASK_ULL(63, 0))); 61 + BUILD_BUG_ON(ID_AA64AFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); 62 + BUILD_BUG_ON(ID_AA64AFR1_EL1_RES0 != (GENMASK_ULL(63, 0))); 63 + BUILD_BUG_ON(ID_AA64ISAR0_EL1_RES0 != (GENMASK_ULL(3, 0))); 64 + BUILD_BUG_ON(ID_AA64ISAR2_EL1_RES0 != (GENMASK_ULL(47, 44))); 65 + BUILD_BUG_ON(ID_AA64ISAR3_EL1_RES0 != (GENMASK_ULL(63, 16))); 66 + BUILD_BUG_ON(ID_AA64MMFR0_EL1_RES0 != (GENMASK_ULL(55, 48))); 67 + BUILD_BUG_ON(ID_AA64MMFR2_EL1_RES0 != (GENMASK_ULL(47, 44))); 68 + BUILD_BUG_ON(ID_AA64MMFR3_EL1_RES0 != (GENMASK_ULL(51, 48))); 69 + BUILD_BUG_ON(ID_AA64MMFR4_EL1_RES0 != (GENMASK_ULL(63, 40) | GENMASK_ULL(35, 28) | 
GENMASK_ULL(3, 0))); 70 + BUILD_BUG_ON(SCTLR_EL1_RES0 != (GENMASK_ULL(17, 17))); 71 + BUILD_BUG_ON(CPACR_ELx_RES0 != (GENMASK_ULL(63, 30) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | GENMASK_ULL(19, 18) | GENMASK_ULL(15, 0))); 72 + BUILD_BUG_ON(SMPRI_EL1_RES0 != (GENMASK_ULL(63, 4))); 73 + BUILD_BUG_ON(ZCR_ELx_RES0 != (GENMASK_ULL(63, 9))); 74 + BUILD_BUG_ON(SMCR_ELx_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(29, 9))); 75 + BUILD_BUG_ON(GCSCR_ELx_RES0 != (GENMASK_ULL(63, 10) | GENMASK_ULL(7, 7) | GENMASK_ULL(4, 1))); 76 + BUILD_BUG_ON(GCSPR_ELx_RES0 != (GENMASK_ULL(2, 0))); 77 + BUILD_BUG_ON(GCSCRE0_EL1_RES0 != (GENMASK_ULL(63, 11) | GENMASK_ULL(7, 6) | GENMASK_ULL(4, 1))); 78 + BUILD_BUG_ON(ALLINT_RES0 != (GENMASK_ULL(63, 14) | GENMASK_ULL(12, 0))); 79 + BUILD_BUG_ON(PMSCR_EL1_RES0 != (GENMASK_ULL(63, 8) | GENMASK_ULL(2, 2))); 80 + BUILD_BUG_ON(PMSICR_EL1_RES0 != (GENMASK_ULL(55, 32))); 81 + BUILD_BUG_ON(PMSIRR_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(7, 1))); 82 + BUILD_BUG_ON(PMSFCR_EL1_RES0 != (GENMASK_ULL(63, 19) | GENMASK_ULL(15, 4))); 83 + BUILD_BUG_ON(PMSLATFR_EL1_RES0 != (GENMASK_ULL(63, 16))); 84 + BUILD_BUG_ON(PMSIDR_EL1_RES0 != (GENMASK_ULL(63, 25) | GENMASK_ULL(7, 7))); 85 + BUILD_BUG_ON(PMBLIMITR_EL1_RES0 != (GENMASK_ULL(11, 6) | GENMASK_ULL(4, 3))); 86 + BUILD_BUG_ON(PMBSR_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(25, 20))); 87 + BUILD_BUG_ON(PMBIDR_EL1_RES0 != (GENMASK_ULL(63, 12) | GENMASK_ULL(7, 6))); 88 + BUILD_BUG_ON(CONTEXTIDR_ELx_RES0 != (GENMASK_ULL(63, 32))); 89 + BUILD_BUG_ON(CCSIDR_EL1_RES0 != (GENMASK_ULL(63, 32))); 90 + BUILD_BUG_ON(CLIDR_EL1_RES0 != (GENMASK_ULL(63, 47))); 91 + BUILD_BUG_ON(CCSIDR2_EL1_RES0 != (GENMASK_ULL(63, 24))); 92 + BUILD_BUG_ON(GMID_EL1_RES0 != (GENMASK_ULL(63, 4))); 93 + BUILD_BUG_ON(SMIDR_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(14, 12))); 94 + BUILD_BUG_ON(CSSELR_EL1_RES0 != (GENMASK_ULL(63, 5))); 95 + BUILD_BUG_ON(CTR_EL0_RES0 != (GENMASK_ULL(63, 38) | GENMASK_ULL(30, 30) | GENMASK_ULL(13, 
4))); 96 + BUILD_BUG_ON(CTR_EL0_RES1 != (GENMASK_ULL(31, 31))); 97 + BUILD_BUG_ON(DCZID_EL0_RES0 != (GENMASK_ULL(63, 5))); 98 + BUILD_BUG_ON(SVCR_RES0 != (GENMASK_ULL(63, 2))); 99 + BUILD_BUG_ON(FPMR_RES0 != (GENMASK_ULL(63, 38) | GENMASK_ULL(23, 23) | GENMASK_ULL(13, 9))); 100 + BUILD_BUG_ON(HFGxTR_EL2_RES0 != (GENMASK_ULL(51, 51))); 101 + BUILD_BUG_ON(HFGITR_EL2_RES0 != (GENMASK_ULL(63, 63) | GENMASK_ULL(61, 61))); 102 + BUILD_BUG_ON(HDFGRTR_EL2_RES0 != (GENMASK_ULL(49, 49) | GENMASK_ULL(42, 42) | GENMASK_ULL(39, 38) | GENMASK_ULL(21, 20) | GENMASK_ULL(8, 8))); 103 + BUILD_BUG_ON(HDFGWTR_EL2_RES0 != (GENMASK_ULL(63, 63) | GENMASK_ULL(59, 58) | GENMASK_ULL(51, 51) | GENMASK_ULL(47, 47) | GENMASK_ULL(43, 43) | GENMASK_ULL(40, 38) | GENMASK_ULL(34, 34) | GENMASK_ULL(30, 30) | GENMASK_ULL(22, 22) | GENMASK_ULL(9, 9) | GENMASK_ULL(6, 6))); 104 + BUILD_BUG_ON(HAFGRTR_EL2_RES0 != (GENMASK_ULL(63, 50) | GENMASK_ULL(16, 5))); 105 + BUILD_BUG_ON(HCRX_EL2_RES0 != (GENMASK_ULL(63, 25) | GENMASK_ULL(13, 12))); 106 + BUILD_BUG_ON(DACR32_EL2_RES0 != (GENMASK_ULL(63, 32))); 107 + BUILD_BUG_ON(PMSCR_EL2_RES0 != (GENMASK_ULL(63, 8) | GENMASK_ULL(2, 2))); 108 + BUILD_BUG_ON(TCR2_EL1x_RES0 != (GENMASK_ULL(63, 16) | GENMASK_ULL(13, 12) | GENMASK_ULL(9, 6))); 109 + BUILD_BUG_ON(TCR2_EL2_RES0 != (GENMASK_ULL(63, 16))); 110 + BUILD_BUG_ON(LORSA_EL1_RES0 != (GENMASK_ULL(63, 52) | GENMASK_ULL(15, 1))); 111 + BUILD_BUG_ON(LOREA_EL1_RES0 != (GENMASK_ULL(63, 52) | GENMASK_ULL(15, 0))); 112 + BUILD_BUG_ON(LORN_EL1_RES0 != (GENMASK_ULL(63, 8))); 113 + BUILD_BUG_ON(LORC_EL1_RES0 != (GENMASK_ULL(63, 10) | GENMASK_ULL(1, 1))); 114 + BUILD_BUG_ON(LORID_EL1_RES0 != (GENMASK_ULL(63, 24) | GENMASK_ULL(15, 8))); 115 + BUILD_BUG_ON(ISR_EL1_RES0 != (GENMASK_ULL(63, 11) | GENMASK_ULL(5, 0))); 116 + BUILD_BUG_ON(ICC_NMIAR1_EL1_RES0 != (GENMASK_ULL(63, 24))); 117 + BUILD_BUG_ON(TRBLIMITR_EL1_RES0 != (GENMASK_ULL(11, 7))); 118 + BUILD_BUG_ON(TRBBASER_EL1_RES0 != (GENMASK_ULL(11, 0))); 119 + 
BUILD_BUG_ON(TRBSR_EL1_RES0 != (GENMASK_ULL(63, 56) | GENMASK_ULL(25, 24) | GENMASK_ULL(19, 19) | GENMASK_ULL(16, 16))); 120 + BUILD_BUG_ON(TRBMAR_EL1_RES0 != (GENMASK_ULL(63, 12))); 121 + BUILD_BUG_ON(TRBTRG_EL1_RES0 != (GENMASK_ULL(63, 32))); 122 + BUILD_BUG_ON(TRBIDR_EL1_RES0 != (GENMASK_ULL(63, 12) | GENMASK_ULL(7, 6))); 123 + 124 + #endif 125 + }
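check-res-bits.h turns KVM's knowledge of each register's RES0/RES1 layout into compile-time checks using GENMASK_ULL() and BUILD_BUG_ON(). A userspace approximation of those two kernel helpers, with simplified sketch implementations under the same names:

```c
#include <assert.h>
#include <stdint.h>

/* Build a 64-bit mask with bits [h:l] set, like the kernel macro. */
#define GENMASK_ULL(h, l) \
	((~0ULL >> (63 - (h))) & (~0ULL << (l)))

/* Compile-time assertion: fails the build when cond is true,
 * mirroring how check_res_bits() catches stale RES0/RES1 masks. */
#define BUILD_BUG_ON(cond) _Static_assert(!(cond), "RES bits mismatch")

/* Example check in the same shape as the ones above. */
BUILD_BUG_ON(GENMASK_ULL(63, 32) != 0xFFFFFFFF00000000ULL);
```

Because the check is a _Static_assert, a reserved bit gaining a meaning shows up as a build failure rather than a silent runtime mismatch, which is the whole point of the WARNING comment at the top of the file.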
+2 -1
arch/arm64/kvm/debug.c
··· 23 23 24 24 static DEFINE_PER_CPU(u64, mdcr_el2); 25 25 26 - /** 26 + /* 27 27 * save/restore_guest_debug_regs 28 28 * 29 29 * For some debug operations we need to tweak some guest registers. As ··· 143 143 144 144 /** 145 145 * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state 146 + * @vcpu: the vcpu pointer 146 147 */ 147 148 148 149 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
+176 -55
arch/arm64/kvm/emulate-nested.c
··· 427 427 * [19:14] bit number in the FGT register (6 bits) 428 428 * [20] trap polarity (1 bit) 429 429 * [25:21] FG filter (5 bits) 430 - * [62:26] Unused (37 bits) 430 + * [35:26] Main SysReg table index (10 bits) 431 + * [62:36] Unused (27 bits) 431 432 * [63] RES0 - Must be zero, as lost on insertion in the xarray 432 433 */ 433 434 #define TC_CGT_BITS 10 434 435 #define TC_FGT_BITS 4 435 436 #define TC_FGF_BITS 5 437 + #define TC_SRI_BITS 10 436 438 437 439 union trap_config { 438 440 u64 val; ··· 444 442 unsigned long bit:6; /* Bit number */ 445 443 unsigned long pol:1; /* Polarity */ 446 444 unsigned long fgf:TC_FGF_BITS; /* Fine Grained Filter */ 447 - unsigned long unused:37; /* Unused, should be zero */ 445 + unsigned long sri:TC_SRI_BITS; /* SysReg Index */ 446 + unsigned long unused:27; /* Unused, should be zero */ 448 447 unsigned long mbz:1; /* Must Be Zero */ 449 448 }; 450 449 }; ··· 1008 1005 }; 1009 1006 1010 1007 static DEFINE_XARRAY(sr_forward_xa); 1011 - 1012 - enum fgt_group_id { 1013 - __NO_FGT_GROUP__, 1014 - HFGxTR_GROUP, 1015 - HDFGRTR_GROUP, 1016 - HDFGWTR_GROUP, 1017 - HFGITR_GROUP, 1018 - HAFGRTR_GROUP, 1019 - 1020 - /* Must be last */ 1021 - __NR_FGT_GROUP_IDS__ 1022 - }; 1023 1008 1024 1009 enum fg_filter_id { 1025 1010 __NO_FGF__, ··· 1748 1757 err); 1749 1758 } 1750 1759 1760 + static u32 encoding_next(u32 encoding) 1761 + { 1762 + u8 op0, op1, crn, crm, op2; 1763 + 1764 + op0 = sys_reg_Op0(encoding); 1765 + op1 = sys_reg_Op1(encoding); 1766 + crn = sys_reg_CRn(encoding); 1767 + crm = sys_reg_CRm(encoding); 1768 + op2 = sys_reg_Op2(encoding); 1769 + 1770 + if (op2 < Op2_mask) 1771 + return sys_reg(op0, op1, crn, crm, op2 + 1); 1772 + if (crm < CRm_mask) 1773 + return sys_reg(op0, op1, crn, crm + 1, 0); 1774 + if (crn < CRn_mask) 1775 + return sys_reg(op0, op1, crn + 1, 0, 0); 1776 + if (op1 < Op1_mask) 1777 + return sys_reg(op0, op1 + 1, 0, 0, 0); 1778 + 1779 + return sys_reg(op0 + 1, 0, 0, 0, 0); 1780 + } 1781 + 1751 1782 int 
__init populate_nv_trap_config(void) 1752 1783 { 1753 1784 int ret = 0; ··· 1788 1775 ret = -EINVAL; 1789 1776 } 1790 1777 1791 - if (cgt->encoding != cgt->end) { 1792 - prev = xa_store_range(&sr_forward_xa, 1793 - cgt->encoding, cgt->end, 1794 - xa_mk_value(cgt->tc.val), 1795 - GFP_KERNEL); 1796 - } else { 1797 - prev = xa_store(&sr_forward_xa, cgt->encoding, 1778 + for (u32 enc = cgt->encoding; enc <= cgt->end; enc = encoding_next(enc)) { 1779 + prev = xa_store(&sr_forward_xa, enc, 1798 1780 xa_mk_value(cgt->tc.val), GFP_KERNEL); 1799 1781 if (prev && !xa_is_err(prev)) { 1800 1782 ret = -EINVAL; 1801 1783 print_nv_trap_error(cgt, "Duplicate CGT", ret); 1802 1784 } 1803 - } 1804 1785 1805 - if (xa_is_err(prev)) { 1806 - ret = xa_err(prev); 1807 - print_nv_trap_error(cgt, "Failed CGT insertion", ret); 1786 + if (xa_is_err(prev)) { 1787 + ret = xa_err(prev); 1788 + print_nv_trap_error(cgt, "Failed CGT insertion", ret); 1789 + } 1808 1790 } 1809 1791 } 1810 1792 ··· 1812 1804 for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) { 1813 1805 const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i]; 1814 1806 union trap_config tc; 1807 + void *prev; 1815 1808 1816 1809 if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) { 1817 1810 ret = -EINVAL; ··· 1827 1818 } 1828 1819 1829 1820 tc.val |= fgt->tc.val; 1830 - xa_store(&sr_forward_xa, fgt->encoding, 1831 - xa_mk_value(tc.val), GFP_KERNEL); 1821 + prev = xa_store(&sr_forward_xa, fgt->encoding, 1822 + xa_mk_value(tc.val), GFP_KERNEL); 1823 + 1824 + if (xa_is_err(prev)) { 1825 + ret = xa_err(prev); 1826 + print_nv_trap_error(fgt, "Failed FGT insertion", ret); 1827 + } 1832 1828 } 1833 1829 1834 1830 kvm_info("nv: %ld fine grained trap handlers\n", ··· 1857 1843 xa_destroy(&sr_forward_xa); 1858 1844 1859 1845 return ret; 1846 + } 1847 + 1848 + int __init populate_sysreg_config(const struct sys_reg_desc *sr, 1849 + unsigned int idx) 1850 + { 1851 + union trap_config tc; 1852 + u32 encoding; 1853 + void *ret; 1854 + 1855 + /* 
1856 + * 0 is a valid value for the index, but not for the storage. 1857 + * We'll store (idx+1), so check against an offset'd limit. 1858 + */ 1859 + if (idx >= (BIT(TC_SRI_BITS) - 1)) { 1860 + kvm_err("sysreg %s (%d) out of range\n", sr->name, idx); 1861 + return -EINVAL; 1862 + } 1863 + 1864 + encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2); 1865 + tc = get_trap_config(encoding); 1866 + 1867 + if (tc.sri) { 1868 + kvm_err("sysreg %s (%d) duplicate entry (%d)\n", 1869 + sr->name, idx - 1, tc.sri); 1870 + return -EINVAL; 1871 + } 1872 + 1873 + tc.sri = idx + 1; 1874 + ret = xa_store(&sr_forward_xa, encoding, 1875 + xa_mk_value(tc.val), GFP_KERNEL); 1876 + 1877 + return xa_err(ret); 1860 1878 } 1861 1879 1862 1880 static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu, ··· 1938 1892 return __compute_trap_behaviour(vcpu, tc.cgt, b); 1939 1893 } 1940 1894 1941 - static bool check_fgt_bit(u64 val, const union trap_config tc) 1895 + static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) 1942 1896 { 1943 - return ((val >> tc.bit) & 1) == tc.pol; 1897 + struct kvm_sysreg_masks *masks; 1898 + 1899 + /* Only handle the VNCR-backed regs for now */ 1900 + if (sr < __VNCR_START__) 1901 + return 0; 1902 + 1903 + masks = kvm->arch.sysreg_masks; 1904 + 1905 + return masks->mask[sr - __VNCR_START__].res0; 1944 1906 } 1945 1907 1946 - #define sanitised_sys_reg(vcpu, reg) \ 1947 - ({ \ 1948 - u64 __val; \ 1949 - __val = __vcpu_sys_reg(vcpu, reg); \ 1950 - __val &= ~__ ## reg ## _RES0; \ 1951 - (__val); \ 1952 - }) 1908 + static bool check_fgt_bit(struct kvm *kvm, bool is_read, 1909 + u64 val, const union trap_config tc) 1910 + { 1911 + enum vcpu_sysreg sr; 1953 1912 1954 - bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) 1913 + if (tc.pol) 1914 + return (val & BIT(tc.bit)); 1915 + 1916 + /* 1917 + * FGTs with negative polarities are an absolute nightmare, as 1918 + * we need to evaluate the bit in the light of the feature 1919 + * that defines 
it. WTF were they thinking? 1920 + * 1921 + * So let's check if the bit has been earmarked as RES0, as 1922 + * this indicates an unimplemented feature. 1923 + */ 1924 + if (val & BIT(tc.bit)) 1925 + return false; 1926 + 1927 + switch ((enum fgt_group_id)tc.fgt) { 1928 + case HFGxTR_GROUP: 1929 + sr = is_read ? HFGRTR_EL2 : HFGWTR_EL2; 1930 + break; 1931 + 1932 + case HDFGRTR_GROUP: 1933 + sr = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2; 1934 + break; 1935 + 1936 + case HAFGRTR_GROUP: 1937 + sr = HAFGRTR_EL2; 1938 + break; 1939 + 1940 + case HFGITR_GROUP: 1941 + sr = HFGITR_EL2; 1942 + break; 1943 + 1944 + default: 1945 + WARN_ONCE(1, "Unhandled FGT group"); 1946 + return false; 1947 + } 1948 + 1949 + return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit)); 1950 + } 1951 + 1952 + bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) 1955 1953 { 1956 1954 union trap_config tc; 1957 1955 enum trap_behaviour b; 1958 1956 bool is_read; 1959 1957 u32 sysreg; 1960 1958 u64 esr, val; 1961 - 1962 - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) 1963 - return false; 1964 1959 1965 1960 esr = kvm_vcpu_get_esr(vcpu); 1966 1961 sysreg = esr_sys64_to_sysreg(esr); ··· 2013 1926 * A value of 0 for the whole entry means that we know nothing 2014 1927 * for this sysreg, and that it cannot be re-injected into the 2015 1928 * nested hypervisor. In this situation, let's cut it short. 2016 - * 2017 - * Note that ultimately, we could also make use of the xarray 2018 - * to store the index of the sysreg in the local descriptor 2019 - * array, avoiding another search... Hint, hint... 2020 1929 */ 2021 1930 if (!tc.val) 2022 - return false; 1931 + goto local; 1932 + 1933 + /* 1934 + * If a sysreg can be trapped using a FGT, first check whether we 1935 + * trap for the purpose of forbidding the feature. In that case, 1936 + * inject an UNDEF. 
1937 + */ 1938 + if (tc.fgt != __NO_FGT_GROUP__ && 1939 + (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) { 1940 + kvm_inject_undefined(vcpu); 1941 + return true; 1942 + } 1943 + 1944 + /* 1945 + * If we're not nesting, immediately return to the caller, with the 1946 + * sysreg index, should we have it. 1947 + */ 1948 + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) 1949 + goto local; 2023 1950 2024 1951 switch ((enum fgt_group_id)tc.fgt) { 2025 1952 case __NO_FGT_GROUP__: ··· 2041 1940 2042 1941 case HFGxTR_GROUP: 2043 1942 if (is_read) 2044 - val = sanitised_sys_reg(vcpu, HFGRTR_EL2); 1943 + val = __vcpu_sys_reg(vcpu, HFGRTR_EL2); 2045 1944 else 2046 - val = sanitised_sys_reg(vcpu, HFGWTR_EL2); 1945 + val = __vcpu_sys_reg(vcpu, HFGWTR_EL2); 2047 1946 break; 2048 1947 2049 1948 case HDFGRTR_GROUP: 2050 - case HDFGWTR_GROUP: 2051 1949 if (is_read) 2052 - val = sanitised_sys_reg(vcpu, HDFGRTR_EL2); 1950 + val = __vcpu_sys_reg(vcpu, HDFGRTR_EL2); 2053 1951 else 2054 - val = sanitised_sys_reg(vcpu, HDFGWTR_EL2); 1952 + val = __vcpu_sys_reg(vcpu, HDFGWTR_EL2); 2055 1953 break; 2056 1954 2057 1955 case HAFGRTR_GROUP: 2058 - val = sanitised_sys_reg(vcpu, HAFGRTR_EL2); 1956 + val = __vcpu_sys_reg(vcpu, HAFGRTR_EL2); 2059 1957 break; 2060 1958 2061 1959 case HFGITR_GROUP: 2062 - val = sanitised_sys_reg(vcpu, HFGITR_EL2); 1960 + val = __vcpu_sys_reg(vcpu, HFGITR_EL2); 2063 1961 switch (tc.fgf) { 2064 1962 u64 tmp; 2065 1963 ··· 2066 1966 break; 2067 1967 2068 1968 case HCRX_FGTnXS: 2069 - tmp = sanitised_sys_reg(vcpu, HCRX_EL2); 1969 + tmp = __vcpu_sys_reg(vcpu, HCRX_EL2); 2070 1970 if (tmp & HCRX_EL2_FGTnXS) 2071 1971 tc.fgt = __NO_FGT_GROUP__; 2072 1972 } ··· 2075 1975 case __NR_FGT_GROUP_IDS__: 2076 1976 /* Something is really wrong, bail out */ 2077 1977 WARN_ONCE(1, "__NR_FGT_GROUP_IDS__"); 2078 - return false; 1978 + goto local; 2079 1979 } 2080 1980 2081 - if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(val, tc)) 1981 + if (tc.fgt != __NO_FGT_GROUP__ && 
check_fgt_bit(vcpu->kvm, is_read, 1982 + val, tc)) 2082 1983 goto inject; 2083 1984 2084 1985 b = compute_trap_behaviour(vcpu, tc); ··· 2088 1987 ((b & BEHAVE_FORWARD_WRITE) && !is_read)) 2089 1988 goto inject; 2090 1989 1990 + local: 1991 + if (!tc.sri) { 1992 + struct sys_reg_params params; 1993 + 1994 + params = esr_sys64_to_params(esr); 1995 + 1996 + /* 1997 + * Check for the IMPDEF range, as per DDI0487 J.a, 1998 + * D18.3.2 Reserved encodings for IMPLEMENTATION 1999 + * DEFINED registers. 2000 + */ 2001 + if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011)) 2002 + print_sys_reg_msg(&params, 2003 + "Unsupported guest access at: %lx\n", 2004 + *vcpu_pc(vcpu)); 2005 + kvm_inject_undefined(vcpu); 2006 + return true; 2007 + } 2008 + 2009 + *sr_index = tc.sri - 1; 2091 2010 return false; 2092 2011 2093 2012 inject:
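encoding_next() above walks the sysreg encoding space like an odometer: Op2 increments fastest, then CRm, CRn, Op1, and finally Op0. A standalone C sketch of the same iteration; the field shifts follow the usual A64 MRS/MSR instruction layout and should be treated as assumptions here rather than the kernel's exact macros:

```c
#include <assert.h>
#include <stdint.h>

/* Assumed packing: Op0[20:19] Op1[18:16] CRn[15:12] CRm[11:8] Op2[7:5] */
#define OP0_SHIFT 19
#define OP1_SHIFT 16
#define CRN_SHIFT 12
#define CRM_SHIFT 8
#define OP2_SHIFT 5
#define OP1_MASK 0x7
#define CRN_MASK 0xf
#define CRM_MASK 0xf
#define OP2_MASK 0x7

static uint32_t sys_reg(uint32_t op0, uint32_t op1, uint32_t crn,
			uint32_t crm, uint32_t op2)
{
	return (op0 << OP0_SHIFT) | (op1 << OP1_SHIFT) |
	       (crn << CRN_SHIFT) | (crm << CRM_SHIFT) | (op2 << OP2_SHIFT);
}

/* Odometer-style successor of an encoding, as in encoding_next():
 * bump the lowest field that has room, zeroing everything below it. */
static uint32_t encoding_next(uint32_t enc)
{
	uint32_t op0 = enc >> OP0_SHIFT;
	uint32_t op1 = (enc >> OP1_SHIFT) & OP1_MASK;
	uint32_t crn = (enc >> CRN_SHIFT) & CRN_MASK;
	uint32_t crm = (enc >> CRM_SHIFT) & CRM_MASK;
	uint32_t op2 = (enc >> OP2_SHIFT) & OP2_MASK;

	if (op2 < OP2_MASK)
		return sys_reg(op0, op1, crn, crm, op2 + 1);
	if (crm < CRM_MASK)
		return sys_reg(op0, op1, crn, crm + 1, 0);
	if (crn < CRN_MASK)
		return sys_reg(op0, op1, crn + 1, 0, 0);
	if (op1 < OP1_MASK)
		return sys_reg(op0, op1 + 1, 0, 0, 0);
	return sys_reg(op0 + 1, 0, 0, 0, 0);
}
```

This is what lets populate_nv_trap_config() replace the previous xa_store_range() call with a loop that visits each encoding in a [encoding, end] range individually, so duplicate entries can be detected per encoding.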
+1 -1
arch/arm64/kvm/fpsimd.c
··· 117 117 } 118 118 119 119 /* 120 - * Called just before entering the guest once we are no longer preemptable 120 + * Called just before entering the guest once we are no longer preemptible 121 121 * and interrupts are disabled. If we have managed to run anything using 122 122 * FP while we were preemptible (such as off the back of an interrupt), 123 123 * then neither the host nor the guest own the FP hardware (and it was the
+5 -2
arch/arm64/kvm/guest.c
··· 711 711 712 712 /** 713 713 * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG 714 + * @vcpu: the vCPU pointer 714 715 * 715 716 * This is for all registers. 716 717 */ ··· 730 729 731 730 /** 732 731 * kvm_arm_copy_reg_indices - get indices of all registers. 732 + * @vcpu: the vCPU pointer 733 + * @uindices: register list to copy 733 734 * 734 735 * We do core registers right here, then we append system regs. 735 736 */ ··· 905 902 906 903 /** 907 904 * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging 908 - * @kvm: pointer to the KVM struct 909 - * @kvm_guest_debug: the ioctl data buffer 905 + * @vcpu: the vCPU pointer 906 + * @dbg: the ioctl data buffer 910 907 * 911 908 * This sets up and enables the VM for guest debugging. Userspace 912 909 * passes in a control flag to enable different debug types and
+2 -2
arch/arm64/kvm/hyp/aarch32.c
··· 84 84 } 85 85 86 86 /** 87 - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block 87 + * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block 88 88 * @vcpu: The VCPU pointer 89 89 * 90 90 * When exceptions occur while instructions are executed in Thumb IF-THEN ··· 120 120 } 121 121 122 122 /** 123 - * kvm_skip_instr - skip a trapped instruction and proceed to the next 123 + * kvm_skip_instr32 - skip a trapped instruction and proceed to the next 124 124 * @vcpu: The vcpu pointer 125 125 */ 126 126 void kvm_skip_instr32(struct kvm_vcpu *vcpu)
+68 -62
arch/arm64/kvm/hyp/include/hyp/switch.h
··· 79 79 clr |= ~hfg & __ ## reg ## _nMASK; \ 80 80 } while(0) 81 81 82 - #define update_fgt_traps_cs(vcpu, reg, clr, set) \ 82 + #define reg_to_fgt_group_id(reg) \ 83 + ({ \ 84 + enum fgt_group_id id; \ 85 + switch(reg) { \ 86 + case HFGRTR_EL2: \ 87 + case HFGWTR_EL2: \ 88 + id = HFGxTR_GROUP; \ 89 + break; \ 90 + case HFGITR_EL2: \ 91 + id = HFGITR_GROUP; \ 92 + break; \ 93 + case HDFGRTR_EL2: \ 94 + case HDFGWTR_EL2: \ 95 + id = HDFGRTR_GROUP; \ 96 + break; \ 97 + case HAFGRTR_EL2: \ 98 + id = HAFGRTR_GROUP; \ 99 + break; \ 100 + default: \ 101 + BUILD_BUG_ON(1); \ 102 + } \ 103 + \ 104 + id; \ 105 + }) 106 + 107 + #define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \ 83 108 do { \ 84 - struct kvm_cpu_context *hctxt = \ 85 - &this_cpu_ptr(&kvm_host_data)->host_ctxt; \ 109 + u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \ 110 + set |= hfg & __ ## reg ## _MASK; \ 111 + clr |= hfg & __ ## reg ## _nMASK; \ 112 + } while(0) 113 + 114 + #define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set) \ 115 + do { \ 86 116 u64 c = 0, s = 0; \ 87 117 \ 88 118 ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ 89 - compute_clr_set(vcpu, reg, c, s); \ 119 + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \ 120 + compute_clr_set(vcpu, reg, c, s); \ 121 + \ 122 + compute_undef_clr_set(vcpu, kvm, reg, c, s); \ 123 + \ 90 124 s |= set; \ 91 125 c |= clr; \ 92 126 if (c || s) { \ ··· 131 97 } \ 132 98 } while(0) 133 99 134 - #define update_fgt_traps(vcpu, reg) \ 135 - update_fgt_traps_cs(vcpu, reg, 0, 0) 100 + #define update_fgt_traps(hctxt, vcpu, kvm, reg) \ 101 + update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0) 136 102 137 103 /* 138 104 * Validate the fine grain trap masks. 
··· 156 122 static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) 157 123 { 158 124 struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; 159 - u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; 160 - u64 r_val, w_val; 125 + struct kvm *kvm = kern_hyp_va(vcpu->kvm); 161 126 162 127 CHECK_FGT_MASKS(HFGRTR_EL2); 163 128 CHECK_FGT_MASKS(HFGWTR_EL2); ··· 169 136 if (!cpus_have_final_cap(ARM64_HAS_FGT)) 170 137 return; 171 138 172 - ctxt_sys_reg(hctxt, HFGRTR_EL2) = read_sysreg_s(SYS_HFGRTR_EL2); 173 - ctxt_sys_reg(hctxt, HFGWTR_EL2) = read_sysreg_s(SYS_HFGWTR_EL2); 174 - 175 - if (cpus_have_final_cap(ARM64_SME)) { 176 - tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; 177 - 178 - r_clr |= tmp; 179 - w_clr |= tmp; 180 - } 181 - 182 - /* 183 - * Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD. 184 - */ 185 - if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) 186 - w_set |= HFGxTR_EL2_TCR_EL1_MASK; 187 - 188 - if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { 189 - compute_clr_set(vcpu, HFGRTR_EL2, r_clr, r_set); 190 - compute_clr_set(vcpu, HFGWTR_EL2, w_clr, w_set); 191 - } 192 - 193 - /* The default to trap everything not handled or supported in KVM. 
*/ 194 - tmp = HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | 195 - HFGxTR_EL2_nPOR_EL1 | HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nACCDATA_EL1; 196 - 197 - r_val = __HFGRTR_EL2_nMASK & ~tmp; 198 - r_val |= r_set; 199 - r_val &= ~r_clr; 200 - 201 - w_val = __HFGWTR_EL2_nMASK & ~tmp; 202 - w_val |= w_set; 203 - w_val &= ~w_clr; 204 - 205 - write_sysreg_s(r_val, SYS_HFGRTR_EL2); 206 - write_sysreg_s(w_val, SYS_HFGWTR_EL2); 207 - 208 - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) 209 - return; 210 - 211 - update_fgt_traps(vcpu, HFGITR_EL2); 212 - update_fgt_traps(vcpu, HDFGRTR_EL2); 213 - update_fgt_traps(vcpu, HDFGWTR_EL2); 139 + update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2); 140 + update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0, 141 + cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) ? 142 + HFGxTR_EL2_TCR_EL1_MASK : 0); 143 + update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2); 144 + update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2); 145 + update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2); 214 146 215 147 if (cpu_has_amu()) 216 - update_fgt_traps(vcpu, HAFGRTR_EL2); 148 + update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2); 217 149 } 150 + 151 + #define __deactivate_fgt(htcxt, vcpu, kvm, reg) \ 152 + do { \ 153 + if ((vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) || \ 154 + kvm->arch.fgu[reg_to_fgt_group_id(reg)]) \ 155 + write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ 156 + SYS_ ## reg); \ 157 + } while(0) 218 158 219 159 static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) 220 160 { 221 161 struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; 162 + struct kvm *kvm = kern_hyp_va(vcpu->kvm); 222 163 223 164 if (!cpus_have_final_cap(ARM64_HAS_FGT)) 224 165 return; 225 166 226 - write_sysreg_s(ctxt_sys_reg(hctxt, HFGRTR_EL2), SYS_HFGRTR_EL2); 227 - write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2); 228 - 229 - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) 230 - return; 231 - 232 - 
write_sysreg_s(ctxt_sys_reg(hctxt, HFGITR_EL2), SYS_HFGITR_EL2); 233 - write_sysreg_s(ctxt_sys_reg(hctxt, HDFGRTR_EL2), SYS_HDFGRTR_EL2); 234 - write_sysreg_s(ctxt_sys_reg(hctxt, HDFGWTR_EL2), SYS_HDFGWTR_EL2); 167 + __deactivate_fgt(hctxt, vcpu, kvm, HFGRTR_EL2); 168 + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) 169 + write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2); 170 + else 171 + __deactivate_fgt(hctxt, vcpu, kvm, HFGWTR_EL2); 172 + __deactivate_fgt(hctxt, vcpu, kvm, HFGITR_EL2); 173 + __deactivate_fgt(hctxt, vcpu, kvm, HDFGRTR_EL2); 174 + __deactivate_fgt(hctxt, vcpu, kvm, HDFGWTR_EL2); 235 175 236 176 if (cpu_has_amu()) 237 - write_sysreg_s(ctxt_sys_reg(hctxt, HAFGRTR_EL2), SYS_HAFGRTR_EL2); 177 + __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); 238 178 } 239 179 240 180 static inline void __activate_traps_common(struct kvm_vcpu *vcpu) ··· 236 230 write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); 237 231 238 232 if (cpus_have_final_cap(ARM64_HAS_HCX)) { 239 - u64 hcrx = HCRX_GUEST_FLAGS; 233 + u64 hcrx = vcpu->arch.hcrx_el2; 240 234 if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { 241 235 u64 clr = 0, set = 0; 242 236
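The FGT update macros above all reduce to one pattern: start from a base value, OR in the bits to set, then AND out the bits to clear. Because the clear step runs last, a bit present in both masks ends up cleared. A tiny sketch of that ordering:

```c
#include <assert.h>
#include <stdint.h>

/* The r_val/w_val computation from update_fgt_traps_cs(), reduced
 * to its essence: apply `set` first, then `clr`, so clr takes
 * precedence when a bit appears in both masks. */
static uint64_t apply_clr_set(uint64_t base, uint64_t clr, uint64_t set)
{
	uint64_t val = base;

	val |= set;
	val &= ~clr;
	return val;
}
```

The same precedence explains why compute_clr_set() and compute_undef_clr_set() can both accumulate into the same c/s pair before the single write_sysreg_s() in the macro.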
+21 -3
arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
··· 27 27 ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); 28 28 } 29 29 30 - static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) 30 + static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) 31 31 { 32 32 struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; 33 33 34 34 if (!vcpu) 35 35 vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); 36 36 37 + return vcpu; 38 + } 39 + 40 + static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) 41 + { 42 + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); 43 + 37 44 return kvm_has_mte(kern_hyp_va(vcpu->kvm)); 45 + } 46 + 47 + static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) 48 + { 49 + struct kvm_vcpu *vcpu; 50 + 51 + if (!cpus_have_final_cap(ARM64_HAS_S1PIE)) 52 + return false; 53 + 54 + vcpu = ctxt_to_vcpu(ctxt); 55 + return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1PIE, IMP); 38 56 } 39 57 40 58 static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ··· 73 55 ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR); 74 56 ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR); 75 57 ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL); 76 - if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { 58 + if (ctxt_has_s1pie(ctxt)) { 77 59 ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); 78 60 ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); 79 61 } ··· 149 131 write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR); 150 132 write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1), SYS_AMAIR); 151 133 write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL); 152 - if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { 134 + if (ctxt_has_s1pie(ctxt)) { 153 135 write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); 154 136 write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); 155 137 }
+6 -6
arch/arm64/kvm/hyp/nvhe/debug-sr.c
··· 31 31 return;
32 32
33 33 /* Yes; save the control register and disable data generation */
34 - *pmscr_el1 = read_sysreg_s(SYS_PMSCR_EL1);
35 - write_sysreg_s(0, SYS_PMSCR_EL1);
34 + *pmscr_el1 = read_sysreg_el1(SYS_PMSCR);
35 + write_sysreg_el1(0, SYS_PMSCR);
36 36 isb();
37 37
38 38 /* Now drain all buffered data to memory */
··· 48 48 isb();
49 49
50 50 /* Re-enable data generation */
51 - write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
51 + write_sysreg_el1(pmscr_el1, SYS_PMSCR);
52 52 }
53 53
54 54 static void __debug_save_trace(u64 *trfcr_el1)
··· 63 63 * Since access to TRFCR_EL1 is trapped, the guest can't
64 64 * modify the filtering set by the host.
65 65 */
66 - *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1);
67 - write_sysreg_s(0, SYS_TRFCR_EL1);
66 + *trfcr_el1 = read_sysreg_el1(SYS_TRFCR);
67 + write_sysreg_el1(0, SYS_TRFCR);
68 68 isb();
69 69 /* Drain the trace buffer to memory */
70 70 tsb_csync();
··· 76 76 return;
77 77
78 78 /* Restore trace filter controls */
79 - write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1);
79 + write_sysreg_el1(trfcr_el1, SYS_TRFCR);
80 80 }
81 81
82 82 void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
+1 -1
arch/arm64/kvm/hyp/nvhe/host.S
··· 110 110 * u64 elr, u64 par);
111 111 */
112 112 SYM_FUNC_START(__hyp_do_panic)
113 - /* Prepare and exit to the host's panic funciton. */
113 + /* Prepare and exit to the host's panic function. */
114 114 mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
115 115 PSR_MODE_EL1h)
116 116 msr spsr_el2, lr
+2 -2
arch/arm64/kvm/hyp/nvhe/mm.c
··· 155 155 start = hyp_memory[i].base;
156 156 start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE);
157 157 /*
158 - * The begining of the hyp_vmemmap region for the current
158 + * The beginning of the hyp_vmemmap region for the current
159 159 * memblock may already be backed by the page backing the end
160 160 * of the previous region, so avoid mapping it twice.
161 161 */
··· 408 408 return pop_hyp_memcache(host_mc, hyp_phys_to_virt);
409 409 }
410 410
411 - /* Refill our local memcache by poping pages from the one provided by the host. */
411 + /* Refill our local memcache by popping pages from the one provided by the host. */
412 412 int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
413 413 struct kvm_hyp_memcache *host_mc)
414 414 {
+19 -5
arch/arm64/kvm/hyp/pgtable.c
··· 717 717 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
718 718 kvm_pte_t *ptep)
719 719 {
720 - bool device = prot & KVM_PGTABLE_PROT_DEVICE;
721 - kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
722 - KVM_S2_MEMATTR(pgt, NORMAL);
720 + kvm_pte_t attr;
723 721 u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
722 +
723 + switch (prot & (KVM_PGTABLE_PROT_DEVICE |
724 + KVM_PGTABLE_PROT_NORMAL_NC)) {
725 + case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
726 + return -EINVAL;
727 + case KVM_PGTABLE_PROT_DEVICE:
728 + if (prot & KVM_PGTABLE_PROT_X)
729 + return -EINVAL;
730 + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
731 + break;
732 + case KVM_PGTABLE_PROT_NORMAL_NC:
733 + if (prot & KVM_PGTABLE_PROT_X)
734 + return -EINVAL;
735 + attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
736 + break;
737 + default:
738 + attr = KVM_S2_MEMATTR(pgt, NORMAL);
739 + }
724 740
725 741 if (!(prot & KVM_PGTABLE_PROT_X))
726 742 attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
727 - else if (device)
728 - return -EINVAL;
729 743
730 744 if (prot & KVM_PGTABLE_PROT_R)
731 745 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+1 -1
arch/arm64/kvm/hyp/vhe/sysreg-sr.c
··· 95 95 }
96 96
97 97 /**
98 - * __vcpu_put_switch_syregs - Restore host system registers to the physical CPU
98 + * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU
99 99 *
100 100 * @vcpu: The VCPU pointer
101 101 *
+1 -1
arch/arm64/kvm/inject_fault.c
··· 134 134 if (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE) {
135 135 fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
136 136 } else {
137 - /* no need to shuffle FS[4] into DFSR[10] as its 0 */
137 + /* no need to shuffle FS[4] into DFSR[10] as it's 0 */
138 138 fsr = DFSR_FSC_EXTABT_nLPAE;
139 139 }
140 140
+11 -5
arch/arm64/kvm/mmu.c
··· 305 305 * does.
306 306 */
307 307 /**
308 - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
308 + * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
309 309 * @mmu: The KVM stage-2 MMU pointer
310 310 * @start: The intermediate physical base address of the range to unmap
311 311 * @size: The size of the area to unmap
··· 1381 1381 int ret = 0;
1382 1382 bool write_fault, writable, force_pte = false;
1383 1383 bool exec_fault, mte_allowed;
1384 - bool device = false;
1384 + bool device = false, vfio_allow_any_uc = false;
1385 1385 unsigned long mmu_seq;
1386 1386 struct kvm *kvm = vcpu->kvm;
1387 1387 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
··· 1472 1472 gfn = fault_ipa >> PAGE_SHIFT;
1473 1473 mte_allowed = kvm_vma_mte_allowed(vma);
1474 1474
1475 + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
1476 +
1475 1477 /* Don't use the VMA after the unlock -- it may have vanished */
1476 1478 vma = NULL;
1477 1479
··· 1559 1557 if (exec_fault)
1560 1558 prot |= KVM_PGTABLE_PROT_X;
1561 1559
1562 - if (device)
1563 - prot |= KVM_PGTABLE_PROT_DEVICE;
1564 - else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
1560 + if (device) {
1561 + if (vfio_allow_any_uc)
1562 + prot |= KVM_PGTABLE_PROT_NORMAL_NC;
1563 + else
1564 + prot |= KVM_PGTABLE_PROT_DEVICE;
1565 + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
1565 1566 prot |= KVM_PGTABLE_PROT_X;
1567 + }
1566 1568
1567 1569 /*
1568 1570 * Under the premise of getting a FSC_PERM fault, we just need to relax
+273 -1
arch/arm64/kvm/nested.c
··· 133 133 val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001);
134 134 break;
135 135
136 + case SYS_ID_AA64MMFR4_EL1:
137 + val = 0;
138 + if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
139 + val |= FIELD_PREP(NV_FTR(MMFR4, E2H0),
140 + ID_AA64MMFR4_EL1_E2H0_NI_NV1);
141 + break;
142 +
136 143 case SYS_ID_AA64DFR0_EL1:
137 144 /* Only limited support for PMU, Debug, BPs and WPs */
138 145 val &= (NV_FTR(DFR0, PMUVer) |
··· 163 156
164 157 return val;
165 158 }
159 +
160 + u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr)
161 + {
162 + u64 v = ctxt_sys_reg(&vcpu->arch.ctxt, sr);
163 + struct kvm_sysreg_masks *masks;
164 +
165 + masks = vcpu->kvm->arch.sysreg_masks;
166 +
167 + if (masks) {
168 + sr -= __VNCR_START__;
169 +
170 + v &= ~masks->mask[sr].res0;
171 + v |= masks->mask[sr].res1;
172 + }
173 +
174 + return v;
175 + }
176 +
177 + static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1)
178 + {
179 + int i = sr - __VNCR_START__;
180 +
181 + kvm->arch.sysreg_masks->mask[i].res0 = res0;
182 + kvm->arch.sysreg_masks->mask[i].res1 = res1;
183 + }
184 +
166 185 int kvm_init_nv_sysregs(struct kvm *kvm)
167 186 {
187 + u64 res0, res1;
188 + int ret = 0;
189 +
168 190 mutex_lock(&kvm->arch.config_lock);
191 +
192 + if (kvm->arch.sysreg_masks)
193 + goto out;
194 +
195 + kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)),
196 + GFP_KERNEL);
197 + if (!kvm->arch.sysreg_masks) {
198 + ret = -ENOMEM;
199 + goto out;
200 + }
169 201
170 202 for (int i = 0; i < KVM_ARM_ID_REG_NUM; i++)
171 203 kvm->arch.id_regs[i] = limit_nv_id_reg(IDX_IDREG(i),
172 204 kvm->arch.id_regs[i]);
173 205
206 + /* VTTBR_EL2 */
207 + res0 = res1 = 0;
208 + if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16))
209 + res0 |= GENMASK(63, 56);
210 + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP))
211 + res0 |= VTTBR_CNP_BIT;
212 + set_sysreg_masks(kvm, VTTBR_EL2, res0, res1);
213 +
214 + /* VTCR_EL2 */
215 + res0 = GENMASK(63, 32) | GENMASK(30, 20);
216 + res1 = BIT(31);
217 + set_sysreg_masks(kvm, VTCR_EL2, res0, res1);
218 +
219 + /* VMPIDR_EL2 */
220 + res0 = GENMASK(63, 40) | GENMASK(30, 24);
221 + res1 = BIT(31);
222 + set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1);
223 +
224 + /* HCR_EL2 */
225 + res0 = BIT(48);
226 + res1 = HCR_RW;
227 + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, TWED, IMP))
228 + res0 |= GENMASK(63, 59);
229 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, MTE2))
230 + res0 |= (HCR_TID5 | HCR_DCT | HCR_ATA);
231 + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, TTLBxS))
232 + res0 |= (HCR_TTLBIS | HCR_TTLBOS);
233 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) &&
234 + !kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2))
235 + res0 |= HCR_ENSCXT;
236 + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, IMP))
237 + res0 |= (HCR_TOCU | HCR_TICAB | HCR_TID4);
238 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1))
239 + res0 |= HCR_AMVOFFEN;
240 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1))
241 + res0 |= HCR_FIEN;
242 + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP))
243 + res0 |= HCR_FWB;
244 + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2))
245 + res0 |= HCR_NV2;
246 + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP))
247 + res0 |= (HCR_AT | HCR_NV1 | HCR_NV);
248 + if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
249 + __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
250 + res0 |= (HCR_API | HCR_APK);
251 + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP))
252 + res0 |= BIT(39);
253 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP))
254 + res0 |= (HCR_TEA | HCR_TERR);
255 + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
256 + res0 |= HCR_TLOR;
257 + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP))
258 + res1 |= HCR_E2H;
259 + set_sysreg_masks(kvm, HCR_EL2, res0, res1);
260 +
261 + /* HCRX_EL2 */
262 + res0 = HCRX_EL2_RES0;
263 + res1 = HCRX_EL2_RES1;
264 + if (!kvm_has_feat(kvm, ID_AA64ISAR3_EL1, PACM, TRIVIAL_IMP))
265 + res0 |= HCRX_EL2_PACMEn;
266 + if (!kvm_has_feat(kvm, ID_AA64PFR2_EL1, FPMR, IMP))
267 + res0 |= HCRX_EL2_EnFPM;
268 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP))
269 + res0 |= HCRX_EL2_GCSEn;
270 + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, SYSREG_128, IMP))
271 + res0 |= HCRX_EL2_EnIDCP128;
272 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, DEV_ASYNC))
273 + res0 |= (HCRX_EL2_EnSDERR | HCRX_EL2_EnSNERR);
274 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, DF2, IMP))
275 + res0 |= HCRX_EL2_TMEA;
276 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP))
277 + res0 |= HCRX_EL2_D128En;
278 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
279 + res0 |= HCRX_EL2_PTTWI;
280 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP))
281 + res0 |= HCRX_EL2_SCTLR2En;
282 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP))
283 + res0 |= HCRX_EL2_TCR2En;
284 + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
285 + res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
286 + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, CMOW, IMP))
287 + res0 |= HCRX_EL2_CMOW;
288 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, NMI, IMP))
289 + res0 |= (HCRX_EL2_VFNMI | HCRX_EL2_VINMI | HCRX_EL2_TALLINT);
290 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP) ||
291 + !(read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS))
292 + res0 |= HCRX_EL2_SMPME;
293 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
294 + res0 |= (HCRX_EL2_FGTnXS | HCRX_EL2_FnXS);
295 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V))
296 + res0 |= HCRX_EL2_EnASR;
297 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64))
298 + res0 |= HCRX_EL2_EnALS;
299 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA))
300 + res0 |= HCRX_EL2_EnAS0;
301 + set_sysreg_masks(kvm, HCRX_EL2, res0, res1);
302 +
303 + /* HFG[RW]TR_EL2 */
304 + res0 = res1 = 0;
305 + if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
306 + __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
307 + res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey |
308 + HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey |
309 + HFGxTR_EL2_APIBKey);
310 + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
311 + res0 |= (HFGxTR_EL2_LORC_EL1 | HFGxTR_EL2_LOREA_EL1 |
312 + HFGxTR_EL2_LORID_EL1 | HFGxTR_EL2_LORN_EL1 |
313 + HFGxTR_EL2_LORSA_EL1);
314 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) &&
315 + !kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2))
316 + res0 |= (HFGxTR_EL2_SCXTNUM_EL1 | HFGxTR_EL2_SCXTNUM_EL0);
317 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP))
318 + res0 |= HFGxTR_EL2_ICC_IGRPENn_EL1;
319 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP))
320 + res0 |= (HFGxTR_EL2_ERRIDR_EL1 | HFGxTR_EL2_ERRSELR_EL1 |
321 + HFGxTR_EL2_ERXFR_EL1 | HFGxTR_EL2_ERXCTLR_EL1 |
322 + HFGxTR_EL2_ERXSTATUS_EL1 | HFGxTR_EL2_ERXMISCn_EL1 |
323 + HFGxTR_EL2_ERXPFGF_EL1 | HFGxTR_EL2_ERXPFGCTL_EL1 |
324 + HFGxTR_EL2_ERXPFGCDN_EL1 | HFGxTR_EL2_ERXADDR_EL1);
325 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA))
326 + res0 |= HFGxTR_EL2_nACCDATA_EL1;
327 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP))
328 + res0 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1);
329 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP))
330 + res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0);
331 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
332 + res0 |= HFGxTR_EL2_nRCWMASK_EL1;
333 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP))
334 + res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1);
335 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP))
336 + res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1);
337 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
338 + res0 |= HFGxTR_EL2_nS2POR_EL1;
339 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP))
340 + res0 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1);
341 + set_sysreg_masks(kvm, HFGRTR_EL2, res0 | __HFGRTR_EL2_RES0, res1);
342 + set_sysreg_masks(kvm, HFGWTR_EL2, res0 | __HFGWTR_EL2_RES0, res1);
343 +
344 + /* HDFG[RW]TR_EL2 */
345 + res0 = res1 = 0;
346 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP))
347 + res0 |= HDFGRTR_EL2_OSDLR_EL1;
348 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
349 + res0 |= (HDFGRTR_EL2_PMEVCNTRn_EL0 | HDFGRTR_EL2_PMEVTYPERn_EL0 |
350 + HDFGRTR_EL2_PMCCFILTR_EL0 | HDFGRTR_EL2_PMCCNTR_EL0 |
351 + HDFGRTR_EL2_PMCNTEN | HDFGRTR_EL2_PMINTEN |
352 + HDFGRTR_EL2_PMOVS | HDFGRTR_EL2_PMSELR_EL0 |
353 + HDFGRTR_EL2_PMMIR_EL1 | HDFGRTR_EL2_PMUSERENR_EL0 |
354 + HDFGRTR_EL2_PMCEIDn_EL0);
355 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP))
356 + res0 |= (HDFGRTR_EL2_PMBLIMITR_EL1 | HDFGRTR_EL2_PMBPTR_EL1 |
357 + HDFGRTR_EL2_PMBSR_EL1 | HDFGRTR_EL2_PMSCR_EL1 |
358 + HDFGRTR_EL2_PMSEVFR_EL1 | HDFGRTR_EL2_PMSFCR_EL1 |
359 + HDFGRTR_EL2_PMSICR_EL1 | HDFGRTR_EL2_PMSIDR_EL1 |
360 + HDFGRTR_EL2_PMSIRR_EL1 | HDFGRTR_EL2_PMSLATFR_EL1 |
361 + HDFGRTR_EL2_PMBIDR_EL1);
362 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP))
363 + res0 |= (HDFGRTR_EL2_TRC | HDFGRTR_EL2_TRCAUTHSTATUS |
364 + HDFGRTR_EL2_TRCAUXCTLR | HDFGRTR_EL2_TRCCLAIM |
365 + HDFGRTR_EL2_TRCCNTVRn | HDFGRTR_EL2_TRCID |
366 + HDFGRTR_EL2_TRCIMSPECn | HDFGRTR_EL2_TRCOSLSR |
367 + HDFGRTR_EL2_TRCPRGCTLR | HDFGRTR_EL2_TRCSEQSTR |
368 + HDFGRTR_EL2_TRCSSCSRn | HDFGRTR_EL2_TRCSTATR |
369 + HDFGRTR_EL2_TRCVICTLR);
370 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
371 + res0 |= (HDFGRTR_EL2_TRBBASER_EL1 | HDFGRTR_EL2_TRBIDR_EL1 |
372 + HDFGRTR_EL2_TRBLIMITR_EL1 | HDFGRTR_EL2_TRBMAR_EL1 |
373 + HDFGRTR_EL2_TRBPTR_EL1 | HDFGRTR_EL2_TRBSR_EL1 |
374 + HDFGRTR_EL2_TRBTRG_EL1);
375 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP))
376 + res0 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL |
377 + HDFGRTR_EL2_nBRBDATA);
378 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2))
379 + res0 |= HDFGRTR_EL2_nPMSNEVFR_EL1;
380 + set_sysreg_masks(kvm, HDFGRTR_EL2, res0 | HDFGRTR_EL2_RES0, res1);
381 +
382 + /* Reuse the bits from the read-side and add the write-specific stuff */
383 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
384 + res0 |= (HDFGWTR_EL2_PMCR_EL0 | HDFGWTR_EL2_PMSWINC_EL0);
385 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP))
386 + res0 |= HDFGWTR_EL2_TRCOSLAR;
387 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
388 + res0 |= HDFGWTR_EL2_TRFCR_EL1;
389 + set_sysreg_masks(kvm, HFGWTR_EL2, res0 | HDFGWTR_EL2_RES0, res1);
390 +
391 + /* HFGITR_EL2 */
392 + res0 = HFGITR_EL2_RES0;
393 + res1 = HFGITR_EL2_RES1;
394 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, DPB, DPB2))
395 + res0 |= HFGITR_EL2_DCCVADP;
396 + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2))
397 + res0 |= (HFGITR_EL2_ATS1E1RP | HFGITR_EL2_ATS1E1WP);
398 + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
399 + res0 |= (HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS |
400 + HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS |
401 + HFGITR_EL2_TLBIVAALE1OS | HFGITR_EL2_TLBIVALE1OS |
402 + HFGITR_EL2_TLBIVAAE1OS | HFGITR_EL2_TLBIASIDE1OS |
403 + HFGITR_EL2_TLBIVAE1OS | HFGITR_EL2_TLBIVMALLE1OS);
404 + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
405 + res0 |= (HFGITR_EL2_TLBIRVAALE1 | HFGITR_EL2_TLBIRVALE1 |
406 + HFGITR_EL2_TLBIRVAAE1 | HFGITR_EL2_TLBIRVAE1 |
407 + HFGITR_EL2_TLBIRVAALE1IS | HFGITR_EL2_TLBIRVALE1IS |
408 + HFGITR_EL2_TLBIRVAAE1IS | HFGITR_EL2_TLBIRVAE1IS |
409 + HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS |
410 + HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS);
411 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, IMP))
412 + res0 |= (HFGITR_EL2_CFPRCTX | HFGITR_EL2_DVPRCTX |
413 + HFGITR_EL2_CPPRCTX);
414 + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP))
415 + res0 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL);
416 + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP))
417 + res0 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 |
418 + HFGITR_EL2_nGCSEPP);
419 + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX))
420 + res0 |= HFGITR_EL2_COSPRCTX;
421 + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP))
422 + res0 |= HFGITR_EL2_ATS1E1A;
423 + set_sysreg_masks(kvm, HFGITR_EL2, res0, res1);
424 +
425 + /* HAFGRTR_EL2 - not a lot to see here */
426 + res0 = HAFGRTR_EL2_RES0;
427 + res1 = HAFGRTR_EL2_RES1;
428 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1))
429 + res0 |= ~(res0 | res1);
430 + set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);
431 + out:
174 432 mutex_unlock(&kvm->arch.config_lock);
175 433
176 - return 0;
434 + return ret;
177 435 }
+8 -7
arch/arm64/kvm/pmu-emul.c
··· 64 64 {
65 65 u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 |
66 66 kvm_pmu_event_mask(kvm);
67 - u64 pfr0 = IDREG(kvm, SYS_ID_AA64PFR0_EL1);
68 67
69 - if (SYS_FIELD_GET(ID_AA64PFR0_EL1, EL2, pfr0))
68 + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP))
70 69 mask |= ARMV8_PMU_INCLUDE_EL2;
71 70
72 - if (SYS_FIELD_GET(ID_AA64PFR0_EL1, EL3, pfr0))
71 + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP))
73 72 mask |= ARMV8_PMU_EXCLUDE_NS_EL0 |
74 73 ARMV8_PMU_EXCLUDE_NS_EL1 |
75 74 ARMV8_PMU_EXCLUDE_EL3;
··· 82 83 */
83 84 static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc)
84 85 {
86 + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
87 +
85 88 return (pmc->idx == ARMV8_PMU_CYCLE_IDX ||
86 - kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc)));
89 + kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5));
87 90 }
88 91
89 92 static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc)
··· 420 419 kvm_pmu_update_state(vcpu);
421 420 }
422 421
423 - /**
422 + /*
424 423 * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding
425 424 * to the event.
426 425 * This is why we need a callback to do it once outside of the NMI context.
··· 491 490 return val;
492 491 }
493 492
494 - /**
493 + /*
495 494 * When the perf event overflows, set the overflow status and inform the vcpu.
496 495 */
497 496 static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
··· 557 556 return;
558 557
559 558 /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */
560 - if (!kvm_pmu_is_3p5(vcpu))
559 + if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
561 560 val &= ~ARMV8_PMU_PMCR_LP;
562 561
563 562 /* The reset bits don't indicate any state, and shouldn't be saved. */
+225 -43
arch/arm64/kvm/sys_regs.c
··· 12 12 #include <linux/bitfield.h>
13 13 #include <linux/bsearch.h>
14 14 #include <linux/cacheinfo.h>
15 + #include <linux/debugfs.h>
15 16 #include <linux/kvm_host.h>
16 17 #include <linux/mm.h>
17 18 #include <linux/printk.h>
··· 32 31
33 32 #include <trace/events/kvm.h>
34 33
34 + #include "check-res-bits.h"
35 35 #include "sys_regs.h"
36 36
37 37 #include "trace.h"
··· 507 505 struct sys_reg_params *p,
508 506 const struct sys_reg_desc *r)
509 507 {
510 - u64 val = IDREG(vcpu->kvm, SYS_ID_AA64MMFR1_EL1);
511 508 u32 sr = reg_to_encoding(r);
512 509
513 - if (!(val & (0xfUL << ID_AA64MMFR1_EL1_LO_SHIFT))) {
510 + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) {
514 511 kvm_inject_undefined(vcpu);
515 512 return false;
516 513 }
··· 1686 1685 u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \
1687 1686 (val) &= ~reg##_##field##_MASK; \
1688 1687 (val) |= FIELD_PREP(reg##_##field##_MASK, \
1689 - min(__f_val, (u64)reg##_##field##_##limit)); \
1688 + min(__f_val, \
1689 + (u64)SYS_FIELD_VALUE(reg, field, limit))); \
1690 1690 (val); \
1691 1691 })
1692 1692
··· 2176 2174 return true;
2177 2175 }
2178 2176
2177 + static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
2178 + {
2179 + u64 val = r->val;
2180 +
2181 + if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
2182 + val |= HCR_E2H;
2183 +
2184 + return __vcpu_sys_reg(vcpu, r->reg) = val;
2185 + }
2186 +
2179 2187 /*
2180 2188 * Architected system registers.
2181 2189 * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
··· 2198 2186 * guest...
2199 2187 */
2200 2188 static const struct sys_reg_desc sys_reg_descs[] = {
2201 - { SYS_DESC(SYS_DC_ISW), access_dcsw },
2202 - { SYS_DESC(SYS_DC_IGSW), access_dcgsw },
2203 - { SYS_DESC(SYS_DC_IGDSW), access_dcgsw },
2204 - { SYS_DESC(SYS_DC_CSW), access_dcsw },
2205 - { SYS_DESC(SYS_DC_CGSW), access_dcgsw },
2206 - { SYS_DESC(SYS_DC_CGDSW), access_dcgsw },
2207 - { SYS_DESC(SYS_DC_CISW), access_dcsw },
2208 - { SYS_DESC(SYS_DC_CIGSW), access_dcgsw },
2209 - { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw },
2210 -
2211 2189 DBG_BCR_BVR_WCR_WVR_EL1(0),
2212 2190 DBG_BCR_BVR_WCR_WVR_EL1(1),
2213 2191 { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 },
··· 2351 2349 ID_AA64MMFR2_EL1_NV |
2352 2350 ID_AA64MMFR2_EL1_CCIDX)),
2353 2351 ID_SANITISED(ID_AA64MMFR3_EL1),
2354 - ID_UNALLOCATED(7,4),
2352 + ID_SANITISED(ID_AA64MMFR4_EL1),
2355 2353 ID_UNALLOCATED(7,5),
2356 2354 ID_UNALLOCATED(7,6),
2357 2355 ID_UNALLOCATED(7,7),
··· 2667 2665 EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0),
2668 2666 EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
2669 2667 EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
2670 - EL2_REG_VNCR(HCR_EL2, reset_val, 0),
2668 + EL2_REG_VNCR(HCR_EL2, reset_hcr, 0),
2671 2669 EL2_REG(MDCR_EL2, access_rw, reset_val, 0),
2672 2670 EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
2673 2671 EL2_REG_VNCR(HSTR_EL2, reset_val, 0),
··· 2729 2727 EL2_REG(SP_EL2, NULL, reset_unknown, 0),
2730 2728 };
2731 2729
2730 + static struct sys_reg_desc sys_insn_descs[] = {
2731 + { SYS_DESC(SYS_DC_ISW), access_dcsw },
2732 + { SYS_DESC(SYS_DC_IGSW), access_dcgsw },
2733 + { SYS_DESC(SYS_DC_IGDSW), access_dcgsw },
2734 + { SYS_DESC(SYS_DC_CSW), access_dcsw },
2735 + { SYS_DESC(SYS_DC_CGSW), access_dcgsw },
2736 + { SYS_DESC(SYS_DC_CGDSW), access_dcgsw },
2737 + { SYS_DESC(SYS_DC_CISW), access_dcsw },
2738 + { SYS_DESC(SYS_DC_CIGSW), access_dcgsw },
2739 + { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw },
2740 + };
2741 +
2732 2742 static const struct sys_reg_desc *first_idreg;
2733 2743
2734 2744 static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
··· 2751 2737 return ignore_write(vcpu, p);
2752 2738 } else {
2753 2739 u64 dfr = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1);
2754 - u64 pfr = IDREG(vcpu->kvm, SYS_ID_AA64PFR0_EL1);
2755 - u32 el3 = !!SYS_FIELD_GET(ID_AA64PFR0_EL1, EL3, pfr);
2740 + u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP);
2756 2741
2757 2742 p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) |
2758 2743 (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) |
··· 3172 3159 /**
3173 3160 * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access
3174 3161 * @vcpu: The VCPU pointer
3175 - * @run: The kvm_run struct
3162 + * @global: &struct sys_reg_desc
3163 + * @nr_global: size of the @global array
3176 3164 */
3177 3165 static int kvm_handle_cp_64(struct kvm_vcpu *vcpu,
3178 3166 const struct sys_reg_desc *global,
··· 3340 3326 /**
3341 3327 * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access
3342 3328 * @vcpu: The VCPU pointer
3343 - * @run: The kvm_run struct
3329 + * @params: &struct sys_reg_params
3330 + * @global: &struct sys_reg_desc
3331 + * @nr_global: size of the @global array
3344 3332 */
3345 3333 static int kvm_handle_cp_32(struct kvm_vcpu *vcpu,
3346 3334 struct sys_reg_params *params,
··· 3400 3384 return kvm_handle_cp_32(vcpu, &params, cp14_regs, ARRAY_SIZE(cp14_regs));
3401 3385 }
3402 3386
3403 - static bool is_imp_def_sys_reg(struct sys_reg_params *params)
3404 - {
3405 - // See ARM DDI 0487E.a, section D12.3.2
3406 - return params->Op0 == 3 && (params->CRn & 0b1011) == 0b1011;
3407 - }
3408 -
3409 3387 /**
3410 3388 * emulate_sys_reg - Emulate a guest access to an AArch64 system register
3411 3389 * @vcpu: The VCPU pointer
··· 3408 3398 * Return: true if the system register access was successful, false otherwise.
3409 3399 */
3410 3400 static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
3411 - struct sys_reg_params *params)
3401 + struct sys_reg_params *params)
3412 3402 {
3413 3403 const struct sys_reg_desc *r;
3414 3404
3415 3405 r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
3416 -
3417 3406 if (likely(r)) {
3418 3407 perform_access(vcpu, params, r);
3419 3408 return true;
3420 3409 }
3421 3410
3422 - if (is_imp_def_sys_reg(params)) {
3423 - kvm_inject_undefined(vcpu);
3424 - } else {
3425 - print_sys_reg_msg(params,
3426 - "Unsupported guest sys_reg access at: %lx [%08lx]\n",
3427 - *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
3428 - kvm_inject_undefined(vcpu);
3429 - }
3411 + print_sys_reg_msg(params,
3412 + "Unsupported guest sys_reg access at: %lx [%08lx]\n",
3413 + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
3414 + kvm_inject_undefined(vcpu);
3415 +
3430 3416 return false;
3417 + }
3418 +
3419 + static void *idregs_debug_start(struct seq_file *s, loff_t *pos)
3420 + {
3421 + struct kvm *kvm = s->private;
3422 + u8 *iter;
3423 +
3424 + mutex_lock(&kvm->arch.config_lock);
3425 +
3426 + iter = &kvm->arch.idreg_debugfs_iter;
3427 + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) &&
3428 + *iter == (u8)~0) {
3429 + *iter = *pos;
3430 + if (*iter >= KVM_ARM_ID_REG_NUM)
3431 + iter = NULL;
3432 + } else {
3433 + iter = ERR_PTR(-EBUSY);
3434 + }
3435 +
3436 + mutex_unlock(&kvm->arch.config_lock);
3437 +
3438 + return iter;
3439 + }
3440 +
3441 + static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos)
3442 + {
3443 + struct kvm *kvm = s->private;
3444 +
3445 + (*pos)++;
3446 +
3447 + if ((kvm->arch.idreg_debugfs_iter + 1) < KVM_ARM_ID_REG_NUM) {
3448 + kvm->arch.idreg_debugfs_iter++;
3449 +
3450 + return &kvm->arch.idreg_debugfs_iter;
3451 + }
3452 +
3453 + return NULL;
3454 + }
3455 +
3456 + static void idregs_debug_stop(struct seq_file *s, void *v)
3457 + {
3458 + struct kvm *kvm = s->private;
3459 +
3460 + if (IS_ERR(v))
3461 + return;
3462 +
3463 + mutex_lock(&kvm->arch.config_lock);
3464 +
3465 + kvm->arch.idreg_debugfs_iter = ~0;
3466 +
3467 + mutex_unlock(&kvm->arch.config_lock);
3468 + }
3469 +
3470 + static int idregs_debug_show(struct seq_file *s, void *v)
3471 + {
3472 + struct kvm *kvm = s->private;
3473 + const struct sys_reg_desc *desc;
3474 +
3475 + desc = first_idreg + kvm->arch.idreg_debugfs_iter;
3476 +
3477 + if (!desc->name)
3478 + return 0;
3479 +
3480 + seq_printf(s, "%20s:\t%016llx\n",
3481 + desc->name, IDREG(kvm, IDX_IDREG(kvm->arch.idreg_debugfs_iter)));
3482 +
3483 + return 0;
3484 + }
3485 +
3486 + static const struct seq_operations idregs_debug_sops = {
3487 + .start = idregs_debug_start,
3488 + .next = idregs_debug_next,
3489 + .stop = idregs_debug_stop,
3490 + .show = idregs_debug_show,
3491 + };
3492 +
3493 + DEFINE_SEQ_ATTRIBUTE(idregs_debug);
3494 +
3495 + void kvm_sys_regs_create_debugfs(struct kvm *kvm)
3496 + {
3497 + kvm->arch.idreg_debugfs_iter = ~0;
3498 +
3499 + debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm,
3500 + &idregs_debug_fops);
3431 3501 }
3432 3502
3433 3503 static void kvm_reset_id_regs(struct kvm_vcpu *vcpu)
··· 3557 3467 }
3558 3468
3559 3469 /**
3560 - * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access
3470 + * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction
3471 + * trap on a guest execution
3561 3472 * @vcpu: The VCPU pointer
3562 3473 */
3563 3474 int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
3564 3475 {
3476 + const struct sys_reg_desc *desc = NULL;
3565 3477 struct sys_reg_params params;
3566 3478 unsigned long esr = kvm_vcpu_get_esr(vcpu);
3567 3479 int Rt = kvm_vcpu_sys_get_rt(vcpu);
3480 + int sr_idx;
3568 3481
3569 3482 trace_kvm_handle_sys_reg(esr);
3570 3483
3571 - if (__check_nv_sr_forward(vcpu))
3484 + if (triage_sysreg_trap(vcpu, &sr_idx))
3572 3485 return 1;
3573 3486
3574 3487 params = esr_sys64_to_params(esr);
3575 3488 params.regval = vcpu_get_reg(vcpu, Rt);
3577 - if (!emulate_sys_reg(vcpu, &params))
3578 - return 1;
3490 + /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */
3491 + if (params.Op0 == 2 || params.Op0 == 3)
3492 + desc = &sys_reg_descs[sr_idx];
3493 + else
3494 + desc = &sys_insn_descs[sr_idx];
3579 3495
3580 - if (!params.is_write)
3496 + perform_access(vcpu, &params, desc);
3497 +
3498 + /* Read from system register? */
3499 + if (!params.is_write &&
3500 + (params.Op0 == 2 || params.Op0 == 3))
3581 3501 vcpu_set_reg(vcpu, Rt, params.regval);
3502 +
3582 3503 return 1;
3583 3504 }
3584 3505
··· 4031 3930 return 0;
4032 3931 }
4033 3932
3933 + void kvm_init_sysreg(struct kvm_vcpu *vcpu)
3934 + {
3935 + struct kvm *kvm = vcpu->kvm;
3936 +
3937 + mutex_lock(&kvm->arch.config_lock);
3938 +
3939 + /*
3940 + * In the absence of FGT, we cannot independently trap TLBI
3941 + * Range instructions. This isn't great, but trapping all
3942 + * TLBIs would be far worse. Live with it...
3943 + */
3944 + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
3945 + vcpu->arch.hcr_el2 |= HCR_TTLBOS;
3946 +
3947 + if (cpus_have_final_cap(ARM64_HAS_HCX)) {
3948 + vcpu->arch.hcrx_el2 = HCRX_GUEST_FLAGS;
3949 +
3950 + if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
3951 + vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
3952 + }
3953 +
3954 + if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags))
3955 + goto out;
3956 +
3957 + kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 |
3958 + HFGxTR_EL2_nMAIR2_EL1 |
3959 + HFGxTR_EL2_nS2POR_EL1 |
3960 + HFGxTR_EL2_nPOR_EL1 |
3961 + HFGxTR_EL2_nPOR_EL0 |
3962 + HFGxTR_EL2_nACCDATA_EL1 |
3963 + HFGxTR_EL2_nSMPRI_EL1_MASK |
3964 + HFGxTR_EL2_nTPIDR2_EL0_MASK);
3965 +
3966 + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
3967 + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1OS|
3968 + HFGITR_EL2_TLBIRVALE1OS |
3969 + HFGITR_EL2_TLBIRVAAE1OS |
3970 + HFGITR_EL2_TLBIRVAE1OS |
3971 + HFGITR_EL2_TLBIVAALE1OS |
3972 + HFGITR_EL2_TLBIVALE1OS |
3973 + HFGITR_EL2_TLBIVAAE1OS |
3974 + HFGITR_EL2_TLBIASIDE1OS |
3975 + HFGITR_EL2_TLBIVAE1OS |
3976 + HFGITR_EL2_TLBIVMALLE1OS);
3977 +
3978 + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
3979 + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1 |
3980 + HFGITR_EL2_TLBIRVALE1 |
3981 + HFGITR_EL2_TLBIRVAAE1 |
3982 + HFGITR_EL2_TLBIRVAE1 |
3983 + HFGITR_EL2_TLBIRVAALE1IS|
3984 + HFGITR_EL2_TLBIRVALE1IS |
3985 + HFGITR_EL2_TLBIRVAAE1IS |
3986 + HFGITR_EL2_TLBIRVAE1IS |
3987 + HFGITR_EL2_TLBIRVAALE1OS|
3988 + HFGITR_EL2_TLBIRVALE1OS |
3989 + HFGITR_EL2_TLBIRVAAE1OS |
3990 + HFGITR_EL2_TLBIRVAE1OS);
3991 +
3992 + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP))
3993 + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 |
3994 + HFGxTR_EL2_nPIR_EL1);
3995 +
3996 + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
3997 + kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 |
3998 + HAFGRTR_EL2_RES1);
3999 +
4000 + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags);
4001 + out:
4002 + mutex_unlock(&kvm->arch.config_lock);
4003 + }
4004 +
4034 4005 int __init kvm_sys_reg_table_init(void)
4035 4006 {
4036 4007 struct sys_reg_params params;
4037 4008 bool valid = true;
4038 4009 unsigned int i;
4010 + int ret = 0;
4011 +
4012 + check_res_bits();
4039 4013 /* Make sure tables are unique and in order. */
4041 4015 valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false);
··· 4119 3943 valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true);
4120 3944 valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true);
4121 3945 valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false);
3946 + valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false);
4122 3947
4123 3948 if (!valid)
4124 3949 return -EINVAL;
··· 4134 3957 if (!first_idreg)
4135 3958 return -EINVAL;
4136 3959
4137 - if (kvm_get_mode() == KVM_MODE_NV)
4138 - return populate_nv_trap_config();
3960 + ret = populate_nv_trap_config();
4139 3961
4140 - return 0;
3962 + for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++)
3963 + ret = populate_sysreg_config(sys_reg_descs + i, i);
3964 +
3965 + for (i = 0; !ret && i < ARRAY_SIZE(sys_insn_descs); i++)
3966 + ret = populate_sysreg_config(sys_insn_descs + i, i);
3967 +
3968 + return ret;
4141 3969 }
+2
arch/arm64/kvm/sys_regs.h
··· 233 233 int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, 234 234 const struct sys_reg_desc table[], unsigned int num); 235 235 236 + bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); 237 + 236 238 #define AA32(_x) .aarch32_map = AA32_##_x 237 239 #define Op0(_x) .Op0 = _x 238 240 #define Op1(_x) .Op1 = _x
+1 -1
arch/arm64/kvm/vgic/vgic-debug.c
··· 149 149 seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); 150 150 seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); 151 151 if (v3) 152 - seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); 152 + seq_printf(s, "nr_lpis:\t%d\n", atomic_read(&dist->lpi_count)); 153 153 seq_printf(s, "enabled:\t%d\n", dist->enabled); 154 154 seq_printf(s, "\n"); 155 155
+7 -3
arch/arm64/kvm/vgic/vgic-init.c
··· 53 53 { 54 54 struct vgic_dist *dist = &kvm->arch.vgic; 55 55 56 - INIT_LIST_HEAD(&dist->lpi_list_head); 57 56 INIT_LIST_HEAD(&dist->lpi_translation_cache); 58 57 raw_spin_lock_init(&dist->lpi_list_lock); 58 + xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); 59 59 } 60 60 61 61 /* CREATION */ ··· 309 309 vgic_lpi_translation_cache_init(kvm); 310 310 311 311 /* 312 - * If we have GICv4.1 enabled, unconditionnaly request enable the 312 + * If we have GICv4.1 enabled, unconditionally request enable the 313 313 * v4 support so that we get HW-accelerated vSGIs. Otherwise, only 314 314 * enable it if we present a virtual ITS to the guest. 315 315 */ ··· 366 366 367 367 if (vgic_supports_direct_msis(kvm)) 368 368 vgic_v4_teardown(kvm); 369 + 370 + xa_destroy(&dist->lpi_xa); 369 371 } 370 372 371 373 static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) ··· 447 445 /* RESOURCE MAPPING */ 448 446 449 447 /** 448 + * kvm_vgic_map_resources - map the MMIO regions 449 + * @kvm: kvm struct pointer 450 + * 450 451 * Map the MMIO regions depending on the VGIC model exposed to the guest 451 452 * called on the first VCPU run. 452 453 * Also map the virtual CPU interface into the VM. 453 454 * v2 calls vgic_init() if not already done. 454 455 * v3 and derivatives return an error if the VGIC is not initialized. 455 456 * vgic_ready() returns true if this function has succeeded. 456 - * @kvm: kvm struct pointer 457 457 */ 458 458 int kvm_vgic_map_resources(struct kvm *kvm) 459 459 {
+41 -24
arch/arm64/kvm/vgic/vgic-its.c
··· 52 52 if (!irq) 53 53 return ERR_PTR(-ENOMEM); 54 54 55 - INIT_LIST_HEAD(&irq->lpi_list); 55 + ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); 56 + if (ret) { 57 + kfree(irq); 58 + return ERR_PTR(ret); 59 + } 60 + 56 61 INIT_LIST_HEAD(&irq->ap_list); 57 62 raw_spin_lock_init(&irq->irq_lock); 58 63 ··· 73 68 * There could be a race with another vgic_add_lpi(), so we need to 74 69 * check that we don't add a second list entry with the same LPI. 75 70 */ 76 - list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { 77 - if (oldirq->intid != intid) 78 - continue; 79 - 71 + oldirq = xa_load(&dist->lpi_xa, intid); 72 + if (vgic_try_get_irq_kref(oldirq)) { 80 73 /* Someone was faster with adding this LPI, lets use that. */ 81 74 kfree(irq); 82 75 irq = oldirq; 83 76 84 - /* 85 - * This increases the refcount, the caller is expected to 86 - * call vgic_put_irq() on the returned pointer once it's 87 - * finished with the IRQ. 88 - */ 89 - vgic_get_irq_kref(irq); 90 - 91 77 goto out_unlock; 92 78 } 93 79 94 - list_add_tail(&irq->lpi_list, &dist->lpi_list_head); 95 - dist->lpi_list_count++; 80 + ret = xa_err(xa_store(&dist->lpi_xa, intid, irq, 0)); 81 + if (ret) { 82 + xa_release(&dist->lpi_xa, intid); 83 + kfree(irq); 84 + goto out_unlock; 85 + } 86 + 87 + atomic_inc(&dist->lpi_count); 96 88 97 89 out_unlock: 98 90 raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 91 + 92 + if (ret) 93 + return ERR_PTR(ret); 99 94 100 95 /* 101 96 * We "cache" the configuration table entries in our struct vgic_irq's. 
··· 163 158 * @cte_esz: collection table entry size 164 159 * @dte_esz: device table entry size 165 160 * @ite_esz: interrupt translation table entry size 166 - * @save tables: save the ITS tables into guest RAM 161 + * @save_tables: save the ITS tables into guest RAM 167 162 * @restore_tables: restore the ITS internal structs from tables 168 163 * stored in guest RAM 169 164 * @commit: initialize the registers which expose the ABI settings, ··· 316 311 return 0; 317 312 } 318 313 314 + #define GIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1) 315 + 319 316 /* 320 317 * Create a snapshot of the current LPIs targeting @vcpu, so that we can 321 318 * enumerate those LPIs without holding any lock. ··· 326 319 int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) 327 320 { 328 321 struct vgic_dist *dist = &kvm->arch.vgic; 322 + XA_STATE(xas, &dist->lpi_xa, GIC_LPI_OFFSET); 329 323 struct vgic_irq *irq; 330 324 unsigned long flags; 331 325 u32 *intids; ··· 339 331 * command). If coming from another path (such as enabling LPIs), 340 332 * we must be careful not to overrun the array. 341 333 */ 342 - irq_count = READ_ONCE(dist->lpi_list_count); 334 + irq_count = atomic_read(&dist->lpi_count); 343 335 intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT); 344 336 if (!intids) 345 337 return -ENOMEM; 346 338 347 339 raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 348 - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { 340 + rcu_read_lock(); 341 + 342 + xas_for_each(&xas, irq, GIC_LPI_MAX_INTID) { 349 343 if (i == irq_count) 350 344 break; 351 345 /* We don't need to "get" the IRQ, as we hold the list lock. 
*/ ··· 355 345 continue; 356 346 intids[i++] = irq->intid; 357 347 } 348 + 349 + rcu_read_unlock(); 358 350 raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 359 351 360 352 *intid_ptr = intids; ··· 607 595 raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 608 596 609 597 irq = __vgic_its_check_cache(dist, db, devid, eventid); 610 - if (irq) 611 - vgic_get_irq_kref(irq); 598 + if (!vgic_try_get_irq_kref(irq)) 599 + irq = NULL; 612 600 613 601 raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 614 602 ··· 652 640 * was in the cache, and increment it on the new interrupt. 653 641 */ 654 642 if (cte->irq) 655 - __vgic_put_lpi_locked(kvm, cte->irq); 643 + vgic_put_irq(kvm, cte->irq); 656 644 645 + /* 646 + * The irq refcount is guaranteed to be nonzero while holding the 647 + * its_lock, as the ITE (and the reference it holds) cannot be freed. 648 + */ 649 + lockdep_assert_held(&its->its_lock); 657 650 vgic_get_irq_kref(irq); 658 651 659 652 cte->db = db; ··· 689 672 if (!cte->irq) 690 673 break; 691 674 692 - __vgic_put_lpi_locked(kvm, cte->irq); 675 + vgic_put_irq(kvm, cte->irq); 693 676 cte->irq = NULL; 694 677 } 695 678 ··· 1362 1345 } 1363 1346 1364 1347 /** 1365 - * vgic_its_invall - invalidate all LPIs targetting a given vcpu 1366 - * @vcpu: the vcpu for which the RD is targetted by an invalidation 1348 + * vgic_its_invall - invalidate all LPIs targeting a given vcpu 1349 + * @vcpu: the vcpu for which the RD is targeted by an invalidation 1367 1350 * 1368 1351 * Contrary to the INVALL command, this targets a RD instead of a 1369 1352 * collection, and we don't need to hold the its_lock, since no ITS is ··· 2161 2144 } 2162 2145 2163 2146 /** 2164 - * entry_fn_t - Callback called on a table entry restore path 2147 + * typedef entry_fn_t - Callback called on a table entry restore path 2165 2148 * @its: its handle 2166 2149 * @id: id of the entry 2167 2150 * @entry: pointer to the entry
+2 -1
arch/arm64/kvm/vgic/vgic-v3.c
··· 380 380 struct vgic_irq *irq; 381 381 gpa_t last_ptr = ~(gpa_t)0; 382 382 bool vlpi_avail = false; 383 + unsigned long index; 383 384 int ret = 0; 384 385 u8 val; 385 386 ··· 397 396 vlpi_avail = true; 398 397 } 399 398 400 - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { 399 + xa_for_each(&dist->lpi_xa, index, irq) { 401 400 int byte_offset, bit_nr; 402 401 struct kvm_vcpu *vcpu; 403 402 gpa_t pendbase, ptr;
+22 -40
arch/arm64/kvm/vgic/vgic.c
··· 30 30 * its->its_lock (mutex) 31 31 * vgic_cpu->ap_list_lock must be taken with IRQs disabled 32 32 * kvm->lpi_list_lock must be taken with IRQs disabled 33 - * vgic_irq->irq_lock must be taken with IRQs disabled 33 + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled 34 + * vgic_irq->irq_lock must be taken with IRQs disabled 34 35 * 35 36 * As the ap_list_lock might be taken from the timer interrupt handler, 36 37 * we have to disable IRQs before taking this lock and everything lower ··· 55 54 */ 56 55 57 56 /* 58 - * Iterate over the VM's list of mapped LPIs to find the one with a 59 - * matching interrupt ID and return a reference to the IRQ structure. 57 + * Index the VM's xarray of mapped LPIs and return a reference to the IRQ 58 + * structure. The caller is expected to call vgic_put_irq() later once it's 59 + * finished with the IRQ. 60 60 */ 61 61 static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) 62 62 { 63 63 struct vgic_dist *dist = &kvm->arch.vgic; 64 64 struct vgic_irq *irq = NULL; 65 - unsigned long flags; 66 65 67 - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 66 + rcu_read_lock(); 68 67 69 - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { 70 - if (irq->intid != intid) 71 - continue; 68 + irq = xa_load(&dist->lpi_xa, intid); 69 + if (!vgic_try_get_irq_kref(irq)) 70 + irq = NULL; 72 71 73 - /* 74 - * This increases the refcount, the caller is expected to 75 - * call vgic_put_irq() later once it's finished with the IRQ. 76 - */ 77 - vgic_get_irq_kref(irq); 78 - goto out_unlock; 79 - } 80 - irq = NULL; 81 - 82 - out_unlock: 83 - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 72 + rcu_read_unlock(); 84 73 85 74 return irq; 86 75 } ··· 111 120 { 112 121 } 113 122 114 - /* 115 - * Drop the refcount on the LPI. Must be called with lpi_list_lock held. 
116 - */ 117 - void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq) 118 - { 119 - struct vgic_dist *dist = &kvm->arch.vgic; 120 - 121 - if (!kref_put(&irq->refcount, vgic_irq_release)) 122 - return; 123 - 124 - list_del(&irq->lpi_list); 125 - dist->lpi_list_count--; 126 - 127 - kfree(irq); 128 - } 129 - 130 123 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) 131 124 { 132 125 struct vgic_dist *dist = &kvm->arch.vgic; ··· 119 144 if (irq->intid < VGIC_MIN_LPI) 120 145 return; 121 146 122 - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 123 - __vgic_put_lpi_locked(kvm, irq); 124 - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 147 + if (!kref_put(&irq->refcount, vgic_irq_release)) 148 + return; 149 + 150 + xa_lock_irqsave(&dist->lpi_xa, flags); 151 + __xa_erase(&dist->lpi_xa, irq->intid); 152 + xa_unlock_irqrestore(&dist->lpi_xa, flags); 153 + 154 + atomic_dec(&dist->lpi_count); 155 + kfree_rcu(irq, rcu); 125 156 } 126 157 127 158 void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) ··· 184 203 } 185 204 186 205 /** 187 - * kvm_vgic_target_oracle - compute the target vcpu for an irq 206 + * vgic_target_oracle - compute the target vcpu for an irq 188 207 * 189 208 * @irq: The irq to route. Must be already locked. 190 209 * ··· 385 404 386 405 /* 387 406 * Grab a reference to the irq to reflect the fact that it is 388 - * now in the ap_list. 407 + * now in the ap_list. This is safe as the caller must already hold a 408 + * reference on the irq. 389 409 */ 390 410 vgic_get_irq_kref(irq); 391 411 list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+12 -5
arch/arm64/kvm/vgic/vgic.h
··· 180 180 gpa_t addr, int len); 181 181 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, 182 182 u32 intid); 183 - void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq); 184 183 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); 185 184 bool vgic_get_phys_line_level(struct vgic_irq *irq); 186 185 void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); ··· 219 220 void vgic_v2_save_state(struct kvm_vcpu *vcpu); 220 221 void vgic_v2_restore_state(struct kvm_vcpu *vcpu); 221 222 223 + static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) 224 + { 225 + if (!irq) 226 + return false; 227 + 228 + if (irq->intid < VGIC_MIN_LPI) 229 + return true; 230 + 231 + return kref_get_unless_zero(&irq->refcount); 232 + } 233 + 222 234 static inline void vgic_get_irq_kref(struct vgic_irq *irq) 223 235 { 224 - if (irq->intid < VGIC_MIN_LPI) 225 - return; 226 - 227 - kref_get(&irq->refcount); 236 + WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); 228 237 } 229 238 230 239 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+1
arch/arm64/tools/cpucaps
··· 36 36 HAS_GIC_CPUIF_SYSREGS 37 37 HAS_GIC_PRIO_MASKING 38 38 HAS_GIC_PRIO_RELAXED_SYNC 39 + HAS_HCR_NV1 39 40 HAS_HCX 40 41 HAS_LDAPR 41 42 HAS_LPA2
+44 -1
arch/arm64/tools/sysreg
··· 1399 1399 UnsignedEnum 43:40 SPECRES 1400 1400 0b0000 NI 1401 1401 0b0001 IMP 1402 + 0b0010 COSP_RCTX 1402 1403 EndEnum 1403 1404 UnsignedEnum 39:36 SB 1404 1405 0b0000 NI ··· 1526 1525 EndSysreg 1527 1526 1528 1527 Sysreg ID_AA64ISAR3_EL1 3 0 0 6 3 1529 - Res0 63:12 1528 + Res0 63:16 1529 + UnsignedEnum 15:12 PACM 1530 + 0b0000 NI 1531 + 0b0001 TRIVIAL_IMP 1532 + 0b0010 FULL_IMP 1533 + EndEnum 1530 1534 UnsignedEnum 11:8 TLBIW 1531 1535 0b0000 NI 1532 1536 0b0001 IMP ··· 1828 1822 0b0000 NI 1829 1823 0b0001 IMP 1830 1824 EndEnum 1825 + EndSysreg 1826 + 1827 + Sysreg ID_AA64MMFR4_EL1 3 0 0 7 4 1828 + Res0 63:40 1829 + UnsignedEnum 39:36 E3DSE 1830 + 0b0000 NI 1831 + 0b0001 IMP 1832 + EndEnum 1833 + Res0 35:28 1834 + SignedEnum 27:24 E2H0 1835 + 0b0000 IMP 1836 + 0b1110 NI_NV1 1837 + 0b1111 NI 1838 + EndEnum 1839 + UnsignedEnum 23:20 NV_frac 1840 + 0b0000 NV_NV2 1841 + 0b0001 NV2_ONLY 1842 + EndEnum 1843 + UnsignedEnum 19:16 FGWTE3 1844 + 0b0000 NI 1845 + 0b0001 IMP 1846 + EndEnum 1847 + UnsignedEnum 15:12 HACDBS 1848 + 0b0000 NI 1849 + 0b0001 IMP 1850 + EndEnum 1851 + UnsignedEnum 11:8 ASID2 1852 + 0b0000 NI 1853 + 0b0001 IMP 1854 + EndEnum 1855 + SignedEnum 7:4 EIESB 1856 + 0b0000 NI 1857 + 0b0001 ToEL3 1858 + 0b0010 ToELx 1859 + 0b1111 ANY 1860 + EndEnum 1861 + Res0 3:0 1831 1862 EndSysreg 1832 1863 1833 1864 Sysreg SCTLR_EL1 3 0 1 0 0
-1
arch/loongarch/Kconfig
··· 133 133 select HAVE_KPROBES 134 134 select HAVE_KPROBES_ON_FTRACE 135 135 select HAVE_KRETPROBES 136 - select HAVE_KVM 137 136 select HAVE_MOD_ARCH_SPECIFIC 138 137 select HAVE_NMI 139 138 select HAVE_PCI
-2
arch/loongarch/include/uapi/asm/kvm.h
··· 14 14 * Some parts derived from the x86 version of this file. 15 15 */ 16 16 17 - #define __KVM_HAVE_READONLY_MEM 18 - 19 17 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 20 18 #define KVM_DIRTY_LOG_PAGE_OFFSET 64 21 19
+1 -1
arch/loongarch/kvm/Kconfig
··· 20 20 config KVM 21 21 tristate "Kernel-based Virtual Machine (KVM) support" 22 22 depends on AS_HAS_LVZ_EXTENSION 23 - depends on HAVE_KVM 24 23 select HAVE_KVM_DIRTY_RING_ACQ_REL 25 24 select HAVE_KVM_VCPU_ASYNC_IOCTL 26 25 select KVM_COMMON ··· 27 28 select KVM_GENERIC_HARDWARE_ENABLING 28 29 select KVM_GENERIC_MMU_NOTIFIER 29 30 select KVM_MMIO 31 + select HAVE_KVM_READONLY_MEM 30 32 select KVM_XFER_TO_GUEST_WORK 31 33 help 32 34 Support hosting virtualized guest machines using
-6
arch/loongarch/kvm/switch.S
··· 213 213 /* Save host GPRs */ 214 214 kvm_save_host_gpr a2 215 215 216 - /* Save host CRMD, PRMD to stack */ 217 - csrrd a3, LOONGARCH_CSR_CRMD 218 - st.d a3, a2, PT_CRMD 219 - csrrd a3, LOONGARCH_CSR_PRMD 220 - st.d a3, a2, PT_PRMD 221 - 222 216 addi.d a2, a1, KVM_VCPU_ARCH 223 217 st.d sp, a2, KVM_ARCH_HSP 224 218 st.d tp, a2, KVM_ARCH_HTP
+10 -33
arch/loongarch/kvm/timer.c
··· 23 23 return div_u64(tick * MNSEC_PER_SEC, vcpu->arch.timer_mhz); 24 24 } 25 25 26 - /* 27 - * Push timer forward on timeout. 28 - * Handle an hrtimer event by push the hrtimer forward a period. 29 - */ 30 - static enum hrtimer_restart kvm_count_timeout(struct kvm_vcpu *vcpu) 31 - { 32 - unsigned long cfg, period; 33 - 34 - /* Add periodic tick to current expire time */ 35 - cfg = kvm_read_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG); 36 - if (cfg & CSR_TCFG_PERIOD) { 37 - period = tick_to_ns(vcpu, cfg & CSR_TCFG_VAL); 38 - hrtimer_add_expires_ns(&vcpu->arch.swtimer, period); 39 - return HRTIMER_RESTART; 40 - } else 41 - return HRTIMER_NORESTART; 42 - } 43 - 44 26 /* Low level hrtimer wake routine */ 45 27 enum hrtimer_restart kvm_swtimer_wakeup(struct hrtimer *timer) 46 28 { ··· 32 50 kvm_queue_irq(vcpu, INT_TI); 33 51 rcuwait_wake_up(&vcpu->wait); 34 52 35 - return kvm_count_timeout(vcpu); 53 + return HRTIMER_NORESTART; 36 54 } 37 55 38 56 /* ··· 75 93 /* 76 94 * Freeze the soft-timer and sync the guest stable timer with it. 
77 95 */ 78 - hrtimer_cancel(&vcpu->arch.swtimer); 96 + if (kvm_vcpu_is_blocking(vcpu)) 97 + hrtimer_cancel(&vcpu->arch.swtimer); 79 98 80 99 /* 81 100 * From LoongArch Reference Manual Volume 1 Chapter 7.6.2 ··· 151 168 * Here judge one-shot timer fired by checking whether TVAL is larger 152 169 * than TCFG 153 170 */ 154 - if (ticks < cfg) { 171 + if (ticks < cfg) 155 172 delta = tick_to_ns(vcpu, ticks); 156 - expire = ktime_add_ns(ktime_get(), delta); 157 - vcpu->arch.expire = expire; 173 + else 174 + delta = 0; 175 + 176 + expire = ktime_add_ns(ktime_get(), delta); 177 + vcpu->arch.expire = expire; 178 + if (kvm_vcpu_is_blocking(vcpu)) { 158 179 159 180 /* 160 181 * HRTIMER_MODE_PINNED is suggested since vcpu may run in 161 182 * the same physical cpu in next time 162 183 */ 163 - hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); 164 - } else if (vcpu->stat.generic.blocking) { 165 - /* 166 - * Inject timer interrupt so that halt polling can dectect and exit. 167 - * VCPU is scheduled out already and sleeps in rcuwait queue and 168 - * will not poll pending events again. kvm_queue_irq() is not enough, 169 - * hrtimer swtimer should be used here. 170 - */ 171 - expire = ktime_add_ns(ktime_get(), 10); 172 - vcpu->arch.expire = expire; 173 184 hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); 174 185 } 175 186 }
+26 -7
arch/loongarch/kvm/vcpu.c
··· 304 304 return -EINVAL; 305 305 306 306 switch (id) { 307 - case 2: 307 + case LOONGARCH_CPUCFG0: 308 + *v = GENMASK(31, 0); 309 + return 0; 310 + case LOONGARCH_CPUCFG1: 311 + /* CPUCFG1_MSGINT is not supported by KVM */ 312 + *v = GENMASK(25, 0); 313 + return 0; 314 + case LOONGARCH_CPUCFG2: 308 315 /* CPUCFG2 features unconditionally supported by KVM */ 309 316 *v = CPUCFG2_FP | CPUCFG2_FPSP | CPUCFG2_FPDP | 310 317 CPUCFG2_FPVERS | CPUCFG2_LLFTP | CPUCFG2_LLFTPREV | 311 - CPUCFG2_LAM; 318 + CPUCFG2_LSPW | CPUCFG2_LAM; 312 319 /* 313 320 * For the ISA extensions listed below, if one is supported 314 321 * by the host, then it is also supported by KVM. ··· 326 319 *v |= CPUCFG2_LASX; 327 320 328 321 return 0; 322 + case LOONGARCH_CPUCFG3: 323 + *v = GENMASK(16, 0); 324 + return 0; 325 + case LOONGARCH_CPUCFG4: 326 + case LOONGARCH_CPUCFG5: 327 + *v = GENMASK(31, 0); 328 + return 0; 329 + case LOONGARCH_CPUCFG16: 330 + *v = GENMASK(16, 0); 331 + return 0; 332 + case LOONGARCH_CPUCFG17 ... LOONGARCH_CPUCFG20: 333 + *v = GENMASK(30, 0); 334 + return 0; 329 335 default: 330 336 /* 331 - * No restrictions on other valid CPUCFG IDs' values, but 332 - * CPUCFG data is limited to 32 bits as the LoongArch ISA 333 - * manual says (Volume 1, Section 2.2.10.5 "CPUCFG"). 337 + * CPUCFG bits should be zero if reserved by HW or not 338 + * supported by KVM. 334 339 */ 335 - *v = U32_MAX; 340 + *v = 0; 336 341 return 0; 337 342 } 338 343 } ··· 363 344 return -EINVAL; 364 345 365 346 switch (id) { 366 - case 2: 347 + case LOONGARCH_CPUCFG2: 367 348 if (!(val & CPUCFG2_LLFTP)) 368 349 /* Guests must have a constant timer */ 369 350 return -EINVAL;
+9 -9
arch/mips/Kconfig
··· 1313 1313 select CPU_SUPPORTS_HIGHMEM 1314 1314 select CPU_SUPPORTS_HUGEPAGES 1315 1315 select CPU_SUPPORTS_MSA 1316 + select CPU_SUPPORTS_VZ 1316 1317 select CPU_DIEI_BROKEN if !LOONGSON3_ENHANCEMENT 1317 1318 select CPU_MIPSR2_IRQ_VI 1318 1319 select DMA_NONCOHERENT ··· 1325 1324 select MIPS_FP_SUPPORT 1326 1325 select GPIOLIB 1327 1326 select SWIOTLB 1328 - select HAVE_KVM 1329 1327 help 1330 1328 The Loongson GSx64(GS264/GS464/GS464E/GS464V) series of processor 1331 1329 cores implements the MIPS64R2 instruction set with many extensions, ··· 1399 1399 select CPU_SUPPORTS_32BIT_KERNEL 1400 1400 select CPU_SUPPORTS_HIGHMEM 1401 1401 select CPU_SUPPORTS_MSA 1402 - select HAVE_KVM 1403 1402 help 1404 1403 Choose this option to build a kernel for release 2 or later of the 1405 1404 MIPS32 architecture. Most modern embedded systems with a 32-bit ··· 1413 1414 select CPU_SUPPORTS_32BIT_KERNEL 1414 1415 select CPU_SUPPORTS_HIGHMEM 1415 1416 select CPU_SUPPORTS_MSA 1416 - select HAVE_KVM 1417 + select CPU_SUPPORTS_VZ 1417 1418 select MIPS_O32_FP64_SUPPORT 1418 1419 help 1419 1420 Choose this option to build a kernel for release 5 or later of the ··· 1429 1430 select CPU_SUPPORTS_32BIT_KERNEL 1430 1431 select CPU_SUPPORTS_HIGHMEM 1431 1432 select CPU_SUPPORTS_MSA 1432 - select HAVE_KVM 1433 + select CPU_SUPPORTS_VZ 1433 1434 select MIPS_O32_FP64_SUPPORT 1434 1435 help 1435 1436 Choose this option to build a kernel for release 6 or later of the ··· 1465 1466 select CPU_SUPPORTS_HIGHMEM 1466 1467 select CPU_SUPPORTS_HUGEPAGES 1467 1468 select CPU_SUPPORTS_MSA 1468 - select HAVE_KVM 1469 1469 help 1470 1470 Choose this option to build a kernel for release 2 or later of the 1471 1471 MIPS64 architecture. 
Many modern embedded systems with a 64-bit ··· 1482 1484 select CPU_SUPPORTS_HUGEPAGES 1483 1485 select CPU_SUPPORTS_MSA 1484 1486 select MIPS_O32_FP64_SUPPORT if 32BIT || MIPS32_O32 1485 - select HAVE_KVM 1487 + select CPU_SUPPORTS_VZ 1486 1488 help 1487 1489 Choose this option to build a kernel for release 5 or later of the 1488 1490 MIPS64 architecture. This is a intermediate MIPS architecture ··· 1500 1502 select CPU_SUPPORTS_HUGEPAGES 1501 1503 select CPU_SUPPORTS_MSA 1502 1504 select MIPS_O32_FP64_SUPPORT if 32BIT || MIPS32_O32 1503 - select HAVE_KVM 1505 + select CPU_SUPPORTS_VZ 1504 1506 help 1505 1507 Choose this option to build a kernel for release 6 or later of the 1506 1508 MIPS64 architecture. New MIPS processors, starting with the Warrior ··· 1515 1517 select CPU_SUPPORTS_HIGHMEM 1516 1518 select CPU_SUPPORTS_MSA 1517 1519 select CPU_SUPPORTS_CPUFREQ 1520 + select CPU_SUPPORTS_VZ 1518 1521 select CPU_MIPSR2_IRQ_VI 1519 1522 select CPU_MIPSR2_IRQ_EI 1520 - select HAVE_KVM 1521 1523 select MIPS_O32_FP64_SUPPORT 1522 1524 help 1523 1525 Choose this option to build a kernel for MIPS Warrior P5600 CPU. ··· 1639 1641 select USB_EHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN 1640 1642 select USB_OHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN 1641 1643 select MIPS_L1_CACHE_SHIFT_7 1642 - select HAVE_KVM 1644 + select CPU_SUPPORTS_VZ 1643 1645 help 1644 1646 The Cavium Octeon processor is a highly integrated chip containing 1645 1647 many ethernet hardware widgets for networking tasks. The processor ··· 2032 2034 config CPU_SUPPORTS_HUGEPAGES 2033 2035 bool 2034 2036 depends on !(32BIT && (PHYS_ADDR_T_64BIT || EVA)) 2037 + config CPU_SUPPORTS_VZ 2038 + bool 2035 2039 config MIPS_PGD_C0_CONTEXT 2036 2040 bool 2037 2041 depends on 64BIT
-2
arch/mips/include/uapi/asm/kvm.h
··· 20 20 * Some parts derived from the x86 version of this file. 21 21 */ 22 22 23 - #define __KVM_HAVE_READONLY_MEM 24 - 25 23 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 26 24 27 25 /*
+2 -1
arch/mips/kvm/Kconfig
··· 17 17 18 18 config KVM 19 19 tristate "Kernel-based Virtual Machine (KVM) support" 20 - depends on HAVE_KVM 20 + depends on CPU_SUPPORTS_VZ 21 21 depends on MIPS_FP_SUPPORT 22 22 select EXPORT_UASM 23 23 select KVM_COMMON ··· 26 26 select KVM_MMIO 27 27 select KVM_GENERIC_MMU_NOTIFIER 28 28 select KVM_GENERIC_HARDWARE_ENABLING 29 + select HAVE_KVM_READONLY_MEM 29 30 help 30 31 Support for hosting Guest kernels. 31 32
+44 -1
arch/powerpc/include/uapi/asm/kvm.h
··· 28 28 #define __KVM_HAVE_PPC_SMT 29 29 #define __KVM_HAVE_IRQCHIP 30 30 #define __KVM_HAVE_IRQ_LINE 31 - #define __KVM_HAVE_GUEST_DEBUG 32 31 33 32 /* Not always available, but if it is, this is the correct offset. */ 34 33 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 ··· 731 732 732 733 #define KVM_XIVE_TIMA_PAGE_OFFSET 0 733 734 #define KVM_XIVE_ESB_PAGE_OFFSET 4 735 + 736 + /* for KVM_PPC_GET_PVINFO */ 737 + 738 + #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) 739 + 740 + struct kvm_ppc_pvinfo { 741 + /* out */ 742 + __u32 flags; 743 + __u32 hcall[4]; 744 + __u8 pad[108]; 745 + }; 746 + 747 + /* for KVM_PPC_GET_SMMU_INFO */ 748 + #define KVM_PPC_PAGE_SIZES_MAX_SZ 8 749 + 750 + struct kvm_ppc_one_page_size { 751 + __u32 page_shift; /* Page shift (or 0) */ 752 + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ 753 + }; 754 + 755 + struct kvm_ppc_one_seg_page_size { 756 + __u32 page_shift; /* Base page shift of segment (or 0) */ 757 + __u32 slb_enc; /* SLB encoding for BookS */ 758 + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; 759 + }; 760 + 761 + #define KVM_PPC_PAGE_SIZES_REAL 0x00000001 762 + #define KVM_PPC_1T_SEGMENTS 0x00000002 763 + #define KVM_PPC_NO_HASH 0x00000004 764 + 765 + struct kvm_ppc_smmu_info { 766 + __u64 flags; 767 + __u32 slb_size; 768 + __u16 data_keys; /* # storage keys supported for data */ 769 + __u16 instr_keys; /* # storage keys supported for instructions */ 770 + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; 771 + }; 772 + 773 + /* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ 774 + struct kvm_ppc_resize_hpt { 775 + __u64 flags; 776 + __u32 shift; 777 + __u32 pad; 778 + }; 734 779 735 780 #endif /* __LINUX_KVM_POWERPC_H */
-1
arch/powerpc/kvm/Kconfig
··· 22 22 select KVM_COMMON 23 23 select HAVE_KVM_VCPU_ASYNC_IOCTL 24 24 select KVM_VFIO 25 - select IRQ_BYPASS_MANAGER 26 25 select HAVE_KVM_IRQ_BYPASS 27 26 28 27 config KVM_BOOK3S_HANDLER
+1 -2
arch/powerpc/kvm/powerpc.c
··· 2538 2538 vcpu->kvm->arch.kvm_ops->create_vcpu_debugfs(vcpu, debugfs_dentry); 2539 2539 } 2540 2540 2541 - int kvm_arch_create_vm_debugfs(struct kvm *kvm) 2541 + void kvm_arch_create_vm_debugfs(struct kvm *kvm) 2542 2542 { 2543 2543 if (kvm->arch.kvm_ops->create_vm_debugfs) 2544 2544 kvm->arch.kvm_ops->create_vm_debugfs(kvm); 2545 - return 0; 2546 2545 }
+2 -1
arch/riscv/include/uapi/asm/kvm.h
··· 16 16 #include <asm/ptrace.h> 17 17 18 18 #define __KVM_HAVE_IRQ_LINE 19 - #define __KVM_HAVE_READONLY_MEM 20 19 21 20 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 22 21 ··· 165 166 KVM_RISCV_ISA_EXT_ZVFH, 166 167 KVM_RISCV_ISA_EXT_ZVFHMIN, 167 168 KVM_RISCV_ISA_EXT_ZFA, 169 + KVM_RISCV_ISA_EXT_ZTSO, 170 + KVM_RISCV_ISA_EXT_ZACAS, 168 171 KVM_RISCV_ISA_EXT_MAX, 169 172 }; 170 173
+1
arch/riscv/kvm/Kconfig
··· 24 24 select HAVE_KVM_IRQ_ROUTING 25 25 select HAVE_KVM_MSI 26 26 select HAVE_KVM_VCPU_ASYNC_IOCTL 27 + select HAVE_KVM_READONLY_MEM 27 28 select KVM_COMMON 28 29 select KVM_GENERIC_DIRTYLOG_READ_PROTECT 29 30 select KVM_GENERIC_HARDWARE_ENABLING
+13
arch/riscv/kvm/vcpu_insn.c
··· 7 7 #include <linux/bitops.h> 8 8 #include <linux/kvm_host.h> 9 9 10 + #include <asm/cpufeature.h> 11 + 10 12 #define INSN_OPCODE_MASK 0x007c 11 13 #define INSN_OPCODE_SHIFT 2 12 14 #define INSN_OPCODE_SYSTEM 28 ··· 215 213 unsigned long wr_mask); 216 214 }; 217 215 216 + static int seed_csr_rmw(struct kvm_vcpu *vcpu, unsigned int csr_num, 217 + unsigned long *val, unsigned long new_val, 218 + unsigned long wr_mask) 219 + { 220 + if (!riscv_isa_extension_available(vcpu->arch.isa, ZKR)) 221 + return KVM_INSN_ILLEGAL_TRAP; 222 + 223 + return KVM_INSN_EXIT_TO_USER_SPACE; 224 + } 225 + 218 226 static const struct csr_func csr_funcs[] = { 219 227 KVM_RISCV_VCPU_AIA_CSR_FUNCS 220 228 KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS 229 + { .base = CSR_SEED, .count = 1, .func = seed_csr_rmw }, 221 230 }; 222 231 223 232 /**
+4
arch/riscv/kvm/vcpu_onereg.c
··· 40 40 KVM_ISA_EXT_ARR(SVINVAL), 41 41 KVM_ISA_EXT_ARR(SVNAPOT), 42 42 KVM_ISA_EXT_ARR(SVPBMT), 43 + KVM_ISA_EXT_ARR(ZACAS), 43 44 KVM_ISA_EXT_ARR(ZBA), 44 45 KVM_ISA_EXT_ARR(ZBB), 45 46 KVM_ISA_EXT_ARR(ZBC), ··· 67 66 KVM_ISA_EXT_ARR(ZKSED), 68 67 KVM_ISA_EXT_ARR(ZKSH), 69 68 KVM_ISA_EXT_ARR(ZKT), 69 + KVM_ISA_EXT_ARR(ZTSO), 70 70 KVM_ISA_EXT_ARR(ZVBB), 71 71 KVM_ISA_EXT_ARR(ZVBC), 72 72 KVM_ISA_EXT_ARR(ZVFH), ··· 119 117 case KVM_RISCV_ISA_EXT_SSTC: 120 118 case KVM_RISCV_ISA_EXT_SVINVAL: 121 119 case KVM_RISCV_ISA_EXT_SVNAPOT: 120 + case KVM_RISCV_ISA_EXT_ZACAS: 122 121 case KVM_RISCV_ISA_EXT_ZBA: 123 122 case KVM_RISCV_ISA_EXT_ZBB: 124 123 case KVM_RISCV_ISA_EXT_ZBC: ··· 144 141 case KVM_RISCV_ISA_EXT_ZKSED: 145 142 case KVM_RISCV_ISA_EXT_ZKSH: 146 143 case KVM_RISCV_ISA_EXT_ZKT: 144 + case KVM_RISCV_ISA_EXT_ZTSO: 147 145 case KVM_RISCV_ISA_EXT_ZVBB: 148 146 case KVM_RISCV_ISA_EXT_ZVBC: 149 147 case KVM_RISCV_ISA_EXT_ZVFH:
-1
arch/s390/Kconfig
··· 195 195 select HAVE_KPROBES 196 196 select HAVE_KPROBES_ON_FTRACE 197 197 select HAVE_KRETPROBES 198 - select HAVE_KVM 199 198 select HAVE_LIVEPATCH 200 199 select HAVE_MEMBLOCK_PHYS_MAP 201 200 select HAVE_MOD_ARCH_SPECIFIC
+314 -1
arch/s390/include/uapi/asm/kvm.h
··· 12 12 #include <linux/types.h> 13 13 14 14 #define __KVM_S390 15 - #define __KVM_HAVE_GUEST_DEBUG 15 + 16 + struct kvm_s390_skeys { 17 + __u64 start_gfn; 18 + __u64 count; 19 + __u64 skeydata_addr; 20 + __u32 flags; 21 + __u32 reserved[9]; 22 + }; 23 + 24 + #define KVM_S390_CMMA_PEEK (1 << 0) 25 + 26 + /** 27 + * kvm_s390_cmma_log - Used for CMMA migration. 28 + * 29 + * Used both for input and output. 30 + * 31 + * @start_gfn: Guest page number to start from. 32 + * @count: Size of the result buffer. 33 + * @flags: Control operation mode via KVM_S390_CMMA_* flags 34 + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty 35 + * pages are still remaining. 36 + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set 37 + * in the PGSTE. 38 + * @values: Pointer to the values buffer. 39 + * 40 + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. 41 + */ 42 + struct kvm_s390_cmma_log { 43 + __u64 start_gfn; 44 + __u32 count; 45 + __u32 flags; 46 + union { 47 + __u64 remaining; 48 + __u64 mask; 49 + }; 50 + __u64 values; 51 + }; 52 + 53 + #define KVM_S390_RESET_POR 1 54 + #define KVM_S390_RESET_CLEAR 2 55 + #define KVM_S390_RESET_SUBSYSTEM 4 56 + #define KVM_S390_RESET_CPU_INIT 8 57 + #define KVM_S390_RESET_IPL 16 58 + 59 + /* for KVM_S390_MEM_OP */ 60 + struct kvm_s390_mem_op { 61 + /* in */ 62 + __u64 gaddr; /* the guest address */ 63 + __u64 flags; /* flags */ 64 + __u32 size; /* amount of bytes */ 65 + __u32 op; /* type of operation */ 66 + __u64 buf; /* buffer in userspace */ 67 + union { 68 + struct { 69 + __u8 ar; /* the access register number */ 70 + __u8 key; /* access key, ignored if flag unset */ 71 + __u8 pad1[6]; /* ignored */ 72 + __u64 old_addr; /* ignored if cmpxchg flag unset */ 73 + }; 74 + __u32 sida_offset; /* offset into the sida */ 75 + __u8 reserved[32]; /* ignored */ 76 + }; 77 + }; 78 + /* types for kvm_s390_mem_op->op */ 79 + #define KVM_S390_MEMOP_LOGICAL_READ 0 80 + #define KVM_S390_MEMOP_LOGICAL_WRITE 1 81 
+ #define KVM_S390_MEMOP_SIDA_READ 2 82 + #define KVM_S390_MEMOP_SIDA_WRITE 3 83 + #define KVM_S390_MEMOP_ABSOLUTE_READ 4 84 + #define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 85 + #define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 86 + 87 + /* flags for kvm_s390_mem_op->flags */ 88 + #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) 89 + #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) 90 + #define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) 91 + 92 + /* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ 93 + #define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) 94 + #define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) 95 + 96 + struct kvm_s390_psw { 97 + __u64 mask; 98 + __u64 addr; 99 + }; 100 + 101 + /* valid values for type in kvm_s390_interrupt */ 102 + #define KVM_S390_SIGP_STOP 0xfffe0000u 103 + #define KVM_S390_PROGRAM_INT 0xfffe0001u 104 + #define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u 105 + #define KVM_S390_RESTART 0xfffe0003u 106 + #define KVM_S390_INT_PFAULT_INIT 0xfffe0004u 107 + #define KVM_S390_INT_PFAULT_DONE 0xfffe0005u 108 + #define KVM_S390_MCHK 0xfffe1000u 109 + #define KVM_S390_INT_CLOCK_COMP 0xffff1004u 110 + #define KVM_S390_INT_CPU_TIMER 0xffff1005u 111 + #define KVM_S390_INT_VIRTIO 0xffff2603u 112 + #define KVM_S390_INT_SERVICE 0xffff2401u 113 + #define KVM_S390_INT_EMERGENCY 0xffff1201u 114 + #define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u 115 + /* Anything below 0xfffe0000u is taken by INT_IO */ 116 + #define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ 117 + (((schid)) | \ 118 + ((ssid) << 16) | \ 119 + ((cssid) << 18) | \ 120 + ((ai) << 26)) 121 + #define KVM_S390_INT_IO_MIN 0x00000000u 122 + #define KVM_S390_INT_IO_MAX 0xfffdffffu 123 + #define KVM_S390_INT_IO_AI_MASK 0x04000000u 124 + 125 + 126 + struct kvm_s390_interrupt { 127 + __u32 type; 128 + __u32 parm; 129 + __u64 parm64; 130 + }; 131 + 132 + struct kvm_s390_io_info { 133 + __u16 subchannel_id; 134 + __u16 subchannel_nr; 135 + __u32 io_int_parm; 136 + __u32 io_int_word; 137 + }; 
138 + 139 + struct kvm_s390_ext_info { 140 + __u32 ext_params; 141 + __u32 pad; 142 + __u64 ext_params2; 143 + }; 144 + 145 + struct kvm_s390_pgm_info { 146 + __u64 trans_exc_code; 147 + __u64 mon_code; 148 + __u64 per_address; 149 + __u32 data_exc_code; 150 + __u16 code; 151 + __u16 mon_class_nr; 152 + __u8 per_code; 153 + __u8 per_atmid; 154 + __u8 exc_access_id; 155 + __u8 per_access_id; 156 + __u8 op_access_id; 157 + #define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 158 + #define KVM_S390_PGM_FLAGS_ILC_0 0x02 159 + #define KVM_S390_PGM_FLAGS_ILC_1 0x04 160 + #define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 161 + #define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 162 + __u8 flags; 163 + __u8 pad[2]; 164 + }; 165 + 166 + struct kvm_s390_prefix_info { 167 + __u32 address; 168 + }; 169 + 170 + struct kvm_s390_extcall_info { 171 + __u16 code; 172 + }; 173 + 174 + struct kvm_s390_emerg_info { 175 + __u16 code; 176 + }; 177 + 178 + #define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 179 + struct kvm_s390_stop_info { 180 + __u32 flags; 181 + }; 182 + 183 + struct kvm_s390_mchk_info { 184 + __u64 cr14; 185 + __u64 mcic; 186 + __u64 failing_storage_address; 187 + __u32 ext_damage_code; 188 + __u32 pad; 189 + __u8 fixed_logout[16]; 190 + }; 191 + 192 + struct kvm_s390_irq { 193 + __u64 type; 194 + union { 195 + struct kvm_s390_io_info io; 196 + struct kvm_s390_ext_info ext; 197 + struct kvm_s390_pgm_info pgm; 198 + struct kvm_s390_emerg_info emerg; 199 + struct kvm_s390_extcall_info extcall; 200 + struct kvm_s390_prefix_info prefix; 201 + struct kvm_s390_stop_info stop; 202 + struct kvm_s390_mchk_info mchk; 203 + char reserved[64]; 204 + } u; 205 + }; 206 + 207 + struct kvm_s390_irq_state { 208 + __u64 buf; 209 + __u32 flags; /* will stay unused for compatibility reasons */ 210 + __u32 len; 211 + __u32 reserved[4]; /* will stay unused for compatibility reasons */ 212 + }; 213 + 214 + struct kvm_s390_ucas_mapping { 215 + __u64 user_addr; 216 + __u64 vcpu_addr; 217 + __u64 length; 218 + }; 219 + 220 + 
struct kvm_s390_pv_sec_parm { 221 + __u64 origin; 222 + __u64 length; 223 + }; 224 + 225 + struct kvm_s390_pv_unp { 226 + __u64 addr; 227 + __u64 size; 228 + __u64 tweak; 229 + }; 230 + 231 + enum pv_cmd_dmp_id { 232 + KVM_PV_DUMP_INIT, 233 + KVM_PV_DUMP_CONFIG_STOR_STATE, 234 + KVM_PV_DUMP_COMPLETE, 235 + KVM_PV_DUMP_CPU, 236 + }; 237 + 238 + struct kvm_s390_pv_dmp { 239 + __u64 subcmd; 240 + __u64 buff_addr; 241 + __u64 buff_len; 242 + __u64 gaddr; /* For dump storage state */ 243 + __u64 reserved[4]; 244 + }; 245 + 246 + enum pv_cmd_info_id { 247 + KVM_PV_INFO_VM, 248 + KVM_PV_INFO_DUMP, 249 + }; 250 + 251 + struct kvm_s390_pv_info_dump { 252 + __u64 dump_cpu_buffer_len; 253 + __u64 dump_config_mem_buffer_per_1m; 254 + __u64 dump_config_finalize_len; 255 + }; 256 + 257 + struct kvm_s390_pv_info_vm { 258 + __u64 inst_calls_list[4]; 259 + __u64 max_cpus; 260 + __u64 max_guests; 261 + __u64 max_guest_addr; 262 + __u64 feature_indication; 263 + }; 264 + 265 + struct kvm_s390_pv_info_header { 266 + __u32 id; 267 + __u32 len_max; 268 + __u32 len_written; 269 + __u32 reserved; 270 + }; 271 + 272 + struct kvm_s390_pv_info { 273 + struct kvm_s390_pv_info_header header; 274 + union { 275 + struct kvm_s390_pv_info_dump dump; 276 + struct kvm_s390_pv_info_vm vm; 277 + }; 278 + }; 279 + 280 + enum pv_cmd_id { 281 + KVM_PV_ENABLE, 282 + KVM_PV_DISABLE, 283 + KVM_PV_SET_SEC_PARMS, 284 + KVM_PV_UNPACK, 285 + KVM_PV_VERIFY, 286 + KVM_PV_PREP_RESET, 287 + KVM_PV_UNSHARE_ALL, 288 + KVM_PV_INFO, 289 + KVM_PV_DUMP, 290 + KVM_PV_ASYNC_CLEANUP_PREPARE, 291 + KVM_PV_ASYNC_CLEANUP_PERFORM, 292 + }; 293 + 294 + struct kvm_pv_cmd { 295 + __u32 cmd; /* Command to be executed */ 296 + __u16 rc; /* Ultravisor return code */ 297 + __u16 rrc; /* Ultravisor return reason code */ 298 + __u64 data; /* Data or address */ 299 + __u32 flags; /* flags for future extensions. 
Must be 0 for now */ 300 + __u32 reserved[3]; 301 + }; 302 + 303 + struct kvm_s390_zpci_op { 304 + /* in */ 305 + __u32 fh; /* target device */ 306 + __u8 op; /* operation to perform */ 307 + __u8 pad[3]; 308 + union { 309 + /* for KVM_S390_ZPCIOP_REG_AEN */ 310 + struct { 311 + __u64 ibv; /* Guest addr of interrupt bit vector */ 312 + __u64 sb; /* Guest addr of summary bit */ 313 + __u32 flags; 314 + __u32 noi; /* Number of interrupts */ 315 + __u8 isc; /* Guest interrupt subclass */ 316 + __u8 sbo; /* Offset of guest summary bit vector */ 317 + __u16 pad; 318 + } reg_aen; 319 + __u64 reserved[8]; 320 + } u; 321 + }; 322 + 323 + /* types for kvm_s390_zpci_op->op */ 324 + #define KVM_S390_ZPCIOP_REG_AEN 0 325 + #define KVM_S390_ZPCIOP_DEREG_AEN 1 326 + 327 + /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ 328 + #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) 16 329 17 330 /* Device control API: s390-specific devices */ 18 331 #define KVM_DEV_FLIC_GET_ALL_IRQS 1
-1
arch/s390/kvm/Kconfig
··· 19 19 config KVM 20 20 def_tristate y 21 21 prompt "Kernel-based Virtual Machine (KVM) support" 22 - depends on HAVE_KVM 23 22 select HAVE_KVM_CPU_RELAX_INTERCEPT 24 23 select HAVE_KVM_VCPU_ASYNC_IOCTL 25 24 select KVM_ASYNC_PF
+1 -1
arch/s390/kvm/diag.c
··· 102 102 parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) 103 103 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 104 104 105 - if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr)) 105 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, parm.token_addr)) 106 106 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 107 107 108 108 vcpu->arch.pfault_token = parm.token_addr;
+7 -7
arch/s390/kvm/gaccess.c
··· 665 665 case ASCE_TYPE_REGION1: { 666 666 union region1_table_entry rfte; 667 667 668 - if (kvm_is_error_gpa(vcpu->kvm, ptr)) 668 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) 669 669 return PGM_ADDRESSING; 670 670 if (deref_table(vcpu->kvm, ptr, &rfte.val)) 671 671 return -EFAULT; ··· 683 683 case ASCE_TYPE_REGION2: { 684 684 union region2_table_entry rste; 685 685 686 - if (kvm_is_error_gpa(vcpu->kvm, ptr)) 686 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) 687 687 return PGM_ADDRESSING; 688 688 if (deref_table(vcpu->kvm, ptr, &rste.val)) 689 689 return -EFAULT; ··· 701 701 case ASCE_TYPE_REGION3: { 702 702 union region3_table_entry rtte; 703 703 704 - if (kvm_is_error_gpa(vcpu->kvm, ptr)) 704 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) 705 705 return PGM_ADDRESSING; 706 706 if (deref_table(vcpu->kvm, ptr, &rtte.val)) 707 707 return -EFAULT; ··· 729 729 case ASCE_TYPE_SEGMENT: { 730 730 union segment_table_entry ste; 731 731 732 - if (kvm_is_error_gpa(vcpu->kvm, ptr)) 732 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) 733 733 return PGM_ADDRESSING; 734 734 if (deref_table(vcpu->kvm, ptr, &ste.val)) 735 735 return -EFAULT; ··· 749 749 ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; 750 750 } 751 751 } 752 - if (kvm_is_error_gpa(vcpu->kvm, ptr)) 752 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr)) 753 753 return PGM_ADDRESSING; 754 754 if (deref_table(vcpu->kvm, ptr, &pte.val)) 755 755 return -EFAULT; ··· 771 771 *prot = PROT_TYPE_IEP; 772 772 return PGM_PROTECTION; 773 773 } 774 - if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) 774 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, raddr.addr)) 775 775 return PGM_ADDRESSING; 776 776 *gpa = raddr.addr; 777 777 return 0; ··· 958 958 return rc; 959 959 } else { 960 960 gpa = kvm_s390_real_to_abs(vcpu, ga); 961 - if (kvm_is_error_gpa(vcpu->kvm, gpa)) { 961 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { 962 962 rc = PGM_ADDRESSING; 963 963 prot = PROT_NONE; 964 964 }
+2 -2
arch/s390/kvm/interrupt.c
··· 1031 1031 return 0; 1032 1032 } 1033 1033 ext = fi->srv_signal; 1034 - /* only clear the event bit */ 1034 + /* only clear the event bits */ 1035 1035 fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; 1036 1036 clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); 1037 1037 spin_unlock(&fi->lock); ··· 1041 1041 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, 1042 1042 ext.ext_params, 0); 1043 1043 1044 - return write_sclp(vcpu, SCCB_EVENT_PENDING); 1044 + return write_sclp(vcpu, ext.ext_params & SCCB_EVENT_PENDING); 1045 1045 } 1046 1046 1047 1047 static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu)
+3 -3
arch/s390/kvm/kvm-s390.c
··· 2878 2878 2879 2879 srcu_idx = srcu_read_lock(&kvm->srcu); 2880 2880 2881 - if (kvm_is_error_gpa(kvm, mop->gaddr)) { 2881 + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { 2882 2882 r = PGM_ADDRESSING; 2883 2883 goto out_unlock; 2884 2884 } ··· 2940 2940 2941 2941 srcu_idx = srcu_read_lock(&kvm->srcu); 2942 2942 2943 - if (kvm_is_error_gpa(kvm, mop->gaddr)) { 2943 + if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { 2944 2944 r = PGM_ADDRESSING; 2945 2945 goto out_unlock; 2946 2946 } ··· 3153 3153 */ 3154 3154 static void kvm_s390_set_crycb_format(struct kvm *kvm) 3155 3155 { 3156 - kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb; 3156 + kvm->arch.crypto.crycbd = virt_to_phys(kvm->arch.crypto.crycb); 3157 3157 3158 3158 /* Clear the CRYCB format bits - i.e., set format 0 by default */ 3159 3159 kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK);
+2 -2
arch/s390/kvm/priv.c
··· 149 149 * first page, since address is 8k aligned and memory pieces are always 150 150 * at least 1MB aligned and have at least a size of 1MB. 151 151 */ 152 - if (kvm_is_error_gpa(vcpu->kvm, address)) 152 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, address)) 153 153 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 154 154 155 155 kvm_s390_set_prefix(vcpu, address); ··· 464 464 return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); 465 465 addr = kvm_s390_real_to_abs(vcpu, addr); 466 466 467 - if (kvm_is_error_gpa(vcpu->kvm, addr)) 467 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, addr)) 468 468 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 469 469 /* 470 470 * We don't expect errors on modern systems, and do not care
+1 -1
arch/s390/kvm/sigp.c
··· 172 172 * first page, since address is 8k aligned and memory pieces are always 173 173 * at least 1MB aligned and have at least a size of 1MB. 174 174 */ 175 - if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) { 175 + if (!kvm_is_gpa_in_memslot(vcpu->kvm, irq.u.prefix.address)) { 176 176 *reg &= 0xffffffff00000000UL; 177 177 *reg |= SIGP_STATUS_INVALID_PARAMETER; 178 178 return SIGP_CC_STATUS_STORED;
-1
arch/x86/Kconfig
··· 245 245 select HAVE_FUNCTION_ERROR_INJECTION 246 246 select HAVE_KRETPROBES 247 247 select HAVE_RETHOOK 248 - select HAVE_KVM 249 248 select HAVE_LIVEPATCH if X86_64 250 249 select HAVE_MIXED_BREAKPOINTS_REGS 251 250 select HAVE_MOD_ARCH_SPECIFIC
+1 -1
arch/x86/include/asm/hardirq.h
··· 15 15 unsigned int irq_spurious_count; 16 16 unsigned int icr_read_retry_count; 17 17 #endif 18 - #ifdef CONFIG_HAVE_KVM 18 + #if IS_ENABLED(CONFIG_KVM) 19 19 unsigned int kvm_posted_intr_ipis; 20 20 unsigned int kvm_posted_intr_wakeup_ipis; 21 21 unsigned int kvm_posted_intr_nested_ipis;
+1 -1
arch/x86/include/asm/idtentry.h
··· 741 741 # endif 742 742 #endif 743 743 744 - #ifdef CONFIG_HAVE_KVM 744 + #if IS_ENABLED(CONFIG_KVM) 745 745 DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); 746 746 DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); 747 747 DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi);
+1 -1
arch/x86/include/asm/irq.h
··· 29 29 30 30 extern void fixup_irqs(void); 31 31 32 - #ifdef CONFIG_HAVE_KVM 32 + #if IS_ENABLED(CONFIG_KVM) 33 33 extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); 34 34 #endif 35 35
-2
arch/x86/include/asm/irq_vectors.h
··· 84 84 #define HYPERVISOR_CALLBACK_VECTOR 0xf3 85 85 86 86 /* Vector for KVM to deliver posted interrupt IPI */ 87 - #ifdef CONFIG_HAVE_KVM 88 87 #define POSTED_INTR_VECTOR 0xf2 89 88 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 90 89 #define POSTED_INTR_NESTED_VECTOR 0xf0 91 - #endif 92 90 93 91 #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef 94 92
-1
arch/x86/include/asm/kvm-x86-ops.h
··· 103 103 KVM_X86_OP(get_exit_info) 104 104 KVM_X86_OP(check_intercept) 105 105 KVM_X86_OP(handle_exit_irqoff) 106 - KVM_X86_OP(request_immediate_exit) 107 106 KVM_X86_OP(sched_in) 108 107 KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) 109 108 KVM_X86_OP_OPTIONAL(vcpu_blocking)
+1 -3
arch/x86/include/asm/kvm-x86-pmu-ops.h
··· 12 12 * a NULL definition, for example if "static_call_cond()" will be used 13 13 * at the call sites. 14 14 */ 15 - KVM_X86_PMU_OP(hw_event_available) 16 - KVM_X86_PMU_OP(pmc_idx_to_pmc) 17 15 KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) 18 16 KVM_X86_PMU_OP(msr_idx_to_pmc) 19 - KVM_X86_PMU_OP(is_valid_rdpmc_ecx) 17 + KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early) 20 18 KVM_X86_PMU_OP(is_valid_msr) 21 19 KVM_X86_PMU_OP(get_msr) 22 20 KVM_X86_PMU_OP(set_msr)
+22 -6
arch/x86/include/asm/kvm_host.h
··· 536 536 #define KVM_PMC_MAX_FIXED 3 537 537 #define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) 538 538 #define KVM_AMD_PMC_MAX_GENERIC 6 539 + 539 540 struct kvm_pmu { 540 541 u8 version; 541 542 unsigned nr_arch_gp_counters; ··· 1469 1468 */ 1470 1469 bool shadow_root_allocated; 1471 1470 1471 + #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING 1472 + /* 1473 + * If set, the VM has (or had) an external write tracking user, and 1474 + * thus all write tracking metadata has been allocated, even if KVM 1475 + * itself isn't using write tracking. 1476 + */ 1477 + bool external_write_tracking_enabled; 1478 + #endif 1479 + 1472 1480 #if IS_ENABLED(CONFIG_HYPERV) 1473 1481 hpa_t hv_root_tdp; 1474 1482 spinlock_t hv_root_tdp_lock; ··· 1675 1665 void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); 1676 1666 1677 1667 int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); 1678 - enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); 1668 + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, 1669 + bool force_immediate_exit); 1679 1670 int (*handle_exit)(struct kvm_vcpu *vcpu, 1680 1671 enum exit_fastpath_completion exit_fastpath); 1681 1672 int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); ··· 1743 1732 enum x86_intercept_stage stage, 1744 1733 struct x86_exception *exception); 1745 1734 void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); 1746 - 1747 - void (*request_immediate_exit)(struct kvm_vcpu *vcpu); 1748 1735 1749 1736 void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); 1750 1737 ··· 1891 1882 } 1892 1883 #endif /* CONFIG_HYPERV */ 1893 1884 1885 + enum kvm_intr_type { 1886 + /* Values are arbitrary, but must be non-zero. */ 1887 + KVM_HANDLING_IRQ = 1, 1888 + KVM_HANDLING_NMI, 1889 + }; 1890 + 1891 + /* Enable perf NMI and timer modes to work, and minimise false positives. 
*/ 1894 1892 #define kvm_arch_pmi_in_guest(vcpu) \ 1895 - ((vcpu) && (vcpu)->arch.handling_intr_from_guest) 1893 + ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \ 1894 + (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI))) 1896 1895 1897 1896 void __init kvm_mmu_x86_module_init(void); 1898 1897 int kvm_mmu_vendor_module_init(void); ··· 2065 2048 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 2066 2049 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 2067 2050 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 2068 - void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 2051 + unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); 2069 2052 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 2070 2053 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 2071 2054 int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); ··· 2258 2241 2259 2242 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); 2260 2243 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); 2261 - void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); 2262 2244 2263 2245 void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, 2264 2246 u32 size);
+4 -4
arch/x86/include/asm/svm.h
··· 358 358 struct vmcb_seg ldtr; 359 359 struct vmcb_seg idtr; 360 360 struct vmcb_seg tr; 361 - u64 vmpl0_ssp; 362 - u64 vmpl1_ssp; 363 - u64 vmpl2_ssp; 364 - u64 vmpl3_ssp; 361 + u64 pl0_ssp; 362 + u64 pl1_ssp; 363 + u64 pl2_ssp; 364 + u64 pl3_ssp; 365 365 u64 u_cet; 366 366 u8 reserved_0xc8[2]; 367 367 u8 vmpl;
+1
arch/x86/include/asm/vmxfeatures.h
··· 25 25 #define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */ 26 26 #define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */ 27 27 #define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */ 28 + #define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* 5-level EPT paging */ 28 29 29 30 /* Aggregated APIC features 24-27 */ 30 31 #define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */
+277 -8
arch/x86/include/uapi/asm/kvm.h
··· 7 7 * 8 8 */ 9 9 10 + #include <linux/const.h> 11 + #include <linux/bits.h> 10 12 #include <linux/types.h> 11 13 #include <linux/ioctl.h> 12 14 #include <linux/stddef.h> ··· 42 40 #define __KVM_HAVE_IRQ_LINE 43 41 #define __KVM_HAVE_MSI 44 42 #define __KVM_HAVE_USER_NMI 45 - #define __KVM_HAVE_GUEST_DEBUG 46 43 #define __KVM_HAVE_MSIX 47 44 #define __KVM_HAVE_MCE 48 45 #define __KVM_HAVE_PIT_STATE2 ··· 50 49 #define __KVM_HAVE_DEBUGREGS 51 50 #define __KVM_HAVE_XSAVE 52 51 #define __KVM_HAVE_XCRS 53 - #define __KVM_HAVE_READONLY_MEM 54 52 55 53 /* Architectural interrupt line count. */ 56 54 #define KVM_NR_INTERRUPTS 256 ··· 526 526 #define KVM_PMU_EVENT_ALLOW 0 527 527 #define KVM_PMU_EVENT_DENY 1 528 528 529 - #define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) 529 + #define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) 530 530 #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) 531 + 532 + /* for KVM_CAP_MCE */ 533 + struct kvm_x86_mce { 534 + __u64 status; 535 + __u64 addr; 536 + __u64 misc; 537 + __u64 mcg_status; 538 + __u8 bank; 539 + __u8 pad1[7]; 540 + __u64 pad2[3]; 541 + }; 542 + 543 + /* for KVM_CAP_XEN_HVM */ 544 + #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) 545 + #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) 546 + #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) 547 + #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) 548 + #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) 549 + #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) 550 + #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) 551 + #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) 552 + #define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) 553 + 554 + struct kvm_xen_hvm_config { 555 + __u32 flags; 556 + __u32 msr; 557 + __u64 blob_addr_32; 558 + __u64 blob_addr_64; 559 + __u8 blob_size_32; 560 + __u8 blob_size_64; 561 + __u8 pad2[30]; 562 + }; 563 + 564 + struct kvm_xen_hvm_attr { 565 + __u16 type; 566 + __u16 pad[3]; 567 + union { 568 + __u8 long_mode; 569 + 
__u8 vector; 570 + __u8 runstate_update_flag; 571 + union { 572 + __u64 gfn; 573 + #define KVM_XEN_INVALID_GFN ((__u64)-1) 574 + __u64 hva; 575 + } shared_info; 576 + struct { 577 + __u32 send_port; 578 + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ 579 + __u32 flags; 580 + #define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) 581 + #define KVM_XEN_EVTCHN_UPDATE (1 << 1) 582 + #define KVM_XEN_EVTCHN_RESET (1 << 2) 583 + /* 584 + * Events sent by the guest are either looped back to 585 + * the guest itself (potentially on a different port#) 586 + * or signalled via an eventfd. 587 + */ 588 + union { 589 + struct { 590 + __u32 port; 591 + __u32 vcpu; 592 + __u32 priority; 593 + } port; 594 + struct { 595 + __u32 port; /* Zero for eventfd */ 596 + __s32 fd; 597 + } eventfd; 598 + __u32 padding[4]; 599 + } deliver; 600 + } evtchn; 601 + __u32 xen_version; 602 + __u64 pad[8]; 603 + } u; 604 + }; 605 + 606 + 607 + /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ 608 + #define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 609 + #define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 610 + #define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 611 + /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ 612 + #define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 613 + #define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 614 + /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ 615 + #define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 616 + /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ 617 + #define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 618 + 619 + struct kvm_xen_vcpu_attr { 620 + __u16 type; 621 + __u16 pad[3]; 622 + union { 623 + __u64 gpa; 624 + #define KVM_XEN_INVALID_GPA ((__u64)-1) 625 + __u64 hva; 626 + __u64 pad[8]; 627 + struct { 628 + __u64 state; 629 + __u64 state_entry_time; 630 + __u64 time_running; 631 + __u64 time_runnable; 632 + __u64 time_blocked; 633 + __u64 time_offline; 634 + } runstate; 635 + __u32 vcpu_id; 636 + struct { 637 + 
__u32 port; 638 + __u32 priority; 639 + __u64 expires_ns; 640 + } timer; 641 + __u8 vector; 642 + } u; 643 + }; 644 + 645 + /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ 646 + #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 647 + #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 648 + #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 649 + #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 650 + #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 651 + #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 652 + /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ 653 + #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 654 + #define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 655 + #define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 656 + /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ 657 + #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 658 + 659 + /* Secure Encrypted Virtualization command */ 660 + enum sev_cmd_id { 661 + /* Guest initialization commands */ 662 + KVM_SEV_INIT = 0, 663 + KVM_SEV_ES_INIT, 664 + /* Guest launch commands */ 665 + KVM_SEV_LAUNCH_START, 666 + KVM_SEV_LAUNCH_UPDATE_DATA, 667 + KVM_SEV_LAUNCH_UPDATE_VMSA, 668 + KVM_SEV_LAUNCH_SECRET, 669 + KVM_SEV_LAUNCH_MEASURE, 670 + KVM_SEV_LAUNCH_FINISH, 671 + /* Guest migration commands (outgoing) */ 672 + KVM_SEV_SEND_START, 673 + KVM_SEV_SEND_UPDATE_DATA, 674 + KVM_SEV_SEND_UPDATE_VMSA, 675 + KVM_SEV_SEND_FINISH, 676 + /* Guest migration commands (incoming) */ 677 + KVM_SEV_RECEIVE_START, 678 + KVM_SEV_RECEIVE_UPDATE_DATA, 679 + KVM_SEV_RECEIVE_UPDATE_VMSA, 680 + KVM_SEV_RECEIVE_FINISH, 681 + /* Guest status and debug commands */ 682 + KVM_SEV_GUEST_STATUS, 683 + KVM_SEV_DBG_DECRYPT, 684 + KVM_SEV_DBG_ENCRYPT, 685 + /* Guest certificates commands */ 686 + KVM_SEV_CERT_EXPORT, 687 + /* Attestation report */ 688 + KVM_SEV_GET_ATTESTATION_REPORT, 689 + /* Guest Migration Extension */ 690 + KVM_SEV_SEND_CANCEL, 691 + 692 + KVM_SEV_NR_MAX, 693 + }; 694 + 695 + 
struct kvm_sev_cmd { 696 + __u32 id; 697 + __u64 data; 698 + __u32 error; 699 + __u32 sev_fd; 700 + }; 701 + 702 + struct kvm_sev_launch_start { 703 + __u32 handle; 704 + __u32 policy; 705 + __u64 dh_uaddr; 706 + __u32 dh_len; 707 + __u64 session_uaddr; 708 + __u32 session_len; 709 + }; 710 + 711 + struct kvm_sev_launch_update_data { 712 + __u64 uaddr; 713 + __u32 len; 714 + }; 715 + 716 + 717 + struct kvm_sev_launch_secret { 718 + __u64 hdr_uaddr; 719 + __u32 hdr_len; 720 + __u64 guest_uaddr; 721 + __u32 guest_len; 722 + __u64 trans_uaddr; 723 + __u32 trans_len; 724 + }; 725 + 726 + struct kvm_sev_launch_measure { 727 + __u64 uaddr; 728 + __u32 len; 729 + }; 730 + 731 + struct kvm_sev_guest_status { 732 + __u32 handle; 733 + __u32 policy; 734 + __u32 state; 735 + }; 736 + 737 + struct kvm_sev_dbg { 738 + __u64 src_uaddr; 739 + __u64 dst_uaddr; 740 + __u32 len; 741 + }; 742 + 743 + struct kvm_sev_attestation_report { 744 + __u8 mnonce[16]; 745 + __u64 uaddr; 746 + __u32 len; 747 + }; 748 + 749 + struct kvm_sev_send_start { 750 + __u32 policy; 751 + __u64 pdh_cert_uaddr; 752 + __u32 pdh_cert_len; 753 + __u64 plat_certs_uaddr; 754 + __u32 plat_certs_len; 755 + __u64 amd_certs_uaddr; 756 + __u32 amd_certs_len; 757 + __u64 session_uaddr; 758 + __u32 session_len; 759 + }; 760 + 761 + struct kvm_sev_send_update_data { 762 + __u64 hdr_uaddr; 763 + __u32 hdr_len; 764 + __u64 guest_uaddr; 765 + __u32 guest_len; 766 + __u64 trans_uaddr; 767 + __u32 trans_len; 768 + }; 769 + 770 + struct kvm_sev_receive_start { 771 + __u32 handle; 772 + __u32 policy; 773 + __u64 pdh_uaddr; 774 + __u32 pdh_len; 775 + __u64 session_uaddr; 776 + __u32 session_len; 777 + }; 778 + 779 + struct kvm_sev_receive_update_data { 780 + __u64 hdr_uaddr; 781 + __u32 hdr_len; 782 + __u64 guest_uaddr; 783 + __u32 guest_len; 784 + __u64 trans_uaddr; 785 + __u32 trans_len; 786 + }; 787 + 788 + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) 789 + #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) 790 
+ 791 + struct kvm_hyperv_eventfd { 792 + __u32 conn_id; 793 + __s32 fd; 794 + __u32 flags; 795 + __u32 padding[3]; 796 + }; 797 + 798 + #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff 799 + #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) 531 800 532 801 /* 533 802 * Masked event layout. ··· 818 549 ((__u64)(!!(exclude)) << 55)) 819 550 820 551 #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ 821 - (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) 822 - #define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) 823 - #define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) 824 - #define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) 552 + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) 553 + #define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) 554 + #define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) 555 + #define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) 825 556 #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) 826 557 827 558 /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ ··· 829 560 #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ 830 561 831 562 /* x86-specific KVM_EXIT_HYPERCALL flags. */ 832 - #define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) 563 + #define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) 833 564 834 565 #define KVM_X86_DEFAULT_VM 0 835 566 #define KVM_X86_SW_PROTECTED_VM 1
+1 -1
arch/x86/include/uapi/asm/kvm_para.h
··· 92 92 #define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3) 93 93 94 94 /* MSR_KVM_ASYNC_PF_INT */ 95 - #define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) 95 + #define KVM_ASYNC_PF_VEC_MASK __GENMASK(7, 0) 96 96 97 97 /* MSR_KVM_MIGRATION_CONTROL */ 98 98 #define KVM_MIGRATION_READY (1 << 0)
+2
arch/x86/kernel/cpu/feat_ctl.c
··· 72 72 c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD); 73 73 if (ept & VMX_EPT_1GB_PAGE_BIT) 74 74 c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB); 75 + if (ept & VMX_EPT_PAGE_WALK_5_BIT) 76 + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL); 75 77 76 78 /* Synthetic APIC features that are aggregates of multiple features. */ 77 79 if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+1 -1
arch/x86/kernel/idt.c
··· 153 153 #ifdef CONFIG_X86_LOCAL_APIC 154 154 INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), 155 155 INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), 156 - # ifdef CONFIG_HAVE_KVM 156 + # if IS_ENABLED(CONFIG_KVM) 157 157 INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), 158 158 INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), 159 159 INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
+2 -2
arch/x86/kernel/irq.c
··· 164 164 #if defined(CONFIG_X86_IO_APIC) 165 165 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); 166 166 #endif 167 - #ifdef CONFIG_HAVE_KVM 167 + #if IS_ENABLED(CONFIG_KVM) 168 168 seq_printf(p, "%*s: ", prec, "PIN"); 169 169 for_each_online_cpu(j) 170 170 seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); ··· 290 290 } 291 291 #endif 292 292 293 - #ifdef CONFIG_HAVE_KVM 293 + #if IS_ENABLED(CONFIG_KVM) 294 294 static void dummy_handler(void) {} 295 295 static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; 296 296
+1 -3
arch/x86/kvm/Kconfig
··· 7 7 8 8 menuconfig VIRTUALIZATION 9 9 bool "Virtualization" 10 - depends on HAVE_KVM || X86 11 10 default y 12 11 help 13 12 Say Y here to get to see options for using your Linux host to run other ··· 19 20 20 21 config KVM 21 22 tristate "Kernel-based Virtual Machine (KVM) support" 22 - depends on HAVE_KVM 23 23 depends on HIGH_RES_TIMERS 24 24 depends on X86_LOCAL_APIC 25 25 select KVM_COMMON ··· 27 29 select HAVE_KVM_PFNCACHE 28 30 select HAVE_KVM_DIRTY_RING_TSO 29 31 select HAVE_KVM_DIRTY_RING_ACQ_REL 30 - select IRQ_BYPASS_MANAGER 31 32 select HAVE_KVM_IRQ_BYPASS 32 33 select HAVE_KVM_IRQ_ROUTING 34 + select HAVE_KVM_READONLY_MEM 33 35 select KVM_ASYNC_PF 34 36 select USER_RETURN_NOTIFIER 35 37 select KVM_MMIO
+1 -2
arch/x86/kvm/debugfs.c
··· 189 189 .release = kvm_mmu_rmaps_stat_release, 190 190 }; 191 191 192 - int kvm_arch_create_vm_debugfs(struct kvm *kvm) 192 + void kvm_arch_create_vm_debugfs(struct kvm *kvm) 193 193 { 194 194 debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, 195 195 &mmu_rmaps_stat_fops); 196 - return 0; 197 196 }
+20 -27
arch/x86/kvm/emulate.c
···
     return X86EMUL_CONTINUE;
 }
 
-static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
+static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len)
 {
     struct segmented_address addr;
 
-    rsp_increment(ctxt, -bytes);
+    rsp_increment(ctxt, -len);
     addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
     addr.seg = VCPU_SREG_SS;
 
-    return segmented_write(ctxt, addr, data, bytes);
+    return segmented_write(ctxt, addr, data, len);
 }
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
 {
     /* Disable writeback. */
     ctxt->dst.type = OP_NONE;
-    return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+    return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
···
             void *dest, int len)
 {
     int rc;
-    unsigned long val, change_mask;
+    unsigned long val = 0;
+    unsigned long change_mask;
     int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
     int cpl = ctxt->ops->cpl(ctxt);
 
···
         return X86EMUL_UNHANDLEABLE;
 
     rbp = reg_read(ctxt, VCPU_REGS_RBP);
-    rc = push(ctxt, &rbp, stack_size(ctxt));
+    rc = emulate_push(ctxt, &rbp, stack_size(ctxt));
     if (rc != X86EMUL_CONTINUE)
         return rc;
     assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
···
 static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
 {
     int seg = ctxt->src2.val;
-    unsigned long selector;
+    unsigned long selector = 0;
     int rc;
 
     rc = emulate_pop(ctxt, &selector, 2);
···
 {
     int rc = X86EMUL_CONTINUE;
     int reg = VCPU_REGS_RDI;
-    u32 val;
+    u32 val = 0;
 
     while (reg >= VCPU_REGS_RAX) {
         if (reg == VCPU_REGS_RSP) {
···
 static int em_ret(struct x86_emulate_ctxt *ctxt)
 {
     int rc;
-    unsigned long eip;
+    unsigned long eip = 0;
 
     rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
     if (rc != X86EMUL_CONTINUE)
···
 static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
     int rc;
-    unsigned long eip, cs;
+    unsigned long eip = 0;
+    unsigned long cs = 0;
     int cpl = ctxt->ops->cpl(ctxt);
     struct desc_struct new_desc;
 
···
         ret = em_push(ctxt);
     }
 
-    ops->get_dr(ctxt, 7, &dr7);
+    dr7 = ops->get_dr(ctxt, 7);
     ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
 
     return ret;
···
 static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 {
     int rc;
-    unsigned long eip;
+    unsigned long eip = 0;
 
     rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
     if (rc != X86EMUL_CONTINUE)
···
     return X86EMUL_CONTINUE;
 }
 
-static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
-{
-    unsigned long dr7;
-
-    ctxt->ops->get_dr(ctxt, 7, &dr7);
-
-    return dr7 & DR7_GD;
-}
-
 static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 {
     int dr = ctxt->modrm_reg;
···
     if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
         return emulate_ud(ctxt);
 
-    if (check_dr7_gd(ctxt)) {
+    if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) {
         ulong dr6;
 
-        ctxt->ops->get_dr(ctxt, 6, &dr6);
+        dr6 = ctxt->ops->get_dr(ctxt, 6);
         dr6 &= ~DR_TRAP_BITS;
         dr6 |= DR6_BD | DR6_ACTIVE_LOW;
         ctxt->ops->set_dr(ctxt, 6, dr6);
···
      * protected mode.
      */
     if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
-        ctxt->ops->check_pmc(ctxt, rcx))
+        ctxt->ops->check_rdpmc_early(ctxt, rcx))
         return emulate_gp(ctxt, 0);
 
     return X86EMUL_CONTINUE;
···
 };
 
 static const struct gprefix three_byte_0f_38_f0 = {
-    ID(0, &instr_dual_0f_38_f0), N, N, N
+    ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N
 };
 
 static const struct gprefix three_byte_0f_38_f1 = {
-    ID(0, &instr_dual_0f_38_f1), N, N, N
+    ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N
 };
 
 /*
···
         ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
         break;
     case 0x21: /* mov from dr to reg */
-        ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
+        ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg);
         break;
     case 0x40 ... 0x4f: /* cmov */
         if (test_cc(ctxt->b, ctxt->eflags))
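The recurring `unsigned long eip = 0;`-style changes above exist because emulate_pop() can fail without ever writing to its output pointer, so callers must not consume an indeterminate value. A minimal userspace sketch of that pattern (all names here are hypothetical, not the kernel's):

```c
#include <assert.h>

/* Stand-ins for X86EMUL_CONTINUE / failure. */
#define EMUL_CONTINUE 0
#define EMUL_FAIL     1

/* Like emulate_pop(): on failure, *dest is deliberately left untouched. */
static int demo_pop(int fail, unsigned long *dest)
{
    if (fail)
        return EMUL_FAIL;
    *dest = 0xdeadbeef;
    return EMUL_CONTINUE;
}

/* Like em_ret(): zero-initialize the local so it is well-defined even
 * when demo_pop() bails out early without writing to it. */
static unsigned long demo_ret(int fail)
{
    unsigned long eip = 0;

    if (demo_pop(fail, &eip) != EMUL_CONTINUE)
        return 0;        /* eip is still a defined value here */
    return eip;
}
```

Without the `= 0`, the failure path would read an uninitialized stack variable, which is undefined behavior even if the value is discarded later.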
+2 -2
arch/x86/kvm/kvm_emulate.h
···
     ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
     int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
     int (*cpl)(struct x86_emulate_ctxt *ctxt);
-    void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+    ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr);
     int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
     int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
     int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
     int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
-    int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+    int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
     int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
     void (*halt)(struct x86_emulate_ctxt *ctxt);
     void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+30 -2
arch/x86/kvm/lapic.c
···
 #include "ioapic.h"
 #include "trace.h"
 #include "x86.h"
+#include "xen.h"
 #include "cpuid.h"
 #include "hyperv.h"
 #include "smm.h"
···
 {
     return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
+
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
 
 __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
 __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
···
 }
 
     /* Check if there are APF page ready requests pending */
-    if (enabled)
+    if (enabled) {
         kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+        kvm_xen_sw_enable_lapic(apic->vcpu);
+    }
 }
 
 static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
···
 {
     struct kvm_lapic *apic = vcpu->arch.apic;
 
-    if (!vcpu->arch.apic)
+    if (!vcpu->arch.apic) {
+        static_branch_dec(&kvm_has_noapic_vcpu);
         return;
+    }
 
     hrtimer_cancel(&apic->lapic_timer.timer);
···
 
     ASSERT(vcpu != NULL);
 
+    if (!irqchip_in_kernel(vcpu->kvm)) {
+        static_branch_inc(&kvm_has_noapic_vcpu);
+        return 0;
+    }
+
     apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
     if (!apic)
         goto nomem;
···
     vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
     static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
     kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+
+    /*
+     * Defer evaluating inhibits until the vCPU is first run, as this vCPU
+     * will not get notified of any changes until this vCPU is visible to
+     * other vCPUs (marked online and added to the set of vCPUs).
+     *
+     * Opportunistically mark APICv active as VMX in particular is highly
+     * unlikely to have inhibits. Ignore the current per-VM APICv state so
+     * that vCPU creation is guaranteed to run with a deterministic value,
+     * the request will ensure the vCPU gets the correct state before VM-Entry.
+     */
+    if (enable_apicv) {
+        apic->apicv_active = true;
+        kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+    }
 
     return 0;
 nomem_free_apic:
+24 -13
arch/x86/kvm/mmu/mmu.c
···
     if (WARN_ON_ONCE(!sp))
         return;
 
-    if (is_tdp_mmu_page(sp))
+    if (is_tdp_mmu_page(sp)) {
+        lockdep_assert_held_read(&kvm->mmu_lock);
         kvm_tdp_mmu_put_root(kvm, sp);
-    else if (!--sp->root_count && sp->role.invalid)
-        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+    } else {
+        lockdep_assert_held_write(&kvm->mmu_lock);
+        if (!--sp->root_count && sp->role.invalid)
+            kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+    }
 
     *root_hpa = INVALID_PAGE;
 }
···
 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
                         ulong roots_to_free)
 {
+    bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
     int i;
     LIST_HEAD(invalid_list);
     bool free_active_root;
···
         return;
     }
 
-    write_lock(&kvm->mmu_lock);
+    if (is_tdp_mmu)
+        read_lock(&kvm->mmu_lock);
+    else
+        write_lock(&kvm->mmu_lock);
 
     for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
         if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
···
         mmu->root.pgd = 0;
     }
 
-    kvm_mmu_commit_zap_page(kvm, &invalid_list);
-    write_unlock(&kvm->mmu_lock);
+    if (is_tdp_mmu) {
+        read_unlock(&kvm->mmu_lock);
+        WARN_ON_ONCE(!list_empty(&invalid_list));
+    } else {
+        kvm_mmu_commit_zap_page(kvm, &invalid_list);
+        write_unlock(&kvm->mmu_lock);
+    }
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
 
···
     unsigned i;
     int r;
 
+    if (tdp_mmu_enabled)
+        return kvm_tdp_mmu_alloc_root(vcpu);
+
     write_lock(&vcpu->kvm->mmu_lock);
     r = make_mmu_pages_available(vcpu);
     if (r < 0)
         goto out_unlock;
 
-    if (tdp_mmu_enabled) {
-        root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-        mmu->root.hpa = root;
-    } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+    if (shadow_root_level >= PT64_ROOT_4LEVEL) {
         root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
         mmu->root.hpa = root;
     } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
···
 
     kvm_mmu_reset_all_pte_masks();
 
-    pte_list_desc_cache = kmem_cache_create("pte_list_desc",
-                                            sizeof(struct pte_list_desc),
-                                            0, SLAB_ACCOUNT, NULL);
+    pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
     if (!pte_list_desc_cache)
         goto out;
+66 -2
arch/x86/kvm/mmu/page_track.c
···
 #include "mmu_internal.h"
 #include "page_track.h"
 
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+    /*
+     * Read external_write_tracking_enabled before related pointers. Pairs
+     * with the smp_store_release in kvm_page_track_write_tracking_enable().
+     */
+    return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+    return false;
+#endif
+}
+
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
 {
-    return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
-           !tdp_enabled || kvm_shadow_root_allocated(kvm);
+    return kvm_external_write_tracking_enabled(kvm) ||
+           kvm_shadow_root_allocated(kvm) || !tdp_enabled;
 }
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
···
     return init_srcu_struct(&head->track_srcu);
 }
 
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+    struct kvm_memslots *slots;
+    struct kvm_memory_slot *slot;
+    int r = 0, i, bkt;
+
+    mutex_lock(&kvm->slots_arch_lock);
+
+    /*
+     * Check for *any* write tracking user (not just external users) under
+     * lock. This avoids unnecessary work, e.g. if KVM itself is using
+     * write tracking, or if two external users raced when registering.
+     */
+    if (kvm_page_track_write_tracking_enabled(kvm))
+        goto out_success;
+
+    for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+        slots = __kvm_memslots(kvm, i);
+        kvm_for_each_memslot(slot, bkt, slots) {
+            /*
+             * Intentionally do NOT free allocations on failure to
+             * avoid having to track which allocations were made
+             * now versus when the memslot was created. The
+             * metadata is guaranteed to be freed when the slot is
+             * freed, and will be kept/used if userspace retries
+             * the failed ioctl() instead of killing the VM.
+             */
+            r = kvm_page_track_write_tracking_alloc(slot);
+            if (r)
+                goto out_unlock;
+        }
+    }
+
+out_success:
+    /*
+     * Ensure that external_write_tracking_enabled becomes true strictly
+     * after all the related pointers are set.
+     */
+    smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+    mutex_unlock(&kvm->slots_arch_lock);
+    return r;
+}
+
 /*
  * register the notifier so that event interception for the tracked guest
  * pages can be received.
···
             struct kvm_page_track_notifier_node *n)
 {
     struct kvm_page_track_notifier_head *head;
+    int r;
 
     if (!kvm || kvm->mm != current->mm)
         return -ESRCH;
+
+    if (!kvm_external_write_tracking_enabled(kvm)) {
+        r = kvm_enable_external_write_tracking(kvm);
+        if (r)
+            return r;
+    }
 
     kvm_get_kvm(kvm);
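The page_track.c change above publishes per-memslot allocations first and only then sets `external_write_tracking_enabled` with release semantics, so any reader that observes the flag with acquire semantics is guaranteed to also see the allocations. The same pairing can be sketched in userspace with C11 atomics (the struct and names are hypothetical stand-ins, not KVM's):

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

struct tracker {
    int payload;            /* stands in for the per-memslot allocations */
    atomic_bool enabled;    /* stands in for external_write_tracking_enabled */
};

/* Writer: initialize the payload first, then set the flag with a release
 * store, mirroring smp_store_release() in the patch above. */
static void tracker_enable(struct tracker *t, int value)
{
    t->payload = value;
    atomic_store_explicit(&t->enabled, true, memory_order_release);
}

/* Reader: acquire load pairs with the release store, mirroring
 * smp_load_acquire(); if it returns true, payload is visible. */
static bool tracker_enabled(struct tracker *t)
{
    return atomic_load_explicit(&t->enabled, memory_order_acquire);
}
```

The ordering matters only across threads, but the single-threaded behavior is the obvious one: the flag is false until `tracker_enable()` runs.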
+87 -37
arch/x86/kvm/mmu/tdp_mmu.c
···
  * If shared is set, this function is operating under the MMU lock in read
  * mode.
  */
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
-    for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);            \
-         ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;          \
-         _root = tdp_mmu_next_root(_kvm, _root, _only_valid))           \
-        if (kvm_mmu_page_as_id(_root) != _as_id) {                      \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
+    for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);            \
+         ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;          \
+         _root = tdp_mmu_next_root(_kvm, _root, _only_valid))           \
+        if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {       \
         } else
 
 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
···
  * Holding mmu_lock for write obviates the need for RCU protection as the list
  * is guaranteed to be stable.
  */
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id)                      \
-    list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)         \
-        if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&            \
-            kvm_mmu_page_as_id(_root) != _as_id) {                      \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid)       \
+    list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)         \
+        if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&            \
+            ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||    \
+             ((_only_valid) && (_root)->role.invalid))) {               \
         } else
+
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+    __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
+    __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
 
 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
 {
···
     tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
 }
 
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
 {
-    union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+    struct kvm_mmu *mmu = vcpu->arch.mmu;
+    union kvm_mmu_page_role role = mmu->root_role;
+    int as_id = kvm_mmu_role_as_id(role);
     struct kvm *kvm = vcpu->kvm;
     struct kvm_mmu_page *root;
 
-    lockdep_assert_held_write(&kvm->mmu_lock);
+    /*
+     * Check for an existing root before acquiring the pages lock to avoid
+     * unnecessary serialization if multiple vCPUs are loading a new root.
+     * E.g. when bringing up secondary vCPUs, KVM will already have created
+     * a valid root on behalf of the primary vCPU.
+     */
+    read_lock(&kvm->mmu_lock);
+
+    for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+        if (root->role.word == role.word)
+            goto out_read_unlock;
+    }
+
+    spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 
     /*
-     * Check for an existing root before allocating a new one. Note, the
-     * role check prevents consuming an invalid root.
+     * Recheck for an existing root after acquiring the pages lock, another
+     * vCPU may have raced ahead and created a new usable root. Manually
+     * walk the list of roots as the standard macros assume that the pages
+     * lock is *not* held. WARN if grabbing a reference to a usable root
+     * fails, as the last reference to a root can only be put *after* the
+     * root has been invalidated, which requires holding mmu_lock for write.
      */
-    for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+    list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
         if (root->role.word == role.word &&
-            kvm_tdp_mmu_get_root(root))
-            goto out;
+            !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+            goto out_spin_unlock;
     }
 
     root = tdp_mmu_alloc_sp(vcpu);
···
      * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
      */
     refcount_set(&root->tdp_mmu_root_count, 2);
-
-    spin_lock(&kvm->arch.tdp_mmu_pages_lock);
     list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
-    spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
-out:
-    return __pa(root->spt);
+out_spin_unlock:
+    spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+    read_unlock(&kvm->mmu_lock);
+    /*
+     * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+     * and actually consuming the root if it's invalidated after dropping
+     * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+     */
+    mmu->root.hpa = __pa(root->spt);
+    mmu->root.pgd = 0;
+    return 0;
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
···
     rcu_read_lock();
 
     /*
-     * To avoid RCU stalls due to recursively removing huge swaths of SPs,
-     * split the zap into two passes. On the first pass, zap at the 1gb
-     * level, and then zap top-level SPs on the second pass. "1gb" is not
-     * arbitrary, as KVM must be able to zap a 1gb shadow page without
-     * inducing a stall to allow in-place replacement with a 1gb hugepage.
+     * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+     * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+     * preempt models) or mmu_lock contention (full or real-time models).
+     * Zapping at finer granularity marginally increases the total time of
+     * the zap, but in most cases the zap itself isn't latency sensitive.
      *
-     * Because zapping a SP recurses on its children, stepping down to
-     * PG_LEVEL_4K in the iterator itself is unnecessary.
+     * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+     * in order to mimic the page fault path, which can replace a 1GiB page
+     * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+     * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
+     * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+     * inducing RCU stalls, without relying on a relatively rare event
+     * (zapping roots is orders of magnitude more common). Note, because
+     * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+     * in the iterator itself is unnecessary.
      */
+    if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+        __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+        __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+    }
     __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
     __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
 
···
             continue;
 
         tdp_mmu_iter_set_spte(kvm, &iter, 0);
-        flush = true;
+
+        /*
+         * Zapping SPTEs in invalid roots doesn't require a TLB flush,
+         * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+         */
+        if (!root->role.invalid)
+            flush = true;
     }
 
     rcu_read_unlock();
···
 }
 
 /*
- * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
- * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
- * more SPTEs were zapped since the MMU lock was last acquired.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
  */
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
 {
     struct kvm_mmu_page *root;
 
     lockdep_assert_held_write(&kvm->mmu_lock);
-    for_each_tdp_mmu_root_yield_safe(kvm, root)
+    for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
         flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 
     return flush;
···
  * the VM is being destroyed).
  *
  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
- * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ * See kvm_tdp_mmu_alloc_root().
  */
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 {
···
 {
     struct kvm_mmu_page *root;
 
-    for_each_tdp_mmu_root(kvm, root, slot->as_id)
+    for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
         clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
 }
 
···
     bool spte_set = false;
 
     lockdep_assert_held_write(&kvm->mmu_lock);
-    for_each_tdp_mmu_root(kvm, root, slot->as_id)
+    for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
         spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
 
     return spte_set;
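The kvm_tdp_mmu_alloc_root() rework above is a double-checked lookup: search for a usable root optimistically (under the read lock), then take the inner lock and re-check before allocating, so concurrent vCPUs converge on one shared root. A minimal single-threaded sketch of that control flow, using a plain mutex and a hypothetical fixed-size cache (not KVM's data structures):

```c
#include <assert.h>
#include <pthread.h>
#include <stddef.h>

#define MAX_ROOTS 8

struct root_cache {
    pthread_mutex_t lock;
    unsigned long roles[MAX_ROOTS];
    size_t nr;
};

static size_t find_root(struct root_cache *c, unsigned long role)
{
    for (size_t i = 0; i < c->nr; i++)
        if (c->roles[i] == role)
            return i;
    return (size_t)-1;
}

/* Returns the index of the root for @role, creating it if needed. */
static size_t get_or_alloc_root(struct root_cache *c, unsigned long role)
{
    /* Optimistic check without the lock; a real implementation must make
     * this read safe, e.g. KVM holds mmu_lock for read and uses RCU. */
    size_t idx = find_root(c, role);
    if (idx != (size_t)-1)
        return idx;

    pthread_mutex_lock(&c->lock);
    /* Re-check under the lock: another caller may have raced ahead. */
    idx = find_root(c, role);
    if (idx == (size_t)-1) {
        idx = c->nr;
        c->roles[c->nr++] = role;
    }
    pthread_mutex_unlock(&c->lock);
    return idx;
}
```

The payoff, as the patch's comment notes, is that secondary vCPUs bringing up the same role never serialize on the allocation path once the primary vCPU has created the root.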
+1 -1
arch/x86/kvm/mmu/tdp_mmu.h
···
 void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
 
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
 
 __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
 {
+103 -60
arch/x86/kvm/pmu.c
···
 struct x86_pmu_capability __read_mostly kvm_pmu_cap;
 EXPORT_SYMBOL_GPL(kvm_pmu_cap);
 
+struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
+EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
+
 /* Precise Distribution of Instructions Retired (PDIR) */
 static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
     X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
···
  * all perf counters (both gp and fixed). The mapping relationship
  * between pmc and perf counters is as the following:
  * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
- *          [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ *          [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
  * * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
  *          and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
···
 static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
                                    int idx)
 {
-    int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+    int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
 
     if (filter->action == KVM_PMU_EVENT_DENY &&
         test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
···
 static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
 {
     return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
-           static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
            check_pmu_event_filter(pmc);
 }
 
-static void reprogram_counter(struct kvm_pmc *pmc)
+static int reprogram_counter(struct kvm_pmc *pmc)
 {
     struct kvm_pmu *pmu = pmc_to_pmu(pmc);
     u64 eventsel = pmc->eventsel;
···
     emulate_overflow = pmc_pause_counter(pmc);
 
     if (!pmc_event_is_allowed(pmc))
-        goto reprogram_complete;
+        return 0;
 
     if (emulate_overflow)
         __kvm_perf_overflow(pmc, false);
···
     if (pmc_is_fixed(pmc)) {
         fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
-                                          pmc->idx - INTEL_PMC_IDX_FIXED);
+                                          pmc->idx - KVM_FIXED_PMC_BASE_IDX);
         if (fixed_ctr_ctrl & 0x1)
             eventsel |= ARCH_PERFMON_EVENTSEL_OS;
         if (fixed_ctr_ctrl & 0x2)
···
     }
 
     if (pmc->current_config == new_config && pmc_resume_counter(pmc))
-        goto reprogram_complete;
+        return 0;
 
     pmc_release_perf_event(pmc);
 
     pmc->current_config = new_config;
 
-    /*
-     * If reprogramming fails, e.g. due to contention, leave the counter's
-     * reprogram bit set, i.e. opportunistically try again on the next PMU
-     * refresh. Don't make a new request as doing so can stall the guest
-     * if reprogramming repeatedly fails.
-     */
-    if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
-                              (eventsel & pmu->raw_event_mask),
-                              !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
-                              !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-                              eventsel & ARCH_PERFMON_EVENTSEL_INT))
-        return;
-
-reprogram_complete:
-    clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+    return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+                                 (eventsel & pmu->raw_event_mask),
+                                 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+                                 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+                                 eventsel & ARCH_PERFMON_EVENTSEL_INT);
 }
 
 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 {
+    DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
     struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+    struct kvm_pmc *pmc;
     int bit;
 
-    for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
-        struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
+    bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
 
-        if (unlikely(!pmc)) {
-            clear_bit(bit, pmu->reprogram_pmi);
-            continue;
-        }
+    /*
+     * The reprogramming bitmap can be written asynchronously by something
+     * other than the task that holds vcpu->mutex, take care to clear only
+     * the bits that will actually be processed.
+     */
+    BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
+    atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
 
-        reprogram_counter(pmc);
+    kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
+        /*
+         * If reprogramming fails, e.g. due to contention, re-set the
+         * reprogram bit, i.e. opportunistically try again on the next
+         * PMU refresh. Don't make a new request as doing so can stall
+         * the guest if reprogramming repeatedly fails.
+         */
+        if (reprogram_counter(pmc))
+            set_bit(pmc->idx, pmu->reprogram_pmi);
     }
 
     /*
···
         kvm_pmu_cleanup(vcpu);
 }
 
-/* check if idx is a valid index to access PMU */
-bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
 {
-    return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
+    /*
+     * On Intel, VMX interception has priority over RDPMC exceptions that
+     * aren't already handled by the emulator, i.e. there are no additional
+     * checks needed for Intel PMUs.
+     *
+     * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
+     * i.e. an invalid PMC results in a #GP, not #VMEXIT.
+     */
+    if (!kvm_pmu_ops.check_rdpmc_early)
+        return 0;
+
+    return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
 }
 
 bool is_vmware_backdoor_pmc(u32 pmc_idx)
···
 
 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 {
-    bool fast_mode = idx & (1u << 31);
     struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
     struct kvm_pmc *pmc;
-    u64 mask = fast_mode ? ~0u : ~0ull;
+    u64 mask = ~0ull;
 
     if (!pmu->version)
         return 1;
···
 
     bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
 
-    for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
-        pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
-        if (!pmc)
-            continue;
-
+    kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
         pmc_stop_counter(pmc);
         pmc->counter = 0;
         pmc->emulated_counter = 0;
···
  */
 void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 {
+    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
     if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
         return;
 
···
      */
     kvm_pmu_reset(vcpu);
 
-    bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
-    static_call(kvm_x86_pmu_refresh)(vcpu);
+    pmu->version = 0;
+    pmu->nr_arch_gp_counters = 0;
+    pmu->nr_arch_fixed_counters = 0;
+    pmu->counter_bitmask[KVM_PMC_GP] = 0;
+    pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+    pmu->reserved_bits = 0xffffffff00200000ull;
+    pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+    pmu->global_ctrl_mask = ~0ull;
+    pmu->global_status_mask = ~0ull;
+    pmu->fixed_ctr_ctrl_mask = ~0ull;
+    pmu->pebs_enable_mask = ~0ull;
+    pmu->pebs_data_cfg_mask = ~0ull;
+    bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+
+    if (vcpu->kvm->arch.enable_pmu)
+        static_call(kvm_x86_pmu_refresh)(vcpu);
 }
 
 void kvm_pmu_init(struct kvm_vcpu *vcpu)
···
     bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
                   pmu->pmc_in_use, X86_PMC_IDX_MAX);
 
-    for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
-        pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
-
-        if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
+    kvm_for_each_pmc(pmu, pmc, i, bitmask) {
+        if (pmc->perf_event && !pmc_speculative_in_use(pmc))
             pmc_stop_counter(pmc);
     }
 
···
     kvm_pmu_request_counter_reprogram(pmc);
 }
 
-static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
-                                             unsigned int perf_hw_id)
-{
-    return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
-             AMD64_RAW_EVENT_MASK_NB);
-}
-
 static inline bool cpl_is_matched(struct kvm_pmc *pmc)
 {
     bool select_os, select_user;
···
         select_user = config & ARCH_PERFMON_EVENTSEL_USR;
     } else {
         config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
-                                  pmc->idx - INTEL_PMC_IDX_FIXED);
+                                  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
         select_os = config & 0x1;
         select_user = config & 0x2;
     }
 
+    /*
+     * Skip the CPL lookup, which isn't free on Intel, if the result will
+     * be the same regardless of the CPL.
+     */
+    if (select_os == select_user)
+        return select_os;
+
     return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
 }
 
-void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
 {
+    DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
     struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
     struct kvm_pmc *pmc;
     int i;
 
-    for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
-        pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+    BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
 
-        if (!pmc || !pmc_event_is_allowed(pmc))
+    if (!kvm_pmu_has_perf_global_ctrl(pmu))
+        bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+    else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
+                         (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
+        return;
+
+    kvm_for_each_pmc(pmu, pmc, i, bitmap) {
+        /*
+         * Ignore checks for edge detect (all events currently emulated
+         * by KVM are always rising edges), pin control (unsupported
+         * by modern CPUs), and counter mask and its invert flag (KVM
+         * doesn't emulate multiple events in a single clock cycle).
+         *
+         * Note, the uppermost nibble of AMD's mask overlaps Intel's
+         * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
+         * bits (bits 35:34). Checking the "in HLE/RTM transaction"
+         * flags is correct as the vCPU can't be in a transaction if
+         * KVM is emulating an instruction. Checking the reserved bits
+         * might be wrong if they are defined in the future, but so
+         * could ignoring them, so do the simple thing for now.
+         */
+        if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
+            !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
             continue;
 
-        /* Ignore checks for edge detect, pin control, invert and CMASK bits */
-        if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
-            kvm_pmu_incr_counter(pmc);
+        kvm_pmu_incr_counter(pmc);
     }
 }
 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
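The kvm_pmu_handle_event() change above snapshots the pending-reprogram bitmap and then atomically clears *only the captured bits* (`atomic64_andnot`), so bits set asynchronously after the snapshot are not lost. The same claim-and-clear idiom can be sketched with C11 atomics (the function name is hypothetical):

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

/* Consumer side: snapshot the pending bits, then clear exactly those bits.
 * Any bit a producer sets between the load and the fetch_and survives,
 * mirroring the atomic64_andnot() in kvm_pmu_handle_event(). */
static uint64_t claim_pending(_Atomic uint64_t *pending)
{
    uint64_t claimed = atomic_load(pending);

    /* ~claimed keeps every bit we did NOT capture. */
    atomic_fetch_and(pending, ~claimed);
    return claimed;
}
```

The alternative, a plain `atomic_exchange(pending, 0)`, would also be race-free here; the andnot form is shown because it matches the patch, where the consumer must preserve bits it re-sets for failed reprogramming attempts.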
+51 -6
arch/x86/kvm/pmu.h
··· 4 4 5 5 #include <linux/nospec.h> 6 6 7 + #include <asm/kvm_host.h> 8 + 7 9 #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) 8 10 #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) 9 11 #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) ··· 20 18 #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 21 19 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 22 20 21 + #define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED 22 + 23 + struct kvm_pmu_emulated_event_selectors { 24 + u64 INSTRUCTIONS_RETIRED; 25 + u64 BRANCH_INSTRUCTIONS_RETIRED; 26 + }; 27 + 23 28 struct kvm_pmu_ops { 24 - bool (*hw_event_available)(struct kvm_pmc *pmc); 25 - struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); 26 29 struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, 27 30 unsigned int idx, u64 *mask); 28 31 struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); 29 - bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); 32 + int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx); 30 33 bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); 31 34 int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); 32 35 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); ··· 61 54 */ 62 55 return pmu->version > 1; 63 56 } 57 + 58 + /* 59 + * KVM tracks all counters in 64-bit bitmaps, with general purpose counters 60 + * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0 61 + * is tracked internally via index 32. On Intel (AMD doesn't support fixed 62 + * counters), this mirrors how fixed counters are mapped to PERF_GLOBAL_CTRL 63 + * and similar MSRs, i.e. tracking fixed counters at base index 32 reduces the 64 + * amount of boilerplate needed to iterate over PMCs *and* simplifies common 65 + * enable/disable/reset operations. 66 + * 67 + * WARNING! This helper is only for lookups that are initiated by KVM, it is 68 + * NOT safe for guest lookups, e.g. 
will do the wrong thing if passed a raw 69 + * ECX value from RDPMC (fixed counters are accessed by setting bit 30 in ECX 70 + * for RDPMC, not by adding 32 to the fixed counter index). 71 + */ 72 + static inline struct kvm_pmc *kvm_pmc_idx_to_pmc(struct kvm_pmu *pmu, int idx) 73 + { 74 + if (idx < pmu->nr_arch_gp_counters) 75 + return &pmu->gp_counters[idx]; 76 + 77 + idx -= KVM_FIXED_PMC_BASE_IDX; 78 + if (idx >= 0 && idx < pmu->nr_arch_fixed_counters) 79 + return &pmu->fixed_counters[idx]; 80 + 81 + return NULL; 82 + } 83 + 84 + #define kvm_for_each_pmc(pmu, pmc, i, bitmap) \ 85 + for_each_set_bit(i, bitmap, X86_PMC_IDX_MAX) \ 86 + if (!(pmc = kvm_pmc_idx_to_pmc(pmu, i))) \ 87 + continue; \ 88 + else \ 64 89 65 90 static inline u64 pmc_bitmask(struct kvm_pmc *pmc) 66 91 { ··· 170 131 171 132 if (pmc_is_fixed(pmc)) 172 133 return fixed_ctrl_field(pmu->fixed_ctr_ctrl, 173 - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; 134 + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; 174 135 175 136 return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; 176 137 } 177 138 178 139 extern struct x86_pmu_capability kvm_pmu_cap; 140 + extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; 179 141 180 142 static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) 181 143 { ··· 218 178 pmu_ops->MAX_NR_GP_COUNTERS); 219 179 kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, 220 180 KVM_PMC_MAX_FIXED); 181 + 182 + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = 183 + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); 184 + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = 185 + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 221 186 } 222 187 223 188 static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) ··· 261 216 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); 262 217 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); 263 218 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); 264 - bool 
kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); 219 + int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx); 265 220 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); 266 221 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); 267 222 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); ··· 270 225 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); 271 226 void kvm_pmu_destroy(struct kvm_vcpu *vcpu); 272 227 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); 273 - void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); 228 + void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel); 274 229 275 230 bool is_vmware_backdoor_pmc(u32 pmc_idx); 276 231
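The new kvm_pmc_idx_to_pmc() helper and the kvm_for_each_pmc() iterator above rely on the bitmap layout the comment spells out: general purpose counters at indices 31:0, fixed counters starting at base index 32. A hypothetical, self-contained model of that index math (the slot/helper names are invented for illustration):

```c
#include <assert.h>

#define FIXED_BASE_IDX 32	/* models KVM_FIXED_PMC_BASE_IDX */

/* Map a global PMC bitmap index to (is_fixed, slot); returns -1 for
 * indices that name no counter, mirroring the helper's NULL return. */
static int pmc_idx_to_slot(int idx, int nr_gp, int nr_fixed, int *is_fixed)
{
	if (idx < nr_gp) {
		*is_fixed = 0;
		return idx;
	}
	/* A GP index beyond nr_gp goes negative here and is rejected
	 * by the idx >= 0 check, just like the real helper. */
	idx -= FIXED_BASE_IDX;
	if (idx >= 0 && idx < nr_fixed) {
		*is_fixed = 1;
		return idx;
	}
	return -1;
}
```

With 8 GP and 3 fixed counters, index 5 names GP slot 5, index 33 names fixed slot 1, and indices 10 and 35 map to no counter at all.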
+4 -11
arch/x86/kvm/smm.c
··· 184 184 struct kvm_smram_state_32 *smram) 185 185 { 186 186 struct desc_ptr dt; 187 - unsigned long val; 188 187 int i; 189 188 190 189 smram->cr0 = kvm_read_cr0(vcpu); ··· 194 195 for (i = 0; i < 8; i++) 195 196 smram->gprs[i] = kvm_register_read_raw(vcpu, i); 196 197 197 - kvm_get_dr(vcpu, 6, &val); 198 - smram->dr6 = (u32)val; 199 - kvm_get_dr(vcpu, 7, &val); 200 - smram->dr7 = (u32)val; 198 + smram->dr6 = (u32)vcpu->arch.dr6; 199 + smram->dr7 = (u32)vcpu->arch.dr7; 201 200 202 201 enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); 203 202 enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); ··· 228 231 struct kvm_smram_state_64 *smram) 229 232 { 230 233 struct desc_ptr dt; 231 - unsigned long val; 232 234 int i; 233 235 234 236 for (i = 0; i < 16; i++) ··· 236 240 smram->rip = kvm_rip_read(vcpu); 237 241 smram->rflags = kvm_get_rflags(vcpu); 238 242 239 - 240 - kvm_get_dr(vcpu, 6, &val); 241 - smram->dr6 = val; 242 - kvm_get_dr(vcpu, 7, &val); 243 - smram->dr7 = val; 243 + smram->dr6 = vcpu->arch.dr6; 244 + smram->dr7 = vcpu->arch.dr7; 244 245 245 246 smram->cr0 = kvm_read_cr0(vcpu); 246 247 smram->cr3 = kvm_read_cr3(vcpu);
+8 -14
arch/x86/kvm/svm/pmu.c
··· 25 25 PMU_TYPE_EVNTSEL, 26 26 }; 27 27 28 - static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) 28 + static struct kvm_pmc *amd_pmu_get_pmc(struct kvm_pmu *pmu, int pmc_idx) 29 29 { 30 30 unsigned int num_counters = pmu->nr_arch_gp_counters; 31 31 ··· 70 70 return NULL; 71 71 } 72 72 73 - return amd_pmc_idx_to_pmc(pmu, idx); 73 + return amd_pmu_get_pmc(pmu, idx); 74 74 } 75 75 76 - static bool amd_hw_event_available(struct kvm_pmc *pmc) 77 - { 78 - return true; 79 - } 80 - 81 - static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) 76 + static int amd_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) 82 77 { 83 78 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 84 79 85 - idx &= ~(3u << 30); 80 + if (idx >= pmu->nr_arch_gp_counters) 81 + return -EINVAL; 86 82 87 - return idx < pmu->nr_arch_gp_counters; 83 + return 0; 88 84 } 89 85 90 86 /* idx is the ECX register of RDPMC instruction */ 91 87 static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, 92 88 unsigned int idx, u64 *mask) 93 89 { 94 - return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30)); 90 + return amd_pmu_get_pmc(vcpu_to_pmu(vcpu), idx); 95 91 } 96 92 97 93 static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) ··· 229 233 } 230 234 231 235 struct kvm_pmu_ops amd_pmu_ops __initdata = { 232 - .hw_event_available = amd_hw_event_available, 233 - .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, 234 236 .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, 235 237 .msr_idx_to_pmc = amd_msr_idx_to_pmc, 236 - .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx, 238 + .check_rdpmc_early = amd_check_rdpmc_early, 237 239 .is_valid_msr = amd_is_valid_msr, 238 240 .get_msr = amd_pmu_get_msr, 239 241 .set_msr = amd_pmu_set_msr,
+12 -13
arch/x86/kvm/svm/svm.c
··· 2735 2735 { 2736 2736 struct vcpu_svm *svm = to_svm(vcpu); 2737 2737 int reg, dr; 2738 - unsigned long val; 2739 2738 int err = 0; 2740 2739 2741 2740 /* ··· 2762 2763 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 2763 2764 if (dr >= 16) { /* mov to DRn */ 2764 2765 dr -= 16; 2765 - val = kvm_register_read(vcpu, reg); 2766 - err = kvm_set_dr(vcpu, dr, val); 2766 + err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 2767 2767 } else { 2768 - kvm_get_dr(vcpu, dr, &val); 2769 - kvm_register_write(vcpu, reg, val); 2768 + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 2770 2769 } 2771 2770 2772 2771 return kvm_complete_insn_gp(vcpu, err); ··· 4089 4092 4090 4093 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4091 4094 { 4095 + if (is_guest_mode(vcpu)) 4096 + return EXIT_FASTPATH_NONE; 4097 + 4092 4098 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && 4093 4099 to_svm(vcpu)->vmcb->control.exit_info_1) 4094 4100 return handle_fastpath_set_msr_irqoff(vcpu); ··· 4115 4115 guest_state_exit_irqoff(); 4116 4116 } 4117 4117 4118 - static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) 4118 + static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, 4119 + bool force_immediate_exit) 4119 4120 { 4120 4121 struct vcpu_svm *svm = to_svm(vcpu); 4121 4122 bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); 4122 4123 4123 - trace_kvm_entry(vcpu); 4124 + trace_kvm_entry(vcpu, force_immediate_exit); 4124 4125 4125 4126 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 4126 4127 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; ··· 4140 4139 * is enough to force an immediate vmexit. 
4141 4140 */ 4142 4141 disable_nmi_singlestep(svm); 4143 - smp_send_reschedule(vcpu->cpu); 4142 + force_immediate_exit = true; 4144 4143 } 4144 + 4145 + if (force_immediate_exit) 4146 + smp_send_reschedule(vcpu->cpu); 4145 4147 4146 4148 pre_svm_run(vcpu); 4147 4149 ··· 4240 4236 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4241 4237 4242 4238 svm_complete_interrupts(vcpu); 4243 - 4244 - if (is_guest_mode(vcpu)) 4245 - return EXIT_FASTPATH_NONE; 4246 4239 4247 4240 return svm_exit_handlers_fastpath(vcpu); 4248 4241 } ··· 5007 5006 5008 5007 .check_intercept = svm_check_intercept, 5009 5008 .handle_exit_irqoff = svm_handle_exit_irqoff, 5010 - 5011 - .request_immediate_exit = __kvm_request_immediate_exit, 5012 5009 5013 5010 .sched_in = svm_sched_in, 5014 5011
+6 -3
arch/x86/kvm/trace.h
··· 15 15 * Tracepoint for guest mode entry. 16 16 */ 17 17 TRACE_EVENT(kvm_entry, 18 - TP_PROTO(struct kvm_vcpu *vcpu), 19 - TP_ARGS(vcpu), 18 + TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit), 19 + TP_ARGS(vcpu, force_immediate_exit), 20 20 21 21 TP_STRUCT__entry( 22 22 __field( unsigned int, vcpu_id ) 23 23 __field( unsigned long, rip ) 24 + __field( bool, immediate_exit ) 24 25 ), 25 26 26 27 TP_fast_assign( 27 28 __entry->vcpu_id = vcpu->vcpu_id; 28 29 __entry->rip = kvm_rip_read(vcpu); 30 + __entry->immediate_exit = force_immediate_exit; 29 31 ), 30 32 31 - TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) 33 + TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, 34 + __entry->immediate_exit ? "[immediate exit]" : "") 32 35 ); 33 36 34 37 /*
+2 -2
arch/x86/kvm/vmx/nested.c
··· 3606 3606 return 1; 3607 3607 } 3608 3608 3609 - kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 3609 + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3610 3610 3611 3611 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3612 3612 return nested_vmx_failInvalid(vcpu); ··· 4433 4433 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4434 4434 4435 4435 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4436 - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4436 + vmcs12->guest_dr7 = vcpu->arch.dr7; 4437 4437 4438 4438 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4439 4439 vmcs12->guest_ia32_efer = vcpu->arch.efer;
+86 -136
arch/x86/kvm/vmx/pmu_intel.c
··· 20 20 #include "nested.h" 21 21 #include "pmu.h" 22 22 23 + /* 24 + * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX 25 + * to encode the "type" of counter to read, i.e. this is not a "base". And to 26 + * further confuse things, non-architectural PMUs use bit 31 as a flag for 27 + * "fast" reads, whereas the "type" is an explicit value. 28 + */ 29 + #define INTEL_RDPMC_GP 0 30 + #define INTEL_RDPMC_FIXED INTEL_PMC_FIXED_RDPMC_BASE 31 + 32 + #define INTEL_RDPMC_TYPE_MASK GENMASK(31, 16) 33 + #define INTEL_RDPMC_INDEX_MASK GENMASK(15, 0) 34 + 23 35 #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) 24 - 25 - enum intel_pmu_architectural_events { 26 - /* 27 - * The order of the architectural events matters as support for each 28 - * event is enumerated via CPUID using the index of the event. 29 - */ 30 - INTEL_ARCH_CPU_CYCLES, 31 - INTEL_ARCH_INSTRUCTIONS_RETIRED, 32 - INTEL_ARCH_REFERENCE_CYCLES, 33 - INTEL_ARCH_LLC_REFERENCES, 34 - INTEL_ARCH_LLC_MISSES, 35 - INTEL_ARCH_BRANCHES_RETIRED, 36 - INTEL_ARCH_BRANCHES_MISPREDICTED, 37 - 38 - NR_REAL_INTEL_ARCH_EVENTS, 39 - 40 - /* 41 - * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a. 42 - * TSC reference cycles. The architectural reference cycles event may 43 - * or may not actually use the TSC as the reference, e.g. might use the 44 - * core crystal clock or the bus clock (yeah, "architectural"). 
45 - */ 46 - PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS, 47 - NR_INTEL_ARCH_EVENTS, 48 - }; 49 - 50 - static struct { 51 - u8 eventsel; 52 - u8 unit_mask; 53 - } const intel_arch_events[] = { 54 - [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 }, 55 - [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 }, 56 - [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 }, 57 - [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f }, 58 - [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 }, 59 - [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 }, 60 - [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 }, 61 - [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 }, 62 - }; 63 - 64 - /* mapping between fixed pmc index and intel_arch_events array */ 65 - static int fixed_pmc_events[] = { 66 - [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED, 67 - [1] = INTEL_ARCH_CPU_CYCLES, 68 - [2] = PSEUDO_ARCH_REFERENCE_CYCLES, 69 - }; 70 36 71 37 static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) 72 38 { ··· 50 84 51 85 pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); 52 86 53 - __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); 87 + __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use); 54 88 kvm_pmu_request_counter_reprogram(pmc); 55 89 } 56 - } 57 - 58 - static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) 59 - { 60 - if (pmc_idx < INTEL_PMC_IDX_FIXED) { 61 - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, 62 - MSR_P6_EVNTSEL0); 63 - } else { 64 - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; 65 - 66 - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); 67 - } 68 - } 69 - 70 - static bool intel_hw_event_available(struct kvm_pmc *pmc) 71 - { 72 - struct kvm_pmu *pmu = pmc_to_pmu(pmc); 73 - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; 74 - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 75 - int i; 76 - 77 - BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS); 78 - 79 - /* 80 - * Disallow events reported as unavailable in guest 
CPUID. Note, this 81 * doesn't apply to pseudo-architectural events. 82 */ 83 - for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) { 84 - if (intel_arch_events[i].eventsel != event_select || 85 - intel_arch_events[i].unit_mask != unit_mask) 86 - continue; 87 - 88 - return pmu->available_event_types & BIT(i); 89 - } 90 - 91 - return true; 92 - } 93 - 94 - static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) 95 - { 96 - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 97 - bool fixed = idx & (1u << 30); 98 - 99 - idx &= ~(3u << 30); 100 - 101 - return fixed ? idx < pmu->nr_arch_fixed_counters 102 - : idx < pmu->nr_arch_gp_counters; 103 90 } 104 91 105 92 static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, 106 93 unsigned int idx, u64 *mask) 107 94 { 95 + unsigned int type = idx & INTEL_RDPMC_TYPE_MASK; 108 96 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 109 - bool fixed = idx & (1u << 30); 110 97 struct kvm_pmc *counters; 111 98 unsigned int num_counters; 99 + u64 bitmask; 112 100 113 - idx &= ~(3u << 30); 114 - if (fixed) { 101 + /* 102 + * The encoding of ECX for RDPMC is different for architectural versus 103 + * non-architectural PMUs (PMUs with version '0'). For architectural 104 + * PMUs, bits 31:16 specify the PMC type and bits 15:0 specify the PMC 105 + * index. For non-architectural PMUs, bit 31 is a "fast" flag, and 106 + * bits 30:0 specify the PMC index. 107 + * 108 + * Yell and reject attempts to read PMCs for a non-architectural PMU, 109 + * as KVM doesn't support such PMUs. 110 + */ 111 + if (WARN_ON_ONCE(!pmu->version)) 112 + return NULL; 113 + 114 + /* 115 + * General Purpose (GP) PMCs are supported on all PMUs, and fixed PMCs 116 + * are supported on all architectural PMUs, i.e. on all virtual PMUs 117 + * supported by KVM. Note, KVM only emulates fixed PMCs for PMU v2+, 118 + * but the type itself is still valid, i.e. let RDPMC fail due to 119 + * accessing a non-existent counter. 
Reject attempts to read all other 120 + * types, which are unknown/unsupported. 121 + */ 122 + switch (type) { 123 + case INTEL_RDPMC_FIXED: 115 124 counters = pmu->fixed_counters; 116 125 num_counters = pmu->nr_arch_fixed_counters; 117 - } else { 126 + bitmask = pmu->counter_bitmask[KVM_PMC_FIXED]; 127 + break; 128 + case INTEL_RDPMC_GP: 118 129 counters = pmu->gp_counters; 119 130 num_counters = pmu->nr_arch_gp_counters; 131 + bitmask = pmu->counter_bitmask[KVM_PMC_GP]; 132 + break; 133 + default: 134 + return NULL; 120 135 } 136 + 137 + idx &= INTEL_RDPMC_INDEX_MASK; 121 138 if (idx >= num_counters) 122 139 return NULL; 123 - *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP]; 140 + 141 + *mask &= bitmask; 124 142 return &counters[array_index_nospec(idx, num_counters)]; 125 143 } 126 144 ··· 414 464 return 0; 415 465 } 416 466 417 - static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) 467 + /* 468 + * Map fixed counter events to architectural general purpose event encodings. 469 + * Perf doesn't provide APIs to allow KVM to directly program a fixed counter, 470 + * and so KVM instead programs the architectural event to effectively request 471 + * the fixed counter. Perf isn't guaranteed to use a fixed counter and may 472 + * instead program the encoding into a general purpose counter, e.g. if a 473 + * different perf_event is already utilizing the requested counter, but the end 474 + * result is the same (ignoring the fact that using a general purpose counter 475 + * will likely exacerbate counter contention). 476 + * 477 + * Forcibly inlined to allow asserting on @index at build time, and there should 478 + * never be more than one user. 
479 + */ 480 + static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) 418 481 { 419 - int i; 482 + const enum perf_hw_id fixed_pmc_perf_ids[] = { 483 + [0] = PERF_COUNT_HW_INSTRUCTIONS, 484 + [1] = PERF_COUNT_HW_CPU_CYCLES, 485 + [2] = PERF_COUNT_HW_REF_CPU_CYCLES, 486 + }; 487 + u64 eventsel; 420 488 421 - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED); 489 + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED); 490 + BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED); 422 491 423 - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { 424 - int index = array_index_nospec(i, KVM_PMC_MAX_FIXED); 425 - struct kvm_pmc *pmc = &pmu->fixed_counters[index]; 426 - u32 event = fixed_pmc_events[index]; 427 - 428 - pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | 429 - intel_arch_events[event].eventsel; 430 - } 492 + /* 493 + * Yell if perf reports support for a fixed counter but perf doesn't 494 + * have a known encoding for the associated general purpose event. 
495 + */ 496 + eventsel = perf_get_hw_event_config(fixed_pmc_perf_ids[index]); 497 + WARN_ON_ONCE(!eventsel && index < kvm_pmu_cap.num_counters_fixed); 498 + return eventsel; 431 499 } 432 500 433 501 static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ··· 459 491 u64 counter_mask; 460 492 int i; 461 493 462 - pmu->nr_arch_gp_counters = 0; 463 - pmu->nr_arch_fixed_counters = 0; 464 - pmu->counter_bitmask[KVM_PMC_GP] = 0; 465 - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; 466 - pmu->version = 0; 467 - pmu->reserved_bits = 0xffffffff00200000ull; 468 - pmu->raw_event_mask = X86_RAW_EVENT_MASK; 469 - pmu->global_ctrl_mask = ~0ull; 470 - pmu->global_status_mask = ~0ull; 471 - pmu->fixed_ctr_ctrl_mask = ~0ull; 472 - pmu->pebs_enable_mask = ~0ull; 473 - pmu->pebs_data_cfg_mask = ~0ull; 474 - 475 494 memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); 476 495 477 496 /* ··· 470 515 return; 471 516 472 517 entry = kvm_find_cpuid_entry(vcpu, 0xa); 473 - if (!entry || !vcpu->kvm->arch.enable_pmu) 518 + if (!entry) 474 519 return; 520 + 475 521 eax.full = entry->eax; 476 522 edx.full = entry->edx; 477 523 ··· 499 543 kvm_pmu_cap.bit_width_fixed); 500 544 pmu->counter_bitmask[KVM_PMC_FIXED] = 501 545 ((u64)1 << edx.split.bit_width_fixed) - 1; 502 - setup_fixed_pmc_eventsel(pmu); 503 546 } 504 547 505 548 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 506 549 pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); 507 550 counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | 508 - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); 551 + (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); 509 552 pmu->global_ctrl_mask = counter_mask; 510 553 511 554 /* ··· 548 593 pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; 549 594 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { 550 595 pmu->fixed_ctr_ctrl_mask &= 551 - ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); 596 + ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); 552 597 } 553 598 
pmu->pebs_data_cfg_mask = ~0xff00000full; 554 599 } else { ··· 574 619 for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { 575 620 pmu->fixed_counters[i].type = KVM_PMC_FIXED; 576 621 pmu->fixed_counters[i].vcpu = vcpu; 577 - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; 622 + pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; 578 623 pmu->fixed_counters[i].current_config = 0; 624 + pmu->fixed_counters[i].eventsel = intel_get_fixed_pmc_eventsel(i); 579 625 } 580 626 581 627 lbr_desc->records.nr = 0; ··· 704 748 struct kvm_pmc *pmc = NULL; 705 749 int bit, hw_idx; 706 750 707 - for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, 708 - X86_PMC_IDX_MAX) { 709 - pmc = intel_pmc_idx_to_pmc(pmu, bit); 710 - 711 - if (!pmc || !pmc_speculative_in_use(pmc) || 751 + kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { 752 + if (!pmc_speculative_in_use(pmc) || 712 753 !pmc_is_globally_enabled(pmc) || !pmc->perf_event) 713 754 continue; 714 755 ··· 720 767 } 721 768 722 769 struct kvm_pmu_ops intel_pmu_ops __initdata = { 723 - .hw_event_available = intel_hw_event_available, 724 - .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, 725 770 .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, 726 771 .msr_idx_to_pmc = intel_msr_idx_to_pmc, 727 - .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx, 728 772 .is_valid_msr = intel_is_valid_msr, 729 773 .get_msr = intel_pmu_get_msr, 730 774 .set_msr = intel_pmu_set_msr,
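The reworked intel_rdpmc_ecx_to_pmc() above splits an architectural RDPMC ECX value into a type (bits 31:16) and an index (bits 15:0), where type 0 selects a GP counter and bit 30 (INTEL_PMC_FIXED_RDPMC_BASE) selects a fixed counter. A hedged, self-contained sketch of that decode (the function and constant names are illustrative, not KVM's):

```c
#include <assert.h>
#include <stdint.h>

#define RDPMC_TYPE_MASK   0xFFFF0000u	/* bits 31:16 */
#define RDPMC_INDEX_MASK  0x0000FFFFu	/* bits 15:0 */
#define RDPMC_TYPE_GP     0x00000000u
#define RDPMC_TYPE_FIXED  0x40000000u	/* bit 30, a la INTEL_PMC_FIXED_RDPMC_BASE */

/* Decode an architectural RDPMC ECX value; returns 0 on success with
 * *is_fixed/*index filled in, -1 for unknown types or bad indices. */
static int rdpmc_decode(uint32_t ecx, unsigned int nr_gp,
			unsigned int nr_fixed,
			int *is_fixed, unsigned int *index)
{
	unsigned int idx = ecx & RDPMC_INDEX_MASK;

	switch (ecx & RDPMC_TYPE_MASK) {
	case RDPMC_TYPE_GP:
		if (idx >= nr_gp)
			return -1;
		*is_fixed = 0;
		break;
	case RDPMC_TYPE_FIXED:
		if (idx >= nr_fixed)
			return -1;
		*is_fixed = 1;
		break;
	default:
		return -1;	/* unknown/unsupported counter type */
	}
	*index = idx;
	return 0;
}
```

Note the design choice the patch's comment calls out: an out-of-range index within a known type fails like a non-existent counter, while an unknown type is rejected outright.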
+84 -73
arch/x86/kvm/vmx/vmx.c
··· 50 50 #include <asm/spec-ctrl.h> 51 51 #include <asm/vmx.h> 52 52 53 + #include <trace/events/ipi.h> 54 + 53 55 #include "capabilities.h" 54 56 #include "cpuid.h" 55 57 #include "hyperv.h" ··· 162 160 163 161 /* 164 162 * List of MSRs that can be directly passed to the guest. 165 - * In addition to these x2apic and PT MSRs are handled specially. 163 + * In addition to these x2apic, PT and LBR MSRs are handled specially. 166 164 */ 167 165 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { 168 166 MSR_IA32_SPEC_CTRL, ··· 670 668 return flexpriority_enabled && lapic_in_kernel(vcpu); 671 669 } 672 670 673 - static int possible_passthrough_msr_slot(u32 msr) 671 + static int vmx_get_passthrough_msr_slot(u32 msr) 674 672 { 675 - u32 i; 676 - 677 - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) 678 - if (vmx_possible_passthrough_msrs[i] == msr) 679 - return i; 680 - 681 - return -ENOENT; 682 - } 683 - 684 - static bool is_valid_passthrough_msr(u32 msr) 685 - { 686 - bool r; 673 + int i; 687 674 688 675 switch (msr) { 689 676 case 0x800 ... 0x8ff: 690 677 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ 691 - return true; 678 + return -ENOENT; 692 679 case MSR_IA32_RTIT_STATUS: 693 680 case MSR_IA32_RTIT_OUTPUT_BASE: 694 681 case MSR_IA32_RTIT_OUTPUT_MASK: ··· 692 701 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 693 702 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 694 703 /* LBR MSRs. 
These are handled in vmx_update_intercept_for_lbr_msrs() */ 695 - return true; 704 + return -ENOENT; 696 705 } 697 706 698 - r = possible_passthrough_msr_slot(msr) != -ENOENT; 707 + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 708 + if (vmx_possible_passthrough_msrs[i] == msr) 709 + return i; 710 + } 699 711 700 - WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 701 - 702 - return r; 712 + WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 713 + return -ENOENT; 703 714 } 704 715 705 716 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) ··· 1283 1290 unsigned long fs_base, gs_base; 1284 1291 u16 fs_sel, gs_sel; 1285 1292 int i; 1286 - 1287 - vmx->req_immediate_exit = false; 1288 1293 1289 1294 /* 1290 1295 * Note that guest MSRs to be saved/restored can also be changed ··· 3955 3964 { 3956 3965 struct vcpu_vmx *vmx = to_vmx(vcpu); 3957 3966 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3967 + int idx; 3958 3968 3959 3969 if (!cpu_has_vmx_msr_bitmap()) 3960 3970 return; ··· 3965 3973 /* 3966 3974 * Mark the desired intercept state in shadow bitmap, this is needed 3967 3975 * for resync when the MSR filters change. 
3968 - */ 3969 - if (is_valid_passthrough_msr(msr)) { 3970 - int idx = possible_passthrough_msr_slot(msr); 3971 - 3972 - if (idx != -ENOENT) { 3973 - if (type & MSR_TYPE_R) 3974 - clear_bit(idx, vmx->shadow_msr_intercept.read); 3975 - if (type & MSR_TYPE_W) 3976 - clear_bit(idx, vmx->shadow_msr_intercept.write); 3977 - } 3976 + */ 3977 + idx = vmx_get_passthrough_msr_slot(msr); 3978 + if (idx >= 0) { 3979 + if (type & MSR_TYPE_R) 3980 + clear_bit(idx, vmx->shadow_msr_intercept.read); 3981 + if (type & MSR_TYPE_W) 3982 + clear_bit(idx, vmx->shadow_msr_intercept.write); 3978 3983 } 3979 3984 3980 3985 if ((type & MSR_TYPE_R) && ··· 3997 4008 { 3998 4009 struct vcpu_vmx *vmx = to_vmx(vcpu); 3999 4010 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4011 + int idx; 4000 4012 4001 4013 if (!cpu_has_vmx_msr_bitmap()) 4002 4014 return; ··· 4007 4017 /* 4008 4018 * Mark the desired intercept state in shadow bitmap, this is needed 4009 4019 * for resync when the MSR filter changes. 4010 - */ 4011 - if (is_valid_passthrough_msr(msr)) { 4012 - int idx = possible_passthrough_msr_slot(msr); 4013 - 4014 - if (idx != -ENOENT) { 4015 - if (type & MSR_TYPE_R) 4016 - set_bit(idx, vmx->shadow_msr_intercept.read); 4017 - if (type & MSR_TYPE_W) 4018 - set_bit(idx, vmx->shadow_msr_intercept.write); 4019 - } 4020 + */ 4021 + idx = vmx_get_passthrough_msr_slot(msr); 4022 + if (idx >= 0) { 4023 + if (type & MSR_TYPE_R) 4024 + set_bit(idx, vmx->shadow_msr_intercept.read); 4025 + if (type & MSR_TYPE_W) 4026 + set_bit(idx, vmx->shadow_msr_intercept.write); 4020 4027 } 4021 4028 4022 4029 if (type & MSR_TYPE_R) ··· 4123 4136 { 4124 4137 struct vcpu_vmx *vmx = to_vmx(vcpu); 4125 4138 u32 i; 4139 + 4140 + if (!cpu_has_vmx_msr_bitmap()) 4141 + return; 4126 4142 4127 4143 /* 4128 4144 * Redo intercept permissions for MSRs that KVM is passing through to ··· 5566 5576 5567 5577 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5568 5578 if (exit_qualification & TYPE_MOV_FROM_DR) { 5569 - unsigned 
long val; 5570 - 5571 - kvm_get_dr(vcpu, dr, &val); 5572 - kvm_register_write(vcpu, reg, val); 5579 + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5573 5580 err = 0; 5574 5581 } else { 5575 5582 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); ··· 5988 6001 return 1; 5989 6002 } 5990 6003 5991 - static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) 6004 + static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 6005 + bool force_immediate_exit) 5992 6006 { 5993 6007 struct vcpu_vmx *vmx = to_vmx(vcpu); 5994 6008 5995 - if (!vmx->req_immediate_exit && 5996 - !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { 5997 - kvm_lapic_expired_hv_timer(vcpu); 6009 + /* 6010 + * In the *extremely* unlikely scenario that this is a spurious VM-Exit 6011 + * due to the timer expiring while it was "soft" disabled, just eat the 6012 + * exit and re-enter the guest. 6013 + */ 6014 + if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 5998 6015 return EXIT_FASTPATH_REENTER_GUEST; 5999 - } 6000 6016 6001 - return EXIT_FASTPATH_NONE; 6017 + /* 6018 + * If the timer expired because KVM used it to force an immediate exit, 6019 + * then mission accomplished. 6020 + */ 6021 + if (force_immediate_exit) 6022 + return EXIT_FASTPATH_EXIT_HANDLED; 6023 + 6024 + /* 6025 + * If L2 is active, go down the slow path as emulating the guest timer 6026 + * expiration likely requires synthesizing a nested VM-Exit. 6027 + */ 6028 + if (is_guest_mode(vcpu)) 6029 + return EXIT_FASTPATH_NONE; 6030 + 6031 + kvm_lapic_expired_hv_timer(vcpu); 6032 + return EXIT_FASTPATH_REENTER_GUEST; 6002 6033 } 6003 6034 6004 6035 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6005 6036 { 6006 - handle_fastpath_preemption_timer(vcpu); 6037 + /* 6038 + * This non-fastpath handler is reached if and only if the preemption 6039 + * timer was being used to emulate a guest timer while L2 is active. 
6040 + * All other scenarios are supposed to be handled in the fastpath. 6041 + */ 6042 + WARN_ON_ONCE(!is_guest_mode(vcpu)); 6043 + kvm_lapic_expired_hv_timer(vcpu); 6007 6044 return 1; 6008 6045 } 6009 6046 ··· 6530 6519 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 6531 6520 vcpu->run->internal.data[0] = vectoring_info; 6532 6521 vcpu->run->internal.data[1] = exit_reason.full; 6533 - vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; 6522 + vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu); 6534 6523 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { 6535 6524 vcpu->run->internal.data[ndata++] = 6536 6525 vmcs_read64(GUEST_PHYSICAL_ADDRESS); ··· 7169 7158 msrs[i].host, false); 7170 7159 } 7171 7160 7172 - static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) 7161 + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7173 7162 { 7174 7163 struct vcpu_vmx *vmx = to_vmx(vcpu); 7175 7164 u64 tscl; 7176 7165 u32 delta_tsc; 7177 7166 7178 - if (vmx->req_immediate_exit) { 7167 + if (force_immediate_exit) { 7179 7168 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7180 7169 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7181 7170 } else if (vmx->hv_deadline_tsc != -1) { ··· 7228 7217 barrier_nospec(); 7229 7218 } 7230 7219 7231 - static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 7220 + static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, 7221 + bool force_immediate_exit) 7232 7222 { 7223 + /* 7224 + * If L2 is active, some VMX preemption timer exits can still be handled 7225 + * in the fastpath; all other exits must use the slow path. 
+	 */
+	if (is_guest_mode(vcpu) &&
+	    to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+		return EXIT_FASTPATH_NONE;
+
 	switch (to_vmx(vcpu)->exit_reason.basic) {
 	case EXIT_REASON_MSR_WRITE:
 		return handle_fastpath_set_msr_irqoff(vcpu);
 	case EXIT_REASON_PREEMPTION_TIMER:
-		return handle_fastpath_preemption_timer(vcpu);
+		return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
 	default:
 		return EXIT_FASTPATH_NONE;
 	}
···
 	guest_state_exit_irqoff();
 }
 
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
···
 		return EXIT_FASTPATH_NONE;
 	}
 
-	trace_kvm_entry(vcpu);
+	trace_kvm_entry(vcpu, force_immediate_exit);
 
 	if (vmx->ple_window_dirty) {
 		vmx->ple_window_dirty = false;
···
 	vmx_passthrough_lbr_msrs(vcpu);
 
 	if (enable_preemption_timer)
-		vmx_update_hv_timer(vcpu);
+		vmx_update_hv_timer(vcpu, force_immediate_exit);
+	else if (force_immediate_exit)
+		smp_send_reschedule(vcpu->cpu);
 
 	kvm_wait_lapic_expire(vcpu);
···
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 
-	if (is_guest_mode(vcpu))
-		return EXIT_FASTPATH_NONE;
-
-	return vmx_exit_handlers_fastpath(vcpu);
+	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
 }
 
 static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
···
 	kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
 }
 
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
-	to_vmx(vcpu)->req_immediate_exit = true;
-}
-
 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
 				  struct x86_instruction_info *info)
 {
···
 	.check_intercept = vmx_check_intercept,
 	.handle_exit_irqoff = vmx_handle_exit_irqoff,
 
-	.request_immediate_exit = vmx_request_immediate_exit,
-
 	.sched_in = vmx_sched_in,
 
 	.cpu_dirty_log_size = PML_ENTITY_NUM,
···
 	if (!enable_preemption_timer) {
 		vmx_x86_ops.set_hv_timer = NULL;
 		vmx_x86_ops.cancel_hv_timer = NULL;
-		vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
 	}
 
 	kvm_caps.supported_mce_cap |= MCG_LMCE_P;
-2
arch/x86/kvm/vmx/vmx.h
···
 	unsigned int ple_window;
 	bool ple_window_dirty;
 
-	bool req_immediate_exit;
-
 	/* Support for PML */
 #define PML_ENTITY_NUM 512
 	struct page *pml_pg;
+114 -114
arch/x86/kvm/x86.c
···
 }
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
 {
 	size_t size = ARRAY_SIZE(vcpu->arch.db);
 
 	switch (dr) {
 	case 0 ... 3:
-		*val = vcpu->arch.db[array_index_nospec(dr, size)];
-		break;
+		return vcpu->arch.db[array_index_nospec(dr, size)];
 	case 4:
 	case 6:
-		*val = vcpu->arch.dr6;
-		break;
+		return vcpu->arch.dr6;
 	case 5:
 	default: /* 7 */
-		*val = vcpu->arch.dr7;
-		break;
+		return vcpu->arch.dr7;
 	}
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
···
 	return v * clock->mult;
 }
 
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
 {
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 	unsigned long seq;
···
 		ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
 		ns >>= gtod->raw_clock.shift;
 		ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	*t = ns;
+
+	return mode;
+}
+
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+	unsigned long seq;
+	int mode;
+	u64 ns;
+
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		ns = gtod->clock.base_cycles;
+		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+		ns >>= gtod->clock.shift;
+		ns += ktime_to_ns(gtod->clock.offset);
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	*t = ns;
···
 	return mode;
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it did so.  Returns true if host is
+ * using TSC based clocksource.
+ */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 {
 	/* checked again under seqlock below */
 	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
 		return false;
 
-	return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
-						     tsc_timestamp));
+	return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+						     tsc_timestamp));
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so.  Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+	/* checked again under seqlock below */
+	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+		return false;
+
+	return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+						 tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so.  Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration.  You want CLOCK_TAI
+ * for that.
+ */
 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
 					   u64 *tsc_timestamp)
 {
···
 
 	guest_hv_clock->version = ++vcpu->hv_clock.version;
 
-	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+	kvm_gpc_mark_dirty_in_slot(gpc);
 	read_unlock_irqrestore(&gpc->lock, flags);
 
 	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
···
 		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
 		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
 		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
-		    KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+		    KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+		    KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
 		if (sched_info_on())
 			r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
 			     KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
···
 	int idx;
 
 	if (vcpu->preempted) {
-		if (!vcpu->arch.guest_state_protected)
-			vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+		vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu);
 
 		/*
 		 * Take the srcu lock as memslots will be accessed to check the gfn
···
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 					     struct kvm_debugregs *dbgregs)
 {
-	unsigned long val;
+	unsigned int i;
 
 	memset(dbgregs, 0, sizeof(*dbgregs));
-	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
-	kvm_get_dr(vcpu, 6, &val);
-	dbgregs->dr6 = val;
+
+	BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+		dbgregs->db[i] = vcpu->arch.db[i];
+
+	dbgregs->dr6 = vcpu->arch.dr6;
 	dbgregs->dr7 = vcpu->arch.dr7;
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 					    struct kvm_debugregs *dbgregs)
 {
+	unsigned int i;
+
 	if (dbgregs->flags)
 		return -EINVAL;
 
···
 	if (!kvm_dr7_valid(dbgregs->dr7))
 		return -EINVAL;
 
-	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+		vcpu->arch.db[i] = dbgregs->db[i];
+
 	kvm_update_dr0123(vcpu);
 	vcpu->arch.dr6 = dbgregs->dr6;
 	vcpu->arch.dr7 = dbgregs->dr7;
···
 	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
-static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-			    unsigned long *dest)
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
 {
-	kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+	return kvm_get_dr(emul_to_vcpu(ctxt), dr);
 }
 
 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
···
 	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
 }
 
-static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
-			      u32 pmc)
+static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
 {
-	if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
-		return 0;
-	return -EINVAL;
+	return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
···
 	.set_msr_with_filter = emulator_set_msr_with_filter,
 	.get_msr_with_filter = emulator_get_msr_with_filter,
 	.get_msr = emulator_get_msr,
-	.check_pmc = emulator_check_pmc,
+	.check_rdpmc_early = emulator_check_rdpmc_early,
 	.read_pmc = emulator_read_pmc,
 	.halt = emulator_halt,
 	.wbinvd = emulator_wbinvd,
···
 
 	kvm_release_pfn_clean(pfn);
 
-	/* The instructions are well-emulated on direct mmu. */
-	if (vcpu->arch.mmu->root_role.direct) {
-		unsigned int indirect_shadow_pages;
-
-		write_lock(&vcpu->kvm->mmu_lock);
-		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
-		write_unlock(&vcpu->kvm->mmu_lock);
-
-		if (indirect_shadow_pages)
-			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
-		return true;
-	}
-
 	/*
-	 * if emulation was due to access to shadowed page table
-	 * and it failed try to unshadow page and re-enter the
-	 * guest to let CPU execute the instruction.
+	 * If emulation may have been triggered by a write to a shadowed page
+	 * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+	 * guest to let the CPU re-execute the instruction in the hope that the
+	 * CPU can cleanly execute the instruction that KVM failed to emulate.
 	 */
-	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+	if (vcpu->kvm->arch.indirect_shadow_pages)
+		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
 	/*
-	 * If the access faults on its page table, it can not
-	 * be fixed by unprotecting shadow page and it should
-	 * be reported to userspace.
+	 * If the failed instruction faulted on an access to page tables that
+	 * are used to translate any part of the instruction, KVM can't resolve
+	 * the issue by unprotecting the gfn, as zapping the shadow page will
+	 * result in the instruction taking a !PRESENT page fault and thus put
+	 * the vCPU into an infinite loop of page faults.  E.g. KVM will create
+	 * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+	 * then zap the SPTE to unprotect the gfn, and then do it all over
+	 * again.  Report the error to userspace.
 	 */
 	return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 }
···
 	if (unlikely(!r))
 		return 0;
 
-	kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+	kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
 
 	/*
 	 * rflags is the old, "raw" value of the flags.  The new value has
···
 	 */
 	if (!ctxt->have_exception ||
 	    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
-		kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+		kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
 		if (ctxt->is_branch)
-			kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+			kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
 		kvm_rip_write(vcpu, ctxt->eip);
 		if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
 			r = kvm_vcpu_do_singlestep(vcpu);
···
 	*(int *)ret = kvm_x86_check_processor_compatibility();
 }
 
-static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
 	u64 host_pat;
 	int r, cpu;
+
+	guard(mutex)(&vendor_module_lock);
 
 	if (kvm_x86_ops.hardware_enable) {
 		pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
···
 	free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
 	kmem_cache_destroy(x86_emulator_cache);
-	return r;
-}
-
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
-{
-	int r;
-
-	mutex_lock(&vendor_module_lock);
-	r = __kvm_x86_vendor_init(ops);
-	mutex_unlock(&vendor_module_lock);
-
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
···
 	static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
 }
 
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
-	smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
 /*
  * Called within kvm->srcu read side.
  * Returns 1 to let vcpu_run() continue the guest execution loop without
···
 		goto cancel_injection;
 	}
 
-	if (req_immediate_exit) {
+	if (req_immediate_exit)
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		static_call(kvm_x86_request_immediate_exit)(vcpu);
-	}
 
 	fpregs_assert_state_consistent();
 	if (test_thread_flag(TIF_NEED_FPU_LOAD))
···
 	WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
 		     (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
 
-	exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+	exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
 	if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
 		break;
···
 	vcpu->arch.regs_avail = ~0;
 	vcpu->arch.regs_dirty = ~0;
 
-	kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+	kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
 
 	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
···
 	if (r < 0)
 		return r;
 
-	if (irqchip_in_kernel(vcpu->kvm)) {
-		r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
-		if (r < 0)
-			goto fail_mmu_destroy;
-
-		/*
-		 * Defer evaluating inhibits until the vCPU is first run, as
-		 * this vCPU will not get notified of any changes until this
-		 * vCPU is visible to other vCPUs (marked online and added to
-		 * the set of vCPUs).  Opportunistically mark APICv active as
-		 * VMX in particularly is highly unlikely to have inhibits.
-		 * Ignore the current per-VM APICv state so that vCPU creation
-		 * is guaranteed to run with a deterministic value, the request
-		 * will ensure the vCPU gets the correct state before VM-Entry.
-		 */
-		if (enable_apicv) {
-			vcpu->arch.apic->apicv_active = true;
-			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
-		}
-	} else
-		static_branch_inc(&kvm_has_noapic_vcpu);
+	r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+	if (r < 0)
+		goto fail_mmu_destroy;
 
 	r = -ENOMEM;
···
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
 	kvfree(vcpu->arch.cpuid_entries);
-	if (!lapic_in_kernel(vcpu))
-		static_branch_dec(&kvm_has_noapic_vcpu);
 }
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
···
 {
 	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 }
-
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
···
 
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-	if (kvm_vcpu_apicv_active(vcpu) &&
-	    static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
-		return true;
-
-	return false;
+	return kvm_vcpu_apicv_active(vcpu) &&
+	       static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+}
+
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.preempted_in_kernel;
 }
 
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
···
 {
 	if (vcpu->arch.guest_state_protected)
 		return true;
-
-	if (vcpu != kvm_get_running_vcpu())
-		return vcpu->arch.preempted_in_kernel;
 
 	return static_call(kvm_x86_get_cpl)(vcpu) == 0;
 }
···
 
 static void __exit kvm_x86_exit(void)
 {
-	/*
-	 * If module_init() is implemented, module_exit() must also be
-	 * implemented to allow module unload.
-	 */
+	WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
 }
 module_exit(kvm_x86_exit);
+1 -6
arch/x86/kvm/x86.h
···
 
 u64 get_kvmclock_ns(struct kvm *kvm);
 uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
 	gva_t addr, void *val, unsigned int bytes,
···
 {
 	return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
 }
-
-enum kvm_intr_type {
-	/* Values are arbitrary, but must be non-zero. */
-	KVM_HANDLING_IRQ = 1,
-	KVM_HANDLING_NMI,
-};
 
 static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
 						 enum kvm_intr_type intr)
+219 -96
arch/x86/kvm/xen.c
···
 #include "x86.h"
 #include "xen.h"
 #include "hyperv.h"
-#include "lapic.h"
+#include "irq.h"
 
 #include <linux/eventfd.h>
 #include <linux/kvm_host.h>
···
 #include <xen/interface/sched.h>
 
 #include <asm/xen/cpuid.h>
+#include <asm/pvclock.h>
 
 #include "cpuid.h"
 #include "trace.h"
···
 
 DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
 
-static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+static int kvm_xen_shared_info_init(struct kvm *kvm)
 {
 	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
 	struct pvclock_wall_clock *wc;
-	gpa_t gpa = gfn_to_gpa(gfn);
 	u32 *wc_sec_hi;
 	u32 wc_version;
 	u64 wall_nsec;
 	int ret = 0;
 	int idx = srcu_read_lock(&kvm->srcu);
 
-	if (gfn == KVM_XEN_INVALID_GFN) {
-		kvm_gpc_deactivate(gpc);
-		goto out;
-	}
+	read_lock_irq(&gpc->lock);
+	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+		read_unlock_irq(&gpc->lock);
 
-	do {
-		ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+		ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
 		if (ret)
 			goto out;
 
-		/*
-		 * This code mirrors kvm_write_wall_clock() except that it writes
-		 * directly through the pfn cache and doesn't mark the page dirty.
-		 */
-		wall_nsec = kvm_get_wall_clock_epoch(kvm);
-
-		/* It could be invalid again already, so we need to check */
 		read_lock_irq(&gpc->lock);
+	}
 
-		if (gpc->valid)
-			break;
-
-		read_unlock_irq(&gpc->lock);
-	} while (1);
+	/*
+	 * This code mirrors kvm_write_wall_clock() except that it writes
+	 * directly through the pfn cache and doesn't mark the page dirty.
+	 */
+	wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
 	/* Paranoia checks on the 32-bit struct layout */
 	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
···
 	return HRTIMER_NORESTART;
 }
 
-static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
+				bool linux_wa)
 {
+	int64_t kernel_now, delta;
+	uint64_t guest_now;
+
+	/*
+	 * The guest provides the requested timeout in absolute nanoseconds
+	 * of the KVM clock — as *it* sees it, based on the scaled TSC and
+	 * the pvclock information provided by KVM.
+	 *
+	 * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
+	 * so use CLOCK_MONOTONIC.  In the timescales covered by timers, the
+	 * difference won't matter much as there is no cumulative effect.
+	 *
+	 * Calculate the time for some arbitrary point in time around "now"
+	 * in terms of both kvmclock and CLOCK_MONOTONIC.  Calculate the
+	 * delta between the kvmclock "now" value and the guest's requested
+	 * timeout, apply the "Linux workaround" described below, and add
+	 * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
+	 * the absolute CLOCK_MONOTONIC time at which the timer should
+	 * fire.
+	 */
+	if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
+	    static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+		uint64_t host_tsc, guest_tsc;
+
+		if (!IS_ENABLED(CONFIG_64BIT) ||
+		    !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
+			/*
+			 * Don't fall back to get_kvmclock_ns() because it's
+			 * broken; it has a systemic error in its results
+			 * because it scales directly from host TSC to
+			 * nanoseconds, and doesn't scale first to guest TSC
+			 * and *then* to nanoseconds as the guest does.
+			 *
+			 * There is a small error introduced here because time
+			 * continues to elapse between the ktime_get() and the
+			 * subsequent rdtsc().  But not the systemic drift due
+			 * to get_kvmclock_ns().
+			 */
+			kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
+			host_tsc = rdtsc();
+		}
+
+		/* Calculate the guest kvmclock as the guest would do it. */
+		guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
+		guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
+						  guest_tsc);
+	} else {
+		/*
+		 * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
+		 *
+		 * Also if the guest PV clock hasn't been set up yet, as is
+		 * likely to be the case during migration when the vCPU has
+		 * not been run yet.  It would be possible to calculate the
+		 * scaling factors properly in that case but there's not much
+		 * point in doing so.  The get_kvmclock_ns() drift accumulates
+		 * over time, so it's OK to use it at startup.  Besides, on
+		 * migration there's going to be a little bit of skew in the
+		 * precise moment at which timers fire anyway.  Often they'll
+		 * be in the "past" by the time the VM is running again after
+		 * migration.
+		 */
+		guest_now = get_kvmclock_ns(vcpu->kvm);
+		kernel_now = ktime_get();
+	}
+
+	delta = guest_abs - guest_now;
+
+	/*
+	 * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
+	 * negative absolute timeout values (caused by integer overflow), and
+	 * for values about 13 days in the future (2^50ns) which would be
+	 * caused by jiffies overflow.  For those cases, Xen sets the timeout
+	 * 100ms in the future (not *too* soon, since if a guest really did
+	 * set a long timeout on purpose we don't want to keep churning CPU
+	 * time by waking it up).  Emulate Xen's workaround when starting the
+	 * timer in response to __HYPERVISOR_set_timer_op.
+	 */
+	if (linux_wa &&
+	    unlikely((int64_t)guest_abs < 0 ||
+		     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+		delta = 100 * NSEC_PER_MSEC;
+		guest_abs = guest_now + delta;
+	}
+
 	/*
 	 * Avoid races with the old timer firing.  Checking timer_expires
 	 * to avoid calling hrtimer_cancel() will only have false positives
···
 	atomic_set(&vcpu->arch.xen.timer_pending, 0);
 	vcpu->arch.xen.timer_expires = guest_abs;
 
-	if (delta_ns <= 0) {
+	if (delta <= 0)
 		xen_timer_callback(&vcpu->arch.xen.timer);
-	} else {
-		ktime_t ktime_now = ktime_get();
+	else
 		hrtimer_start(&vcpu->arch.xen.timer,
-			      ktime_add_ns(ktime_now, delta_ns),
+			      ktime_add_ns(kernel_now, delta),
 			      HRTIMER_MODE_ABS_HARD);
-	}
 }
 
 static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
···
 		smp_wmb();
 	}
 
-	if (user_len2)
+	if (user_len2) {
+		kvm_gpc_mark_dirty_in_slot(gpc2);
 		read_unlock(&gpc2->lock);
+	}
 
+	kvm_gpc_mark_dirty_in_slot(gpc1);
 	read_unlock_irqrestore(&gpc1->lock, flags);
-
-	mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
-	if (user_len2)
-		mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
 }
 
 void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
···
 	kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
 }
 
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
 {
 	struct kvm_lapic_irq irq = { };
-	int r;
 
 	irq.dest_id = v->vcpu_id;
 	irq.vector = v->arch.xen.upcall_vector;
···
 	irq.delivery_mode = APIC_DM_FIXED;
 	irq.level = 1;
 
-	/* The fast version will always work for physical unicast */
-	WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+	kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
 }
 
 /*
···
 			     : "0" (evtchn_pending_sel32));
 		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
 	}
+
+	kvm_gpc_mark_dirty_in_slot(gpc);
 	read_unlock_irqrestore(&gpc->lock, flags);
 
 	/* For the per-vCPU lapic vector, deliver it as MSI. */
 	if (v->arch.xen.upcall_vector)
 		kvm_xen_inject_vcpu_vector(v);
-
-	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
 }
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
···
 	} else {
 		mutex_lock(&kvm->arch.xen.xen_lock);
 		kvm->arch.xen.long_mode = !!data->u.long_mode;
+
+		/*
+		 * Re-initialize shared_info to put the wallclock in the
+		 * correct place.  Whilst it's not necessary to do this
+		 * unless the mode is actually changed, it does no harm
+		 * to make the call anyway.
+		 */
+		r = kvm->arch.xen.shinfo_cache.active ?
+			kvm_xen_shared_info_init(kvm) : 0;
 		mutex_unlock(&kvm->arch.xen.xen_lock);
-		r = 0;
 	}
 	break;
 
 case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
+	int idx;
+
 	mutex_lock(&kvm->arch.xen.xen_lock);
-	r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+
+	idx = srcu_read_lock(&kvm->srcu);
+
+	if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
+		gfn_t gfn = data->u.shared_info.gfn;
+
+		if (gfn == KVM_XEN_INVALID_GFN) {
+			kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+			r = 0;
+		} else {
+			r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
+					     gfn_to_gpa(gfn), PAGE_SIZE);
+		}
+	} else {
+		void __user *hva = u64_to_user_ptr(data->u.shared_info.hva);
+
+		if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+			r = -EINVAL;
+		} else if (!hva) {
+			kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+			r = 0;
+		} else {
+			r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
+						 (unsigned long)hva, PAGE_SIZE);
+		}
+	}
+
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	if (!r && kvm->arch.xen.shinfo_cache.active)
+		r = kvm_xen_shared_info_init(kvm);
+
 	mutex_unlock(&kvm->arch.xen.xen_lock);
 	break;
-
+}
 case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
 	if (data->u.vector && data->u.vector < 0x10)
 		r = -EINVAL;
···
 	break;
 
 case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-	if (kvm->arch.xen.shinfo_cache.active)
+	if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
 		data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
 	else
 		data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
+	r = 0;
+	break;
+
+case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
+	if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
+		data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
+	else
+		data->u.shared_info.hva = 0;
 	r = 0;
 	break;
···
 
 	switch (data->type) {
 	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
 		/* No compat necessary here. */
 		BUILD_BUG_ON(sizeof(struct vcpu_info) !=
 			     sizeof(struct compat_vcpu_info));
 		BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
 			     offsetof(struct compat_vcpu_info, time));
 
-		if (data->u.gpa == KVM_XEN_INVALID_GPA) {
-			kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
-			r = 0;
-			break;
+		if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
+			if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+				kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+				r = 0;
+				break;
+			}
+
+			r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+					     data->u.gpa, sizeof(struct vcpu_info));
+		} else {
+			if (data->u.hva == 0) {
+				kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+				r = 0;
+				break;
+			}
+
+			r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
+						 data->u.hva, sizeof(struct vcpu_info));
 		}
 
-		r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
-				     data->u.gpa, sizeof(struct vcpu_info));
 		if (!r)
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
···
 
 		/* Start the timer if the new value has a valid vector+expiry. */
 		if (data->u.timer.port && data->u.timer.expires_ns)
-			kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
-					    data->u.timer.expires_ns -
-					    get_kvmclock_ns(vcpu->kvm));
+			kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
 
 		r = 0;
 		break;
···
 
 	switch (data->type) {
 	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
-		if (vcpu->arch.xen.vcpu_info_cache.active)
+		if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
 			data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
 		else
 			data->u.gpa = KVM_XEN_INVALID_GPA;
+		r = 0;
+		break;
+
+	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
+		if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
+			data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
+		else
+			data->u.hva = 0;
 		r = 0;
 		break;
···
 	u32 page_num = data & ~PAGE_MASK;
 	u64 page_addr = data & PAGE_MASK;
 	bool lm = is_long_mode(vcpu);
+	int r = 0;
 
-	/* Latch long_mode for shared_info pages etc. */
-	vcpu->kvm->arch.xen.long_mode = lm;
+	mutex_lock(&kvm->arch.xen.xen_lock);
+	if (kvm->arch.xen.long_mode != lm) {
+		kvm->arch.xen.long_mode = lm;
+
+		/*
+		 * Re-initialize shared_info to put the wallclock in the
+		 * correct place.
+		 */
+		if (kvm->arch.xen.shinfo_cache.active &&
+		    kvm_xen_shared_info_init(kvm))
+			r = 1;
+	}
+	mutex_unlock(&kvm->arch.xen.xen_lock);
+
+	if (r)
+		return r;
 
 	/*
 	 * If Xen hypercall intercept is enabled, fill the hypercall
···
 {
 	struct vcpu_set_singleshot_timer oneshot;
 	struct x86_exception e;
-	s64 delta;
 
 	if (!kvm_xen_timer_enabled(vcpu))
 		return false;
···
 		return true;
 	}
 
-	/* A delta <= 0 results in an immediate callback, which is what we want */
-	delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
-	kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+	kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
 	*r = 0;
 	return true;
···
 	if (!kvm_xen_timer_enabled(vcpu))
 		return false;
 
-	if (timeout) {
-		uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
-		int64_t delta = timeout - guest_now;
-
-		/* Xen has a 'Linux workaround' in do_set_timer_op() which
-		 * checks for negative absolute timeout values (caused by
-		 * integer overflow), and for values about 13 days in the
-		 * future (2^50ns) which would be caused by jiffies
-		 * overflow.  For those cases, it sets the timeout 100ms in
-		 * the future (not *too* soon, since if a guest really did
-		 * set a long timeout on purpose we don't want to keep
-		 * churning CPU time by waking it up).
-		 */
-		if (unlikely((int64_t)timeout < 0 ||
-			     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
-			delta = 100 * NSEC_PER_MSEC;
-			timeout = guest_now + delta;
-		}
-
-		kvm_xen_start_timer(vcpu, timeout, delta);
-	} else {
+	if (timeout)
+		kvm_xen_start_timer(vcpu, timeout, true);
+	else
 		kvm_xen_stop_timer(vcpu);
-	}
 
 	*r = 0;
 	return true;
···
 		WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
 	}
 
-	if (!vcpu->arch.xen.vcpu_info_cache.active)
-		return -EINVAL;
-
 	if (xe->port >= max_evtchn_port(kvm))
 		return -EINVAL;
···
 		mm_borrowed = true;
 	}
 
-	mutex_lock(&kvm->arch.xen.xen_lock);
-
 	/*
 	 * It is theoretically possible for the page to be unmapped
 	 * and the MMU notifier to invalidate the shared_info before
···
 		rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
 		srcu_read_unlock(&kvm->srcu, idx);
 	} while(!rc);
-
-	mutex_unlock(&kvm->arch.xen.xen_lock);
 
 	if (mm_borrowed)
 		kthread_unuse_mm(kvm->mm);
···
 
 	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
 
-	kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
-	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
-	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
-	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
+	kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
+	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
+	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
+	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
 }
 
 void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
···
 {
 	mutex_init(&kvm->arch.xen.xen_lock);
 	idr_init(&kvm->arch.xen.evtchn_ports);
-	kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
+	kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
 }
 
 void kvm_xen_destroy_vm(struct kvm *kvm)
+18
arch/x86/kvm/xen.h
···
 int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
 void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
 int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
···
 			   struct kvm_kernel_irq_routing_entry *e,
 			   const struct kvm_irq_routing_entry *ue);
 void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
+
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * The local APIC is being enabled. If the per-vCPU upcall vector is
+	 * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+	 * interrupt.
+	 */
+	if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+	    vcpu->arch.xen.vcpu_info_cache.active &&
+	    vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+		kvm_xen_inject_vcpu_vector(vcpu);
+}

 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
···
 }

 static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
 {
 }
+18 -1
drivers/vfio/pci/vfio_pci_core.c
···
 	/*
 	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
 	 * change vm_flags within the fault handler.  Set them now.
+	 *
+	 * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
+	 * allowing KVM stage 2 device mapping attributes to use Normal-NC
+	 * rather than DEVICE_nGnRE, which allows guest mappings
+	 * supporting write-combining attributes (WC). ARM does not
+	 * architecturally guarantee this is safe, and indeed some MMIO
+	 * regions like the GICv2 VCPU interface can trigger uncontained
+	 * faults if Normal-NC is used.
+	 *
+	 * To safely use VFIO in KVM the platform must guarantee full
+	 * safety in the guest where no action taken against a MMIO
+	 * mapping can trigger an uncontained failure. The assumption is
+	 * that most VFIO PCI platforms support this for both mapping types,
+	 * at least in common flows, based on some expectations of how
+	 * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
+	 * the VMA flags.
 	 */
-	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+			VM_DONTEXPAND | VM_DONTDUMP);
 	vma->vm_ops = &vfio_pci_mmap_ops;

 	return 0;
+1 -1
drivers/vfio/vfio.h
···
 }
 #endif

-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm);
 void vfio_device_put_kvm(struct vfio_device *device);
 #else
+2 -2
drivers/vfio/vfio_main.c
···
 #include <linux/fs.h>
 #include <linux/idr.h>
 #include <linux/iommu.h>
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
 #include <linux/kvm_host.h>
 #endif
 #include <linux/list.h>
···
 }
 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
 {
 	void (*pfn)(struct kvm *kvm);
-11
include/kvm/arm_pmu.h
···
 		vcpu->arch.pmu.events = *kvm_get_pmu_events();	\
 	} while (0)

-/*
- * Evaluates as true when emulating PMUv3p5, and false otherwise.
- */
-#define kvm_pmu_is_3p5(vcpu)						\
-	({								\
-		u64 val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1);	\
-		u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); \
-									\
-		pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5;			\
-	})
-
 u8 kvm_arm_pmu_get_pmuver_limit(void);
 u64 kvm_pmu_evtyper_mask(struct kvm *kvm);
 int kvm_arm_set_default_pmu(struct kvm *kvm);
···
 }

 #define kvm_vcpu_has_pmu(vcpu)		({ false; })
-#define kvm_pmu_is_3p5(vcpu)		({ false; })
 static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {}
 static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {}
 static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {}
+5 -4
include/kvm/arm_vgic.h
···
 #include <linux/spinlock.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
+#include <linux/xarray.h>
 #include <kvm/iodev.h>
 #include <linux/list.h>
 #include <linux/jump_label.h>
···
 struct vgic_irq {
 	raw_spinlock_t irq_lock;	/* Protects the content of the struct */
-	struct list_head lpi_list;	/* Used to link all LPIs together */
+	struct rcu_head rcu;
 	struct list_head ap_list;

 	struct kvm_vcpu *vcpu;		/* SGIs and PPIs: The VCPU
···
 	 */
 	u64			propbaser;

-	/* Protects the lpi_list and the count value below. */
+	/* Protects the lpi_list. */
 	raw_spinlock_t		lpi_list_lock;
-	struct list_head	lpi_list_head;
-	int			lpi_list_count;
+	struct xarray		lpi_xa;
+	atomic_t		lpi_count;

 	/* LPI translation cache */
 	struct list_head	lpi_translation_cache;
+1 -7
include/linux/bits.h
···
 #include <linux/const.h>
 #include <vdso/bits.h>
+#include <uapi/linux/bits.h>
 #include <asm/bitsperlong.h>

 #define BIT_MASK(nr)		(UL(1) << ((nr) % BITS_PER_LONG))
···
 #define GENMASK_INPUT_CHECK(h, l) 0
 #endif

-#define __GENMASK(h, l) \
-	(((~UL(0)) - (UL(1) << (l)) + 1) & \
-	 (~UL(0) >> (BITS_PER_LONG - 1 - (h))))
 #define GENMASK(h, l) \
 	(GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
-
-#define __GENMASK_ULL(h, l) \
-	(((~ULL(0)) - (ULL(1) << (l)) + 1) & \
-	 (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
 #define GENMASK_ULL(h, l) \
 	(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
+46 -14
include/linux/kvm_host.h
···
 #endif

+static inline bool kvm_is_error_gpa(gpa_t gpa)
+{
+	return gpa == INVALID_GPA;
+}
+
 #define KVM_ERR_PTR_BAD_PAGE	(ERR_PTR(-ENOENT))

 static inline bool is_error_page(struct page *page)
···
 	struct list_head link;
 	struct list_head queue;
 	struct kvm_vcpu *vcpu;
-	struct mm_struct *mm;
 	gpa_t cr2_or_gpa;
 	unsigned long addr;
 	struct kvm_arch_async_pf arch;
···
 *
 * @gpc:	   struct gfn_to_pfn_cache object.
 * @kvm:	   pointer to kvm instance.
- * @vcpu:	   vCPU to be used for marking pages dirty and to be woken on
- *		   invalidation.
- * @usage:	   indicates if the resulting host physical PFN is used while
- *		   the @vcpu is IN_GUEST_MODE (in which case invalidation of
- *		   the cache from MMU notifiers---but not for KVM memslot
- *		   changes!---will also force @vcpu to exit the guest and
- *		   refresh the cache); and/or if the PFN used directly
- *		   by KVM (and thus needs a kernel virtual mapping).
 *
 * This sets up a gfn_to_pfn_cache by initializing locks and assigning the
 * immutable attributes.  Note, the cache must be zero-allocated (or zeroed by
 * the caller before init).
 */
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
-		  struct kvm_vcpu *vcpu, enum pfn_cache_usage usage);
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm);

 /**
  * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
···
  * to ensure that the cache is valid before accessing the target page.
  */
 int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len);
+
+/**
+ * kvm_gpc_activate_hva - prepare a cached kernel mapping and HPA for a given HVA.
+ *
+ * @gpc:	   struct gfn_to_pfn_cache object.
+ * @hva:	   userspace virtual address to map.
+ * @len:	   sanity check; the range being access must fit a single page.
+ *
+ * @return:	   0 for success.
+ *		   -EINVAL for a mapping which would cross a page boundary.
+ *		   -EFAULT for an untranslatable guest physical address.
+ *
+ * The semantics of this function are the same as those of kvm_gpc_activate(). It
+ * merely bypasses a layer of address translation.
+ */
+int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long hva, unsigned long len);

 /**
  * kvm_gpc_check - check validity of a gfn_to_pfn_cache.
···
  * invocation.
  */
 void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc);
+
+static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc)
+{
+	return gpc->active && !kvm_is_error_gpa(gpc->gpa);
+}
+
+static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc)
+{
+	return gpc->active && kvm_is_error_gpa(gpc->gpa);
+}

 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
···
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
-int kvm_arch_create_vm_debugfs(struct kvm *kvm);
+void kvm_arch_create_vm_debugfs(struct kvm *kvm);

 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
···
 	return (hpa_t)pfn << PAGE_SHIFT;
 }

-static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
+static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa)
 {
 	unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));

-	return kvm_is_error_hva(hva);
+	return !kvm_is_error_hva(hva);
+}
+
+static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc)
+{
+	lockdep_assert_held(&gpc->lock);
+
+	if (!gpc->memslot)
+		return;
+
+	mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa));
 }

 enum kvm_stat_kind {
-8
include/linux/kvm_types.h
···

 typedef hfn_t kvm_pfn_t;

-enum pfn_cache_usage {
-	KVM_GUEST_USES_PFN = BIT(0),
-	KVM_HOST_USES_PFN  = BIT(1),
-	KVM_GUEST_AND_HOST_USE_PFN = KVM_GUEST_USES_PFN | KVM_HOST_USES_PFN,
-};
-
 struct gfn_to_hva_cache {
 	u64 generation;
 	gpa_t gpa;
···
 	unsigned long uhva;
 	struct kvm_memory_slot *memslot;
 	struct kvm *kvm;
-	struct kvm_vcpu *vcpu;
 	struct list_head list;
 	rwlock_t lock;
 	struct mutex refresh_lock;
 	void *khva;
 	kvm_pfn_t pfn;
-	enum pfn_cache_usage usage;
 	bool active;
 	bool valid;
 };
+14
include/linux/mm.h
···
 # define VM_UFFD_MINOR		VM_NONE
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */

+/*
+ * This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cachable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes
+ * if KVM does not lock down the memory type.
+ */
+#ifdef CONFIG_64BIT
+#define VM_ALLOW_ANY_UNCACHED_BIT	39
+#define VM_ALLOW_ANY_UNCACHED		BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#else
+#define VM_ALLOW_ANY_UNCACHED		VM_NONE
+#endif
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
+4
include/uapi/asm-generic/bitsperlong.h
···
 #endif
 #endif

+#ifndef __BITS_PER_LONG_LONG
+#define __BITS_PER_LONG_LONG 64
+#endif
+
 #endif /* _UAPI__ASM_GENERIC_BITS_PER_LONG */
+15
include/uapi/linux/bits.h
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* bits.h: Macros for dealing with bitmasks.  */
+
+#ifndef _UAPI_LINUX_BITS_H
+#define _UAPI_LINUX_BITS_H
+
+#define __GENMASK(h, l) \
+	(((~_UL(0)) - (_UL(1) << (l)) + 1) & \
+	 (~_UL(0) >> (__BITS_PER_LONG - 1 - (h))))
+
+#define __GENMASK_ULL(h, l) \
+	(((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \
+	 (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h))))
+
+#endif /* _UAPI_LINUX_BITS_H */
+5 -684
include/uapi/linux/kvm.h
···
 #define KVM_API_VERSION 12

+/*
+ * Backwards-compatible definitions.
+ */
+#define __KVM_HAVE_GUEST_DEBUG
+
 /* for KVM_SET_USER_MEMORY_REGION */
 struct kvm_userspace_memory_region {
 	__u32 slot;
···
 };

 #define KVM_PIT_SPEAKER_DUMMY     1
-
-struct kvm_s390_skeys {
-	__u64 start_gfn;
-	__u64 count;
-	__u64 skeydata_addr;
-	__u32 flags;
-	__u32 reserved[9];
-};
-
-#define KVM_S390_CMMA_PEEK (1 << 0)
-
-/**
- * kvm_s390_cmma_log - Used for CMMA migration.
- *
- * Used both for input and output.
- *
- * @start_gfn: Guest page number to start from.
- * @count: Size of the result buffer.
- * @flags: Control operation mode via KVM_S390_CMMA_* flags
- * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty
- *             pages are still remaining.
- * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set
- *        in the PGSTE.
- * @values: Pointer to the values buffer.
- *
- * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls.
- */
-struct kvm_s390_cmma_log {
-	__u64 start_gfn;
-	__u32 count;
-	__u32 flags;
-	union {
-		__u64 remaining;
-		__u64 mask;
-	};
-	__u64 values;
-};

 struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
···
 		__u32 ipb;
 	} s390_sieic;
 	/* KVM_EXIT_S390_RESET */
-#define KVM_S390_RESET_POR       1
-#define KVM_S390_RESET_CLEAR     2
-#define KVM_S390_RESET_SUBSYSTEM 4
-#define KVM_S390_RESET_CPU_INIT  8
-#define KVM_S390_RESET_IPL       16
 	__u64 s390_reset_flags;
 	/* KVM_EXIT_S390_UCONTROL */
 	struct {
···
 	__u8 pad[5];
 };

-/* for KVM_S390_MEM_OP */
-struct kvm_s390_mem_op {
-	/* in */
-	__u64 gaddr;		/* the guest address */
-	__u64 flags;		/* flags */
-	__u32 size;		/* amount of bytes */
-	__u32 op;		/* type of operation */
-	__u64 buf;		/* buffer in userspace */
-	union {
-		struct {
-			__u8 ar;	/* the access register number */
-			__u8 key;	/* access key, ignored if flag unset */
-			__u8 pad1[6];	/* ignored */
-			__u64 old_addr;	/* ignored if cmpxchg flag unset */
-		};
-		__u32 sida_offset; /* offset into the sida */
-		__u8 reserved[32]; /* ignored */
-	};
-};
-/* types for kvm_s390_mem_op->op */
-#define KVM_S390_MEMOP_LOGICAL_READ	0
-#define KVM_S390_MEMOP_LOGICAL_WRITE	1
-#define KVM_S390_MEMOP_SIDA_READ	2
-#define KVM_S390_MEMOP_SIDA_WRITE	3
-#define KVM_S390_MEMOP_ABSOLUTE_READ	4
-#define KVM_S390_MEMOP_ABSOLUTE_WRITE	5
-#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG	6
-
-/* flags for kvm_s390_mem_op->flags */
-#define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
-#define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
-#define KVM_S390_MEMOP_F_SKEY_PROTECTION	(1ULL << 2)
-
-/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */
-#define KVM_S390_MEMOP_EXTENSION_CAP_BASE	(1 << 0)
-#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG	(1 << 1)
-
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
 	/* in */
···
 	__u32 mp_state;
 };

-struct kvm_s390_psw {
-	__u64 mask;
-	__u64 addr;
-};
-
-/* valid values for type in kvm_s390_interrupt */
-#define KVM_S390_SIGP_STOP		0xfffe0000u
-#define KVM_S390_PROGRAM_INT		0xfffe0001u
-#define KVM_S390_SIGP_SET_PREFIX	0xfffe0002u
-#define KVM_S390_RESTART		0xfffe0003u
-#define KVM_S390_INT_PFAULT_INIT	0xfffe0004u
-#define KVM_S390_INT_PFAULT_DONE	0xfffe0005u
-#define KVM_S390_MCHK			0xfffe1000u
-#define KVM_S390_INT_CLOCK_COMP		0xffff1004u
-#define KVM_S390_INT_CPU_TIMER		0xffff1005u
-#define KVM_S390_INT_VIRTIO		0xffff2603u
-#define KVM_S390_INT_SERVICE		0xffff2401u
-#define KVM_S390_INT_EMERGENCY		0xffff1201u
-#define KVM_S390_INT_EXTERNAL_CALL	0xffff1202u
-/* Anything below 0xfffe0000u is taken by INT_IO */
-#define KVM_S390_INT_IO(ai,cssid,ssid,schid)   \
-	(((schid)) |			       \
-	 ((ssid) << 16) |		       \
-	 ((cssid) << 18) |		       \
-	 ((ai) << 26))
-#define KVM_S390_INT_IO_MIN		0x00000000u
-#define KVM_S390_INT_IO_MAX		0xfffdffffu
-#define KVM_S390_INT_IO_AI_MASK		0x04000000u
-
-
-struct kvm_s390_interrupt {
-	__u32 type;
-	__u32 parm;
-	__u64 parm64;
-};
-
-struct kvm_s390_io_info {
-	__u16 subchannel_id;
-	__u16 subchannel_nr;
-	__u32 io_int_parm;
-	__u32 io_int_word;
-};
-
-struct kvm_s390_ext_info {
-	__u32 ext_params;
-	__u32 pad;
-	__u64 ext_params2;
-};
-
-struct kvm_s390_pgm_info {
-	__u64 trans_exc_code;
-	__u64 mon_code;
-	__u64 per_address;
-	__u32 data_exc_code;
-	__u16 code;
-	__u16 mon_class_nr;
-	__u8 per_code;
-	__u8 per_atmid;
-	__u8 exc_access_id;
-	__u8 per_access_id;
-	__u8 op_access_id;
-#define KVM_S390_PGM_FLAGS_ILC_VALID	0x01
-#define KVM_S390_PGM_FLAGS_ILC_0	0x02
-#define KVM_S390_PGM_FLAGS_ILC_1	0x04
-#define KVM_S390_PGM_FLAGS_ILC_MASK	0x06
-#define KVM_S390_PGM_FLAGS_NO_REWIND	0x08
-	__u8 flags;
-	__u8 pad[2];
-};
-
-struct kvm_s390_prefix_info {
-	__u32 address;
-};
-
-struct kvm_s390_extcall_info {
-	__u16 code;
-};
-
-struct kvm_s390_emerg_info {
-	__u16 code;
-};
-
-#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01
-struct kvm_s390_stop_info {
-	__u32 flags;
-};
-
-struct kvm_s390_mchk_info {
-	__u64 cr14;
-	__u64 mcic;
-	__u64 failing_storage_address;
-	__u32 ext_damage_code;
-	__u32 pad;
-	__u8 fixed_logout[16];
-};
-
-struct kvm_s390_irq {
-	__u64 type;
-	union {
-		struct kvm_s390_io_info io;
-		struct kvm_s390_ext_info ext;
-		struct kvm_s390_pgm_info pgm;
-		struct kvm_s390_emerg_info emerg;
-		struct kvm_s390_extcall_info extcall;
-		struct kvm_s390_prefix_info prefix;
-		struct kvm_s390_stop_info stop;
-		struct kvm_s390_mchk_info mchk;
-		char reserved[64];
-	} u;
-};
-
-struct kvm_s390_irq_state {
-	__u64 buf;
-	__u32 flags;        /* will stay unused for compatibility reasons */
-	__u32 len;
-	__u32 reserved[4];  /* will stay unused for compatibility reasons */
-};
-
 /* for KVM_SET_GUEST_DEBUG */

 #define KVM_GUESTDBG_ENABLE		0x00000001
···
 	__u32 flags;
 	__u64 args[4];
 	__u8  pad[64];
-};
-
-/* for KVM_PPC_GET_PVINFO */
-
-#define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
-
-struct kvm_ppc_pvinfo {
-	/* out */
-	__u32 flags;
-	__u32 hcall[4];
-	__u8  pad[108];
-};
-
-/* for KVM_PPC_GET_SMMU_INFO */
-#define KVM_PPC_PAGE_SIZES_MAX_SZ	8
-
-struct kvm_ppc_one_page_size {
-	__u32 page_shift;	/* Page shift (or 0) */
-	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
-};
-
-struct kvm_ppc_one_seg_page_size {
-	__u32 page_shift;	/* Base page shift of segment (or 0) */
-	__u32 slb_enc;		/* SLB encoding for BookS */
-	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
-};
-
-#define KVM_PPC_PAGE_SIZES_REAL		0x00000001
-#define KVM_PPC_1T_SEGMENTS		0x00000002
-#define KVM_PPC_NO_HASH			0x00000004
-
-struct kvm_ppc_smmu_info {
-	__u64 flags;
-	__u32 slb_size;
-	__u16 data_keys;	/* # storage keys supported for data */
-	__u16 instr_keys;	/* # storage keys supported for instructions */
-	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
-};
-
-/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
-struct kvm_ppc_resize_hpt {
-	__u64 flags;
-	__u32 shift;
-	__u32 pad;
 };

 #define KVMIO 0xAE
···
 /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
 #define KVM_CAP_USER_NMI 22
-#ifdef __KVM_HAVE_GUEST_DEBUG
 #define KVM_CAP_SET_GUEST_DEBUG 23
-#endif
 #ifdef __KVM_HAVE_PIT
 #define KVM_CAP_REINJECT_CONTROL 24
 #endif
···
 #define KVM_CAP_GUEST_MEMFD 234
 #define KVM_CAP_VM_TYPES 235

-#ifdef KVM_CAP_IRQ_ROUTING
-
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
 	__u32 pin;
···
 	__u32 flags;
 	struct kvm_irq_routing_entry entries[];
 };
-
-#endif
-
-#ifdef KVM_CAP_MCE
-/* x86 MCE */
-struct kvm_x86_mce {
-	__u64 status;
-	__u64 addr;
-	__u64 misc;
-	__u64 mcg_status;
-	__u8 bank;
-	__u8 pad1[7];
-	__u64 pad2[3];
-};
-#endif
-
-#ifdef KVM_CAP_XEN_HVM
-#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
-#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
-#define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
-#define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
-#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
-#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND		(1 << 5)
-#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG	(1 << 6)
-#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE	(1 << 7)
-
-struct kvm_xen_hvm_config {
-	__u32 flags;
-	__u32 msr;
-	__u64 blob_addr_32;
-	__u64 blob_addr_64;
-	__u8 blob_size_32;
-	__u8 blob_size_64;
-	__u8 pad2[30];
-};
-#endif

 #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
 /*
···
 					struct kvm_userspace_memory_region2)

 /* enable ucontrol for s390 */
-struct kvm_s390_ucas_mapping {
-	__u64 user_addr;
-	__u64 vcpu_addr;
-	__u64 length;
-};
 #define KVM_S390_UCAS_MAP        _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
 #define KVM_S390_UCAS_UNMAP      _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
 #define KVM_S390_VCPU_FAULT	 _IOW(KVMIO, 0x52, unsigned long)
···
 #define KVM_S390_NORMAL_RESET	_IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET	_IO(KVMIO,   0xc4)

-struct kvm_s390_pv_sec_parm {
-	__u64 origin;
-	__u64 length;
-};
-
-struct kvm_s390_pv_unp {
-	__u64 addr;
-	__u64 size;
-	__u64 tweak;
-};
-
-enum pv_cmd_dmp_id {
-	KVM_PV_DUMP_INIT,
-	KVM_PV_DUMP_CONFIG_STOR_STATE,
-	KVM_PV_DUMP_COMPLETE,
-	KVM_PV_DUMP_CPU,
-};
-
-struct kvm_s390_pv_dmp {
-	__u64 subcmd;
-	__u64 buff_addr;
-	__u64 buff_len;
-	__u64 gaddr;		/* For dump storage state */
-	__u64 reserved[4];
-};
-
-enum pv_cmd_info_id {
-	KVM_PV_INFO_VM,
-	KVM_PV_INFO_DUMP,
-};
-
-struct kvm_s390_pv_info_dump {
-	__u64 dump_cpu_buffer_len;
-	__u64 dump_config_mem_buffer_per_1m;
-	__u64 dump_config_finalize_len;
-};
-
-struct kvm_s390_pv_info_vm {
-	__u64 inst_calls_list[4];
-	__u64 max_cpus;
-	__u64 max_guests;
-	__u64 max_guest_addr;
-	__u64 feature_indication;
-};
-
-struct kvm_s390_pv_info_header {
-	__u32 id;
-	__u32 len_max;
-	__u32 len_written;
-	__u32 reserved;
-};
-
-struct kvm_s390_pv_info {
-	struct kvm_s390_pv_info_header header;
-	union {
-		struct kvm_s390_pv_info_dump dump;
-		struct kvm_s390_pv_info_vm vm;
-	};
-};
-
-enum pv_cmd_id {
-	KVM_PV_ENABLE,
-	KVM_PV_DISABLE,
-	KVM_PV_SET_SEC_PARMS,
-	KVM_PV_UNPACK,
-	KVM_PV_VERIFY,
-	KVM_PV_PREP_RESET,
-	KVM_PV_UNSHARE_ALL,
-	KVM_PV_INFO,
-	KVM_PV_DUMP,
-	KVM_PV_ASYNC_CLEANUP_PREPARE,
-	KVM_PV_ASYNC_CLEANUP_PERFORM,
-};
-
-struct kvm_pv_cmd {
-	__u32 cmd;	/* Command to be executed */
-	__u16 rc;	/* Ultravisor return code */
-	__u16 rrc;	/* Ultravisor return reason code */
-	__u64 data;	/* Data or address */
-	__u32 flags;    /* flags for future extensions. Must be 0 for now */
-	__u32 reserved[3];
-};
-
 /* Available with KVM_CAP_S390_PROTECTED */
 #define KVM_S390_PV_COMMAND		_IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
···
 #define KVM_XEN_HVM_GET_ATTR	_IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr)
 #define KVM_XEN_HVM_SET_ATTR	_IOW(KVMIO,  0xc9, struct kvm_xen_hvm_attr)

-struct kvm_xen_hvm_attr {
-	__u16 type;
-	__u16 pad[3];
-	union {
-		__u8 long_mode;
-		__u8 vector;
-		__u8 runstate_update_flag;
-		struct {
-			__u64 gfn;
-#define KVM_XEN_INVALID_GFN ((__u64)-1)
-		} shared_info;
-		struct {
-			__u32 send_port;
-			__u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
-			__u32 flags;
-#define KVM_XEN_EVTCHN_DEASSIGN		(1 << 0)
-#define KVM_XEN_EVTCHN_UPDATE		(1 << 1)
-#define KVM_XEN_EVTCHN_RESET		(1 << 2)
-			/*
-			 * Events sent by the guest are either looped back to
-			 * the guest itself (potentially on a different port#)
-			 * or signalled via an eventfd.
-			 */
-			union {
-				struct {
-					__u32 port;
-					__u32 vcpu;
-					__u32 priority;
-				} port;
-				struct {
-					__u32 port; /* Zero for eventfd */
-					__s32 fd;
-				} eventfd;
-				__u32 padding[4];
-			} deliver;
-		} evtchn;
-		__u32 xen_version;
-		__u64 pad[8];
-	} u;
-};
-
-
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
-#define KVM_XEN_ATTR_TYPE_LONG_MODE		0x0
-#define KVM_XEN_ATTR_TYPE_SHARED_INFO		0x1
-#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR		0x2
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
-#define KVM_XEN_ATTR_TYPE_EVTCHN		0x3
-#define KVM_XEN_ATTR_TYPE_XEN_VERSION		0x4
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
-#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG	0x5
-
 /* Per-vCPU Xen attributes */
 #define KVM_XEN_VCPU_GET_ATTR	_IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
 #define KVM_XEN_VCPU_SET_ATTR	_IOW(KVMIO,  0xcb, struct kvm_xen_vcpu_attr)
···
 #define KVM_GET_SREGS2             _IOR(KVMIO,  0xcc, struct kvm_sregs2)
 #define KVM_SET_SREGS2             _IOW(KVMIO,  0xcd, struct kvm_sregs2)

-struct kvm_xen_vcpu_attr {
-	__u16 type;
-	__u16 pad[3];
-	union {
-		__u64 gpa;
-#define KVM_XEN_INVALID_GPA ((__u64)-1)
-		__u64 pad[8];
-		struct {
-			__u64 state;
-			__u64 state_entry_time;
-			__u64 time_running;
-			__u64 time_runnable;
-			__u64 time_blocked;
-			__u64 time_offline;
-		} runstate;
-		__u32 vcpu_id;
-		struct {
-			__u32 port;
-			__u32 priority;
-			__u64 expires_ns;
-		} timer;
-		__u8 vector;
-	} u;
-};
-
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
-#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO	0x0
-#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO	0x1
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR	0x2
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT	0x3
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA	0x4
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST	0x5
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
-#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID		0x6
-#define KVM_XEN_VCPU_ATTR_TYPE_TIMER		0x7
-#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR	0x8
-
-/* Secure Encrypted Virtualization command */
-enum sev_cmd_id {
-	/* Guest initialization commands */
-	KVM_SEV_INIT = 0,
-	KVM_SEV_ES_INIT,
-	/* Guest launch commands */
-	KVM_SEV_LAUNCH_START,
-	KVM_SEV_LAUNCH_UPDATE_DATA,
-	KVM_SEV_LAUNCH_UPDATE_VMSA,
-	KVM_SEV_LAUNCH_SECRET,
-	KVM_SEV_LAUNCH_MEASURE,
-	KVM_SEV_LAUNCH_FINISH,
-	/* Guest migration commands (outgoing) */
-	KVM_SEV_SEND_START,
-	KVM_SEV_SEND_UPDATE_DATA,
-	KVM_SEV_SEND_UPDATE_VMSA,
-	KVM_SEV_SEND_FINISH,
-	/* Guest migration commands (incoming) */
-	KVM_SEV_RECEIVE_START,
-	KVM_SEV_RECEIVE_UPDATE_DATA,
-	KVM_SEV_RECEIVE_UPDATE_VMSA,
-	KVM_SEV_RECEIVE_FINISH,
-	/* Guest status and debug commands */
-	KVM_SEV_GUEST_STATUS,
-	KVM_SEV_DBG_DECRYPT,
-	KVM_SEV_DBG_ENCRYPT,
-	/* Guest certificates commands */
-	KVM_SEV_CERT_EXPORT,
-	/* Attestation report */
-	KVM_SEV_GET_ATTESTATION_REPORT,
-	/* Guest Migration Extension */
-	KVM_SEV_SEND_CANCEL,
-
-	KVM_SEV_NR_MAX,
-};
-
-struct kvm_sev_cmd {
-	__u32 id;
-	__u64 data;
-	__u32 error;
-	__u32 sev_fd;
-};
-
-struct kvm_sev_launch_start {
-	__u32 handle;
-	__u32 policy;
-	__u64 dh_uaddr;
-	__u32 dh_len;
-	__u64 session_uaddr;
-	__u32 session_len;
-};
-
-struct kvm_sev_launch_update_data {
-	__u64 uaddr;
-	__u32 len;
-};
-
-
-struct kvm_sev_launch_secret {
-	__u64 hdr_uaddr;
-	__u32 hdr_len;
-	__u64 guest_uaddr;
-	__u32 guest_len;
-	__u64 trans_uaddr;
-	__u32 trans_len;
-};
-
-struct kvm_sev_launch_measure {
-	__u64 uaddr;
-	__u32 len;
-};
-
-struct kvm_sev_guest_status {
-	__u32 handle;
-	__u32 policy;
-	__u32 state;
-};
-
-struct kvm_sev_dbg {
-	__u64 src_uaddr;
-	__u64 dst_uaddr;
-	__u32 len;
-};
-
-struct kvm_sev_attestation_report {
-	__u8 mnonce[16];
-	__u64 uaddr;
-	__u32 len;
-};
-
-struct kvm_sev_send_start {
-	__u32 policy;
-	__u64 pdh_cert_uaddr;
-	__u32 pdh_cert_len;
-	__u64 plat_certs_uaddr;
-	__u32 plat_certs_len;
-	__u64 amd_certs_uaddr;
-	__u32 amd_certs_len;
-	__u64 session_uaddr;
-	__u32 session_len;
-};
-
-struct kvm_sev_send_update_data {
-	__u64 hdr_uaddr;
-	__u32 hdr_len;
-	__u64 guest_uaddr;
-	__u32 guest_len;
-	__u64 trans_uaddr;
-	__u32 trans_len;
-};
-
-struct kvm_sev_receive_start {
-	__u32 handle;
-	__u32 policy;
-	__u64 pdh_uaddr;
-	__u32 pdh_len;
-	__u64 session_uaddr;
-	__u32 session_len;
-};
-
-struct kvm_sev_receive_update_data {
-	__u64 hdr_uaddr;
-	__u32 hdr_len;
-	__u64 guest_uaddr;
-	__u32 guest_len;
-	__u64 trans_uaddr;
-	__u32 trans_len;
-};
-
-#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-
-struct kvm_assigned_pci_dev {
-	__u32 assigned_dev_id;
-	__u32 busnr;
-	__u32 devfn;
-	__u32 flags;
-	__u32 segnr;
-	union {
-		__u32 reserved[11];
-	};
-};
-
-#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
-#define
KVM_DEV_IRQ_HOST_MSI (1 << 1) 1569 - #define KVM_DEV_IRQ_HOST_MSIX (1 << 2) 1570 - 1571 - #define KVM_DEV_IRQ_GUEST_INTX (1 << 8) 1572 - #define KVM_DEV_IRQ_GUEST_MSI (1 << 9) 1573 - #define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) 1574 - 1575 - #define KVM_DEV_IRQ_HOST_MASK 0x00ff 1576 - #define KVM_DEV_IRQ_GUEST_MASK 0xff00 1577 - 1578 - struct kvm_assigned_irq { 1579 - __u32 assigned_dev_id; 1580 - __u32 host_irq; /* ignored (legacy field) */ 1581 - __u32 guest_irq; 1582 - __u32 flags; 1583 - union { 1584 - __u32 reserved[12]; 1585 - }; 1586 - }; 1587 - 1588 - struct kvm_assigned_msix_nr { 1589 - __u32 assigned_dev_id; 1590 - __u16 entry_nr; 1591 - __u16 padding; 1592 - }; 1593 - 1594 - #define KVM_MAX_MSIX_PER_DEV 256 1595 - struct kvm_assigned_msix_entry { 1596 - __u32 assigned_dev_id; 1597 - __u32 gsi; 1598 - __u16 entry; /* The index of entry in the MSI-X table */ 1599 - __u16 padding[3]; 1600 - }; 1601 - 1602 - #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) 1603 - #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) 1604 - 1605 - /* Available with KVM_CAP_ARM_USER_IRQ */ 1606 - 1607 - /* Bits for run->s.regs.device_irq_level */ 1608 - #define KVM_ARM_DEV_EL1_VTIMER (1 << 0) 1609 - #define KVM_ARM_DEV_EL1_PTIMER (1 << 1) 1610 - #define KVM_ARM_DEV_PMU (1 << 2) 1611 - 1612 - struct kvm_hyperv_eventfd { 1613 - __u32 conn_id; 1614 - __s32 fd; 1615 - __u32 flags; 1616 - __u32 padding[3]; 1617 - }; 1618 - 1619 - #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff 1620 - #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) 1621 1801 1622 1802 #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) 1623 1803 #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) ··· 1527 2179 1528 2180 /* Available with KVM_CAP_S390_ZPCI_OP */ 1529 2181 #define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) 1530 - 1531 - struct kvm_s390_zpci_op { 1532 - /* in */ 1533 - __u32 fh; /* target device */ 1534 - __u8 op; /* operation to perform */ 1535 - __u8 pad[3]; 1536 - union { 1537 - /* for 
KVM_S390_ZPCIOP_REG_AEN */ 1538 - struct { 1539 - __u64 ibv; /* Guest addr of interrupt bit vector */ 1540 - __u64 sb; /* Guest addr of summary bit */ 1541 - __u32 flags; 1542 - __u32 noi; /* Number of interrupts */ 1543 - __u8 isc; /* Guest interrupt subclass */ 1544 - __u8 sbo; /* Offset of guest summary bit vector */ 1545 - __u16 pad; 1546 - } reg_aen; 1547 - __u64 reserved[8]; 1548 - } u; 1549 - }; 1550 - 1551 - /* types for kvm_s390_zpci_op->op */ 1552 - #define KVM_S390_ZPCIOP_REG_AEN 0 1553 - #define KVM_S390_ZPCIOP_DEREG_AEN 1 1554 - 1555 - /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ 1556 - #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) 1557 2182 1558 2183 /* Available with KVM_CAP_MEMORY_ATTRIBUTES */ 1559 2184 #define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes)
+5 -1
scripts/gdb/linux/constants.py.in
···
130 130     LX_CONFIG(CONFIG_X86_MCE_AMD)
131 131     LX_CONFIG(CONFIG_X86_MCE)
132 132     LX_CONFIG(CONFIG_X86_IO_APIC)
133     -   LX_CONFIG(CONFIG_HAVE_KVM)
    133 +   /*
    134 +    * CONFIG_KVM can be "m" but it affects common code too. Use CONFIG_KVM_COMMON
    135 +    * as a proxy for IS_ENABLED(CONFIG_KVM).
    136 +    */
    137 +   LX_CONFIG_KVM = IS_BUILTIN(CONFIG_KVM_COMMON)
134 138     LX_CONFIG(CONFIG_NUMA)
135 139     LX_CONFIG(CONFIG_ARM64)
136 140     LX_CONFIG(CONFIG_ARM64_4K_PAGES)
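For context on the `IS_BUILTIN()`/`IS_ENABLED()` distinction the new comment relies on: `IS_BUILTIN()` is true only for built-in ("y") options, while `IS_ENABLED()` also accepts modules ("m"). Per the comment above, `CONFIG_KVM_COMMON` is a bool that is set whenever KVM is enabled at all, so `IS_BUILTIN(CONFIG_KVM_COMMON)` stands in for `IS_ENABLED(CONFIG_KVM)`. A tiny Python model of the tristate semantics (helper names are illustrative):

```python
# Model of the kernel's Kconfig predicates for a tristate value
# ("y" = built in, "m" = module, "n" = disabled).
def is_builtin(value: str) -> bool:
    return value == "y"

def is_module(value: str) -> bool:
    return value == "m"

def is_enabled(value: str) -> bool:
    return is_builtin(value) or is_module(value)

# CONFIG_KVM=m: IS_ENABLED() is true, but IS_BUILTIN(CONFIG_KVM) alone
# would wrongly report KVM as absent -- hence the CONFIG_KVM_COMMON proxy.
assert is_enabled("m") and not is_builtin("m")
```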
+1 -1
scripts/gdb/linux/interrupts.py
···
151 151         if cnt is not None:
152 152             text += "%*s: %10u\n" % (prec, "MIS", cnt['counter'])
153 153
154     -       if constants.LX_CONFIG_HAVE_KVM:
    154 +       if constants.LX_CONFIG_KVM:
155 155             text += x86_show_irqstat(prec, "PIN", 'kvm_posted_intr_ipis', 'Posted-interrupt notification event')
156 156             text += x86_show_irqstat(prec, "NPI", 'kvm_posted_intr_nested_ipis', 'Nested posted-interrupt event')
157 157             text += x86_show_irqstat(prec, "PIW", 'kvm_posted_intr_wakeup_ipis', 'Posted-interrupt wakeup event')
+541
tools/arch/riscv/include/asm/csr.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (C) 2015 Regents of the University of California 4 + */ 5 + 6 + #ifndef _ASM_RISCV_CSR_H 7 + #define _ASM_RISCV_CSR_H 8 + 9 + #include <linux/bits.h> 10 + 11 + /* Status register flags */ 12 + #define SR_SIE _AC(0x00000002, UL) /* Supervisor Interrupt Enable */ 13 + #define SR_MIE _AC(0x00000008, UL) /* Machine Interrupt Enable */ 14 + #define SR_SPIE _AC(0x00000020, UL) /* Previous Supervisor IE */ 15 + #define SR_MPIE _AC(0x00000080, UL) /* Previous Machine IE */ 16 + #define SR_SPP _AC(0x00000100, UL) /* Previously Supervisor */ 17 + #define SR_MPP _AC(0x00001800, UL) /* Previously Machine */ 18 + #define SR_SUM _AC(0x00040000, UL) /* Supervisor User Memory Access */ 19 + 20 + #define SR_FS _AC(0x00006000, UL) /* Floating-point Status */ 21 + #define SR_FS_OFF _AC(0x00000000, UL) 22 + #define SR_FS_INITIAL _AC(0x00002000, UL) 23 + #define SR_FS_CLEAN _AC(0x00004000, UL) 24 + #define SR_FS_DIRTY _AC(0x00006000, UL) 25 + 26 + #define SR_VS _AC(0x00000600, UL) /* Vector Status */ 27 + #define SR_VS_OFF _AC(0x00000000, UL) 28 + #define SR_VS_INITIAL _AC(0x00000200, UL) 29 + #define SR_VS_CLEAN _AC(0x00000400, UL) 30 + #define SR_VS_DIRTY _AC(0x00000600, UL) 31 + 32 + #define SR_XS _AC(0x00018000, UL) /* Extension Status */ 33 + #define SR_XS_OFF _AC(0x00000000, UL) 34 + #define SR_XS_INITIAL _AC(0x00008000, UL) 35 + #define SR_XS_CLEAN _AC(0x00010000, UL) 36 + #define SR_XS_DIRTY _AC(0x00018000, UL) 37 + 38 + #define SR_FS_VS (SR_FS | SR_VS) /* Vector and Floating-Point Unit */ 39 + 40 + #ifndef CONFIG_64BIT 41 + #define SR_SD _AC(0x80000000, UL) /* FS/VS/XS dirty */ 42 + #else 43 + #define SR_SD _AC(0x8000000000000000, UL) /* FS/VS/XS dirty */ 44 + #endif 45 + 46 + #ifdef CONFIG_64BIT 47 + #define SR_UXL _AC(0x300000000, UL) /* XLEN mask for U-mode */ 48 + #define SR_UXL_32 _AC(0x100000000, UL) /* XLEN = 32 for U-mode */ 49 + #define SR_UXL_64 _AC(0x200000000, UL) /* XLEN = 64 for U-mode */ 
50 + #endif 51 + 52 + /* SATP flags */ 53 + #ifndef CONFIG_64BIT 54 + #define SATP_PPN _AC(0x003FFFFF, UL) 55 + #define SATP_MODE_32 _AC(0x80000000, UL) 56 + #define SATP_MODE_SHIFT 31 57 + #define SATP_ASID_BITS 9 58 + #define SATP_ASID_SHIFT 22 59 + #define SATP_ASID_MASK _AC(0x1FF, UL) 60 + #else 61 + #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) 62 + #define SATP_MODE_39 _AC(0x8000000000000000, UL) 63 + #define SATP_MODE_48 _AC(0x9000000000000000, UL) 64 + #define SATP_MODE_57 _AC(0xa000000000000000, UL) 65 + #define SATP_MODE_SHIFT 60 66 + #define SATP_ASID_BITS 16 67 + #define SATP_ASID_SHIFT 44 68 + #define SATP_ASID_MASK _AC(0xFFFF, UL) 69 + #endif 70 + 71 + /* Exception cause high bit - is an interrupt if set */ 72 + #define CAUSE_IRQ_FLAG (_AC(1, UL) << (__riscv_xlen - 1)) 73 + 74 + /* Interrupt causes (minus the high bit) */ 75 + #define IRQ_S_SOFT 1 76 + #define IRQ_VS_SOFT 2 77 + #define IRQ_M_SOFT 3 78 + #define IRQ_S_TIMER 5 79 + #define IRQ_VS_TIMER 6 80 + #define IRQ_M_TIMER 7 81 + #define IRQ_S_EXT 9 82 + #define IRQ_VS_EXT 10 83 + #define IRQ_M_EXT 11 84 + #define IRQ_S_GEXT 12 85 + #define IRQ_PMU_OVF 13 86 + #define IRQ_LOCAL_MAX (IRQ_PMU_OVF + 1) 87 + #define IRQ_LOCAL_MASK GENMASK((IRQ_LOCAL_MAX - 1), 0) 88 + 89 + /* Exception causes */ 90 + #define EXC_INST_MISALIGNED 0 91 + #define EXC_INST_ACCESS 1 92 + #define EXC_INST_ILLEGAL 2 93 + #define EXC_BREAKPOINT 3 94 + #define EXC_LOAD_MISALIGNED 4 95 + #define EXC_LOAD_ACCESS 5 96 + #define EXC_STORE_MISALIGNED 6 97 + #define EXC_STORE_ACCESS 7 98 + #define EXC_SYSCALL 8 99 + #define EXC_HYPERVISOR_SYSCALL 9 100 + #define EXC_SUPERVISOR_SYSCALL 10 101 + #define EXC_INST_PAGE_FAULT 12 102 + #define EXC_LOAD_PAGE_FAULT 13 103 + #define EXC_STORE_PAGE_FAULT 15 104 + #define EXC_INST_GUEST_PAGE_FAULT 20 105 + #define EXC_LOAD_GUEST_PAGE_FAULT 21 106 + #define EXC_VIRTUAL_INST_FAULT 22 107 + #define EXC_STORE_GUEST_PAGE_FAULT 23 108 + 109 + /* PMP configuration */ 110 + #define PMP_R 0x01 111 + 
#define PMP_W 0x02 112 + #define PMP_X 0x04 113 + #define PMP_A 0x18 114 + #define PMP_A_TOR 0x08 115 + #define PMP_A_NA4 0x10 116 + #define PMP_A_NAPOT 0x18 117 + #define PMP_L 0x80 118 + 119 + /* HSTATUS flags */ 120 + #ifdef CONFIG_64BIT 121 + #define HSTATUS_VSXL _AC(0x300000000, UL) 122 + #define HSTATUS_VSXL_SHIFT 32 123 + #endif 124 + #define HSTATUS_VTSR _AC(0x00400000, UL) 125 + #define HSTATUS_VTW _AC(0x00200000, UL) 126 + #define HSTATUS_VTVM _AC(0x00100000, UL) 127 + #define HSTATUS_VGEIN _AC(0x0003f000, UL) 128 + #define HSTATUS_VGEIN_SHIFT 12 129 + #define HSTATUS_HU _AC(0x00000200, UL) 130 + #define HSTATUS_SPVP _AC(0x00000100, UL) 131 + #define HSTATUS_SPV _AC(0x00000080, UL) 132 + #define HSTATUS_GVA _AC(0x00000040, UL) 133 + #define HSTATUS_VSBE _AC(0x00000020, UL) 134 + 135 + /* HGATP flags */ 136 + #define HGATP_MODE_OFF _AC(0, UL) 137 + #define HGATP_MODE_SV32X4 _AC(1, UL) 138 + #define HGATP_MODE_SV39X4 _AC(8, UL) 139 + #define HGATP_MODE_SV48X4 _AC(9, UL) 140 + #define HGATP_MODE_SV57X4 _AC(10, UL) 141 + 142 + #define HGATP32_MODE_SHIFT 31 143 + #define HGATP32_VMID_SHIFT 22 144 + #define HGATP32_VMID GENMASK(28, 22) 145 + #define HGATP32_PPN GENMASK(21, 0) 146 + 147 + #define HGATP64_MODE_SHIFT 60 148 + #define HGATP64_VMID_SHIFT 44 149 + #define HGATP64_VMID GENMASK(57, 44) 150 + #define HGATP64_PPN GENMASK(43, 0) 151 + 152 + #define HGATP_PAGE_SHIFT 12 153 + 154 + #ifdef CONFIG_64BIT 155 + #define HGATP_PPN HGATP64_PPN 156 + #define HGATP_VMID_SHIFT HGATP64_VMID_SHIFT 157 + #define HGATP_VMID HGATP64_VMID 158 + #define HGATP_MODE_SHIFT HGATP64_MODE_SHIFT 159 + #else 160 + #define HGATP_PPN HGATP32_PPN 161 + #define HGATP_VMID_SHIFT HGATP32_VMID_SHIFT 162 + #define HGATP_VMID HGATP32_VMID 163 + #define HGATP_MODE_SHIFT HGATP32_MODE_SHIFT 164 + #endif 165 + 166 + /* VSIP & HVIP relation */ 167 + #define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT) 168 + #define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \ 169 + (_AC(1, UL) << 
IRQ_S_TIMER) | \ 170 + (_AC(1, UL) << IRQ_S_EXT)) 171 + 172 + /* AIA CSR bits */ 173 + #define TOPI_IID_SHIFT 16 174 + #define TOPI_IID_MASK GENMASK(11, 0) 175 + #define TOPI_IPRIO_MASK GENMASK(7, 0) 176 + #define TOPI_IPRIO_BITS 8 177 + 178 + #define TOPEI_ID_SHIFT 16 179 + #define TOPEI_ID_MASK GENMASK(10, 0) 180 + #define TOPEI_PRIO_MASK GENMASK(10, 0) 181 + 182 + #define ISELECT_IPRIO0 0x30 183 + #define ISELECT_IPRIO15 0x3f 184 + #define ISELECT_MASK GENMASK(8, 0) 185 + 186 + #define HVICTL_VTI BIT(30) 187 + #define HVICTL_IID GENMASK(27, 16) 188 + #define HVICTL_IID_SHIFT 16 189 + #define HVICTL_DPR BIT(9) 190 + #define HVICTL_IPRIOM BIT(8) 191 + #define HVICTL_IPRIO GENMASK(7, 0) 192 + 193 + /* xENVCFG flags */ 194 + #define ENVCFG_STCE (_AC(1, ULL) << 63) 195 + #define ENVCFG_PBMTE (_AC(1, ULL) << 62) 196 + #define ENVCFG_CBZE (_AC(1, UL) << 7) 197 + #define ENVCFG_CBCFE (_AC(1, UL) << 6) 198 + #define ENVCFG_CBIE_SHIFT 4 199 + #define ENVCFG_CBIE (_AC(0x3, UL) << ENVCFG_CBIE_SHIFT) 200 + #define ENVCFG_CBIE_ILL _AC(0x0, UL) 201 + #define ENVCFG_CBIE_FLUSH _AC(0x1, UL) 202 + #define ENVCFG_CBIE_INV _AC(0x3, UL) 203 + #define ENVCFG_FIOM _AC(0x1, UL) 204 + 205 + /* Smstateen bits */ 206 + #define SMSTATEEN0_AIA_IMSIC_SHIFT 58 207 + #define SMSTATEEN0_AIA_IMSIC (_ULL(1) << SMSTATEEN0_AIA_IMSIC_SHIFT) 208 + #define SMSTATEEN0_AIA_SHIFT 59 209 + #define SMSTATEEN0_AIA (_ULL(1) << SMSTATEEN0_AIA_SHIFT) 210 + #define SMSTATEEN0_AIA_ISEL_SHIFT 60 211 + #define SMSTATEEN0_AIA_ISEL (_ULL(1) << SMSTATEEN0_AIA_ISEL_SHIFT) 212 + #define SMSTATEEN0_HSENVCFG_SHIFT 62 213 + #define SMSTATEEN0_HSENVCFG (_ULL(1) << SMSTATEEN0_HSENVCFG_SHIFT) 214 + #define SMSTATEEN0_SSTATEEN0_SHIFT 63 215 + #define SMSTATEEN0_SSTATEEN0 (_ULL(1) << SMSTATEEN0_SSTATEEN0_SHIFT) 216 + 217 + /* symbolic CSR names: */ 218 + #define CSR_CYCLE 0xc00 219 + #define CSR_TIME 0xc01 220 + #define CSR_INSTRET 0xc02 221 + #define CSR_HPMCOUNTER3 0xc03 222 + #define CSR_HPMCOUNTER4 0xc04 223 + #define 
CSR_HPMCOUNTER5 0xc05 224 + #define CSR_HPMCOUNTER6 0xc06 225 + #define CSR_HPMCOUNTER7 0xc07 226 + #define CSR_HPMCOUNTER8 0xc08 227 + #define CSR_HPMCOUNTER9 0xc09 228 + #define CSR_HPMCOUNTER10 0xc0a 229 + #define CSR_HPMCOUNTER11 0xc0b 230 + #define CSR_HPMCOUNTER12 0xc0c 231 + #define CSR_HPMCOUNTER13 0xc0d 232 + #define CSR_HPMCOUNTER14 0xc0e 233 + #define CSR_HPMCOUNTER15 0xc0f 234 + #define CSR_HPMCOUNTER16 0xc10 235 + #define CSR_HPMCOUNTER17 0xc11 236 + #define CSR_HPMCOUNTER18 0xc12 237 + #define CSR_HPMCOUNTER19 0xc13 238 + #define CSR_HPMCOUNTER20 0xc14 239 + #define CSR_HPMCOUNTER21 0xc15 240 + #define CSR_HPMCOUNTER22 0xc16 241 + #define CSR_HPMCOUNTER23 0xc17 242 + #define CSR_HPMCOUNTER24 0xc18 243 + #define CSR_HPMCOUNTER25 0xc19 244 + #define CSR_HPMCOUNTER26 0xc1a 245 + #define CSR_HPMCOUNTER27 0xc1b 246 + #define CSR_HPMCOUNTER28 0xc1c 247 + #define CSR_HPMCOUNTER29 0xc1d 248 + #define CSR_HPMCOUNTER30 0xc1e 249 + #define CSR_HPMCOUNTER31 0xc1f 250 + #define CSR_CYCLEH 0xc80 251 + #define CSR_TIMEH 0xc81 252 + #define CSR_INSTRETH 0xc82 253 + #define CSR_HPMCOUNTER3H 0xc83 254 + #define CSR_HPMCOUNTER4H 0xc84 255 + #define CSR_HPMCOUNTER5H 0xc85 256 + #define CSR_HPMCOUNTER6H 0xc86 257 + #define CSR_HPMCOUNTER7H 0xc87 258 + #define CSR_HPMCOUNTER8H 0xc88 259 + #define CSR_HPMCOUNTER9H 0xc89 260 + #define CSR_HPMCOUNTER10H 0xc8a 261 + #define CSR_HPMCOUNTER11H 0xc8b 262 + #define CSR_HPMCOUNTER12H 0xc8c 263 + #define CSR_HPMCOUNTER13H 0xc8d 264 + #define CSR_HPMCOUNTER14H 0xc8e 265 + #define CSR_HPMCOUNTER15H 0xc8f 266 + #define CSR_HPMCOUNTER16H 0xc90 267 + #define CSR_HPMCOUNTER17H 0xc91 268 + #define CSR_HPMCOUNTER18H 0xc92 269 + #define CSR_HPMCOUNTER19H 0xc93 270 + #define CSR_HPMCOUNTER20H 0xc94 271 + #define CSR_HPMCOUNTER21H 0xc95 272 + #define CSR_HPMCOUNTER22H 0xc96 273 + #define CSR_HPMCOUNTER23H 0xc97 274 + #define CSR_HPMCOUNTER24H 0xc98 275 + #define CSR_HPMCOUNTER25H 0xc99 276 + #define CSR_HPMCOUNTER26H 0xc9a 277 + #define 
CSR_HPMCOUNTER27H 0xc9b 278 + #define CSR_HPMCOUNTER28H 0xc9c 279 + #define CSR_HPMCOUNTER29H 0xc9d 280 + #define CSR_HPMCOUNTER30H 0xc9e 281 + #define CSR_HPMCOUNTER31H 0xc9f 282 + 283 + #define CSR_SSCOUNTOVF 0xda0 284 + 285 + #define CSR_SSTATUS 0x100 286 + #define CSR_SIE 0x104 287 + #define CSR_STVEC 0x105 288 + #define CSR_SCOUNTEREN 0x106 289 + #define CSR_SENVCFG 0x10a 290 + #define CSR_SSTATEEN0 0x10c 291 + #define CSR_SSCRATCH 0x140 292 + #define CSR_SEPC 0x141 293 + #define CSR_SCAUSE 0x142 294 + #define CSR_STVAL 0x143 295 + #define CSR_SIP 0x144 296 + #define CSR_SATP 0x180 297 + 298 + #define CSR_STIMECMP 0x14D 299 + #define CSR_STIMECMPH 0x15D 300 + 301 + /* Supervisor-Level Window to Indirectly Accessed Registers (AIA) */ 302 + #define CSR_SISELECT 0x150 303 + #define CSR_SIREG 0x151 304 + 305 + /* Supervisor-Level Interrupts (AIA) */ 306 + #define CSR_STOPEI 0x15c 307 + #define CSR_STOPI 0xdb0 308 + 309 + /* Supervisor-Level High-Half CSRs (AIA) */ 310 + #define CSR_SIEH 0x114 311 + #define CSR_SIPH 0x154 312 + 313 + #define CSR_VSSTATUS 0x200 314 + #define CSR_VSIE 0x204 315 + #define CSR_VSTVEC 0x205 316 + #define CSR_VSSCRATCH 0x240 317 + #define CSR_VSEPC 0x241 318 + #define CSR_VSCAUSE 0x242 319 + #define CSR_VSTVAL 0x243 320 + #define CSR_VSIP 0x244 321 + #define CSR_VSATP 0x280 322 + #define CSR_VSTIMECMP 0x24D 323 + #define CSR_VSTIMECMPH 0x25D 324 + 325 + #define CSR_HSTATUS 0x600 326 + #define CSR_HEDELEG 0x602 327 + #define CSR_HIDELEG 0x603 328 + #define CSR_HIE 0x604 329 + #define CSR_HTIMEDELTA 0x605 330 + #define CSR_HCOUNTEREN 0x606 331 + #define CSR_HGEIE 0x607 332 + #define CSR_HENVCFG 0x60a 333 + #define CSR_HTIMEDELTAH 0x615 334 + #define CSR_HENVCFGH 0x61a 335 + #define CSR_HTVAL 0x643 336 + #define CSR_HIP 0x644 337 + #define CSR_HVIP 0x645 338 + #define CSR_HTINST 0x64a 339 + #define CSR_HGATP 0x680 340 + #define CSR_HGEIP 0xe12 341 + 342 + /* Virtual Interrupts and Interrupt Priorities (H-extension with AIA) */ 343 + #define 
CSR_HVIEN 0x608 344 + #define CSR_HVICTL 0x609 345 + #define CSR_HVIPRIO1 0x646 346 + #define CSR_HVIPRIO2 0x647 347 + 348 + /* VS-Level Window to Indirectly Accessed Registers (H-extension with AIA) */ 349 + #define CSR_VSISELECT 0x250 350 + #define CSR_VSIREG 0x251 351 + 352 + /* VS-Level Interrupts (H-extension with AIA) */ 353 + #define CSR_VSTOPEI 0x25c 354 + #define CSR_VSTOPI 0xeb0 355 + 356 + /* Hypervisor and VS-Level High-Half CSRs (H-extension with AIA) */ 357 + #define CSR_HIDELEGH 0x613 358 + #define CSR_HVIENH 0x618 359 + #define CSR_HVIPH 0x655 360 + #define CSR_HVIPRIO1H 0x656 361 + #define CSR_HVIPRIO2H 0x657 362 + #define CSR_VSIEH 0x214 363 + #define CSR_VSIPH 0x254 364 + 365 + /* Hypervisor stateen CSRs */ 366 + #define CSR_HSTATEEN0 0x60c 367 + #define CSR_HSTATEEN0H 0x61c 368 + 369 + #define CSR_MSTATUS 0x300 370 + #define CSR_MISA 0x301 371 + #define CSR_MIDELEG 0x303 372 + #define CSR_MIE 0x304 373 + #define CSR_MTVEC 0x305 374 + #define CSR_MENVCFG 0x30a 375 + #define CSR_MENVCFGH 0x31a 376 + #define CSR_MSCRATCH 0x340 377 + #define CSR_MEPC 0x341 378 + #define CSR_MCAUSE 0x342 379 + #define CSR_MTVAL 0x343 380 + #define CSR_MIP 0x344 381 + #define CSR_PMPCFG0 0x3a0 382 + #define CSR_PMPADDR0 0x3b0 383 + #define CSR_MVENDORID 0xf11 384 + #define CSR_MARCHID 0xf12 385 + #define CSR_MIMPID 0xf13 386 + #define CSR_MHARTID 0xf14 387 + 388 + /* Machine-Level Window to Indirectly Accessed Registers (AIA) */ 389 + #define CSR_MISELECT 0x350 390 + #define CSR_MIREG 0x351 391 + 392 + /* Machine-Level Interrupts (AIA) */ 393 + #define CSR_MTOPEI 0x35c 394 + #define CSR_MTOPI 0xfb0 395 + 396 + /* Virtual Interrupts for Supervisor Level (AIA) */ 397 + #define CSR_MVIEN 0x308 398 + #define CSR_MVIP 0x309 399 + 400 + /* Machine-Level High-Half CSRs (AIA) */ 401 + #define CSR_MIDELEGH 0x313 402 + #define CSR_MIEH 0x314 403 + #define CSR_MVIENH 0x318 404 + #define CSR_MVIPH 0x319 405 + #define CSR_MIPH 0x354 406 + 407 + #define CSR_VSTART 0x8 408 + #define 
CSR_VCSR 0xf 409 + #define CSR_VL 0xc20 410 + #define CSR_VTYPE 0xc21 411 + #define CSR_VLENB 0xc22 412 + 413 + #ifdef CONFIG_RISCV_M_MODE 414 + # define CSR_STATUS CSR_MSTATUS 415 + # define CSR_IE CSR_MIE 416 + # define CSR_TVEC CSR_MTVEC 417 + # define CSR_SCRATCH CSR_MSCRATCH 418 + # define CSR_EPC CSR_MEPC 419 + # define CSR_CAUSE CSR_MCAUSE 420 + # define CSR_TVAL CSR_MTVAL 421 + # define CSR_IP CSR_MIP 422 + 423 + # define CSR_IEH CSR_MIEH 424 + # define CSR_ISELECT CSR_MISELECT 425 + # define CSR_IREG CSR_MIREG 426 + # define CSR_IPH CSR_MIPH 427 + # define CSR_TOPEI CSR_MTOPEI 428 + # define CSR_TOPI CSR_MTOPI 429 + 430 + # define SR_IE SR_MIE 431 + # define SR_PIE SR_MPIE 432 + # define SR_PP SR_MPP 433 + 434 + # define RV_IRQ_SOFT IRQ_M_SOFT 435 + # define RV_IRQ_TIMER IRQ_M_TIMER 436 + # define RV_IRQ_EXT IRQ_M_EXT 437 + #else /* CONFIG_RISCV_M_MODE */ 438 + # define CSR_STATUS CSR_SSTATUS 439 + # define CSR_IE CSR_SIE 440 + # define CSR_TVEC CSR_STVEC 441 + # define CSR_SCRATCH CSR_SSCRATCH 442 + # define CSR_EPC CSR_SEPC 443 + # define CSR_CAUSE CSR_SCAUSE 444 + # define CSR_TVAL CSR_STVAL 445 + # define CSR_IP CSR_SIP 446 + 447 + # define CSR_IEH CSR_SIEH 448 + # define CSR_ISELECT CSR_SISELECT 449 + # define CSR_IREG CSR_SIREG 450 + # define CSR_IPH CSR_SIPH 451 + # define CSR_TOPEI CSR_STOPEI 452 + # define CSR_TOPI CSR_STOPI 453 + 454 + # define SR_IE SR_SIE 455 + # define SR_PIE SR_SPIE 456 + # define SR_PP SR_SPP 457 + 458 + # define RV_IRQ_SOFT IRQ_S_SOFT 459 + # define RV_IRQ_TIMER IRQ_S_TIMER 460 + # define RV_IRQ_EXT IRQ_S_EXT 461 + # define RV_IRQ_PMU IRQ_PMU_OVF 462 + # define SIP_LCOFIP (_AC(0x1, UL) << IRQ_PMU_OVF) 463 + 464 + #endif /* !CONFIG_RISCV_M_MODE */ 465 + 466 + /* IE/IP (Supervisor/Machine Interrupt Enable/Pending) flags */ 467 + #define IE_SIE (_AC(0x1, UL) << RV_IRQ_SOFT) 468 + #define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER) 469 + #define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT) 470 + 471 + #ifdef __ASSEMBLY__ 472 + #define 
__ASM_STR(x) x 473 + #else 474 + #define __ASM_STR(x) #x 475 + #endif 476 + 477 + #ifndef __ASSEMBLY__ 478 + 479 + #define csr_swap(csr, val) \ 480 + ({ \ 481 + unsigned long __v = (unsigned long)(val); \ 482 + __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1"\ 483 + : "=r" (__v) : "rK" (__v) \ 484 + : "memory"); \ 485 + __v; \ 486 + }) 487 + 488 + #define csr_read(csr) \ 489 + ({ \ 490 + register unsigned long __v; \ 491 + __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) \ 492 + : "=r" (__v) : \ 493 + : "memory"); \ 494 + __v; \ 495 + }) 496 + 497 + #define csr_write(csr, val) \ 498 + ({ \ 499 + unsigned long __v = (unsigned long)(val); \ 500 + __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" \ 501 + : : "rK" (__v) \ 502 + : "memory"); \ 503 + }) 504 + 505 + #define csr_read_set(csr, val) \ 506 + ({ \ 507 + unsigned long __v = (unsigned long)(val); \ 508 + __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1"\ 509 + : "=r" (__v) : "rK" (__v) \ 510 + : "memory"); \ 511 + __v; \ 512 + }) 513 + 514 + #define csr_set(csr, val) \ 515 + ({ \ 516 + unsigned long __v = (unsigned long)(val); \ 517 + __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" \ 518 + : : "rK" (__v) \ 519 + : "memory"); \ 520 + }) 521 + 522 + #define csr_read_clear(csr, val) \ 523 + ({ \ 524 + unsigned long __v = (unsigned long)(val); \ 525 + __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1"\ 526 + : "=r" (__v) : "rK" (__v) \ 527 + : "memory"); \ 528 + __v; \ 529 + }) 530 + 531 + #define csr_clear(csr, val) \ 532 + ({ \ 533 + unsigned long __v = (unsigned long)(val); \ 534 + __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" \ 535 + : : "rK" (__v) \ 536 + : "memory"); \ 537 + }) 538 + 539 + #endif /* __ASSEMBLY__ */ 540 + 541 + #endif /* _ASM_RISCV_CSR_H */
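As a worked example of the SATP field layout defined in the 64-bit branch above (MODE in bits 63:60, a 16-bit ASID at bit 44, PPN in bits 43:0), here is an illustrative Python decode; the sample register value is made up:

```python
# Decode a 64-bit RISC-V satp value using the csr.h field layout.
SATP_MODE_SHIFT = 60
SATP_ASID_SHIFT = 44
SATP_ASID_MASK = 0xFFFF
SATP_PPN = 0x00000FFFFFFFFFFF  # low 44 bits
SV39_MODE_FIELD = 0x8          # MODE field value for Sv39 (top nibble of SATP_MODE_39)

def decode_satp(satp: int):
    """Split a satp value into its (mode, asid, ppn) fields."""
    mode = satp >> SATP_MODE_SHIFT
    asid = (satp >> SATP_ASID_SHIFT) & SATP_ASID_MASK
    ppn = satp & SATP_PPN
    return mode, asid, ppn

# Made-up example: Sv39 translation, ASID 0x42, root table at PPN 0x12345.
satp = (SV39_MODE_FIELD << SATP_MODE_SHIFT) | (0x42 << SATP_ASID_SHIFT) | 0x12345
assert decode_satp(satp) == (0x8, 0x42, 0x12345)
```

The physical address of the root page table is then `ppn << HGATP_PAGE_SHIFT` style, i.e. `ppn * 4096`.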
+32
tools/arch/riscv/include/asm/vdso/processor.h
···
 1 + /* SPDX-License-Identifier: GPL-2.0-only */
 2 + #ifndef __ASM_VDSO_PROCESSOR_H
 3 + #define __ASM_VDSO_PROCESSOR_H
 4 +
 5 + #ifndef __ASSEMBLY__
 6 +
 7 + #include <asm-generic/barrier.h>
 8 +
 9 + static inline void cpu_relax(void)
10 + {
11 + #ifdef __riscv_muldiv
12 + 	int dummy;
13 + 	/* In lieu of a halt instruction, induce a long-latency stall. */
14 + 	__asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
15 + #endif
16 +
17 + #ifdef CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE
18 + 	/*
19 + 	 * Reduce instruction retirement.
20 + 	 * This assumes the PC changes.
21 + 	 */
22 + 	__asm__ __volatile__ ("pause");
23 + #else
24 + 	/* Encoding of the pause instruction */
25 + 	__asm__ __volatile__ (".4byte 0x100000F");
26 + #endif
27 + 	barrier();
28 + }
29 +
30 + #endif /* __ASSEMBLY__ */
31 +
32 + #endif /* __ASM_VDSO_PROCESSOR_H */
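The `.4byte 0x100000F` fallback above is the raw encoding of `pause`, which the Zihintpause extension defines as a FENCE with pred = W and every other field zero, so it executes as a harmless fence on cores without the extension. A quick Python check of that encoding, using the base FENCE instruction field layout:

```python
# Verify that 0x0100000F decodes as a FENCE-format instruction (MISC-MEM
# opcode) with pred = 0b0001 (W) and succ/rd/rs1/funct3/fm all zero,
# which is exactly the Zihintpause "pause" hint.
insn = 0x0100000F

opcode = insn & 0x7F          # bits 6:0
rd     = (insn >> 7) & 0x1F   # bits 11:7
funct3 = (insn >> 12) & 0x7   # bits 14:12
rs1    = (insn >> 15) & 0x1F  # bits 19:15
succ   = (insn >> 20) & 0xF   # bits 23:20
pred   = (insn >> 24) & 0xF   # bits 27:24
fm     = (insn >> 28) & 0xF   # bits 31:28

assert opcode == 0x0F                           # MISC-MEM (FENCE) opcode
assert (rd, funct3, rs1, succ, fm) == (0, 0, 0, 0, 0)
assert pred == 0b0001                           # predecessor set = W only
```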
+1 -1
tools/arch/x86/include/asm/irq_vectors.h
···
84 84   #define HYPERVISOR_CALLBACK_VECTOR	0xf3
85 85
86 86   /* Vector for KVM to deliver posted interrupt IPI */
87    -  #ifdef CONFIG_HAVE_KVM
   87 +  #if IS_ENABLED(CONFIG_KVM)
88 88   #define POSTED_INTR_VECTOR		0xf2
89 89   #define POSTED_INTR_WAKEUP_VECTOR	0xf1
90 90   #define POSTED_INTR_NESTED_VECTOR	0xf0
+21 -10
tools/testing/selftests/kvm/Makefile
··· 36 36 LIBKVM_x86_64 += lib/x86_64/handlers.S 37 37 LIBKVM_x86_64 += lib/x86_64/hyperv.c 38 38 LIBKVM_x86_64 += lib/x86_64/memstress.c 39 + LIBKVM_x86_64 += lib/x86_64/pmu.c 39 40 LIBKVM_x86_64 += lib/x86_64/processor.c 41 + LIBKVM_x86_64 += lib/x86_64/sev.c 40 42 LIBKVM_x86_64 += lib/x86_64/svm.c 41 43 LIBKVM_x86_64 += lib/x86_64/ucall.c 42 44 LIBKVM_x86_64 += lib/x86_64/vmx.c ··· 55 53 LIBKVM_s390x += lib/s390x/processor.c 56 54 LIBKVM_s390x += lib/s390x/ucall.c 57 55 56 + LIBKVM_riscv += lib/riscv/handlers.S 58 57 LIBKVM_riscv += lib/riscv/processor.c 59 58 LIBKVM_riscv += lib/riscv/ucall.c 60 59 ··· 83 80 TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test 84 81 TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test 85 82 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test 83 + TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test 86 84 TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test 87 85 TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test 88 86 TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test ··· 121 117 TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test 122 118 TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test 123 119 TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests 120 + TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test 124 121 TEST_GEN_PROGS_x86_64 += x86_64/amx_test 125 122 TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test 126 123 TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test ··· 148 143 TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test 149 144 150 145 TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs 151 - TEST_GEN_PROGS_aarch64 += aarch64/arch_timer 152 146 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions 153 147 TEST_GEN_PROGS_aarch64 += aarch64/hypercalls 154 148 TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test ··· 159 155 TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq 160 156 TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access 161 157 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test 158 + 
TEST_GEN_PROGS_aarch64 += arch_timer 162 159 TEST_GEN_PROGS_aarch64 += demand_paging_test 163 160 TEST_GEN_PROGS_aarch64 += dirty_log_test 164 161 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test ··· 189 184 TEST_GEN_PROGS_s390x += set_memory_region_test 190 185 TEST_GEN_PROGS_s390x += kvm_binary_stats_test 191 186 187 + TEST_GEN_PROGS_riscv += arch_timer 192 188 TEST_GEN_PROGS_riscv += demand_paging_test 193 189 TEST_GEN_PROGS_riscv += dirty_log_test 194 190 TEST_GEN_PROGS_riscv += get-reg-list ··· 200 194 TEST_GEN_PROGS_riscv += set_memory_region_test 201 195 TEST_GEN_PROGS_riscv += steal_time 202 196 197 + SPLIT_TESTS += arch_timer 203 198 SPLIT_TESTS += get-reg-list 204 199 205 200 TEST_PROGS += $(TEST_PROGS_$(ARCH_DIR)) ··· 224 217 LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include 225 218 endif 226 219 CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ 227 - -Wno-gnu-variable-sized-type-not-at-end -MD -MP \ 220 + -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ 228 221 -fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \ 229 222 -fno-builtin-strnlen \ 230 223 -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ ··· 267 260 LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) 268 261 LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING)) 269 262 LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) 270 - SPLIT_TESTS_TARGETS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS)) 271 - SPLIT_TESTS_OBJS := $(patsubst %, $(ARCH_DIR)/%.o, $(SPLIT_TESTS)) 263 + SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS)) 264 + SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH_DIR)/%.o, $(SPLIT_TESTS)) 272 265 273 266 TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS)) 274 267 TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED)) 275 268 TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ)) 276 269 TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS)) 277 - 
TEST_DEP_FILES += $(patsubst %.o, %.d, $(SPLIT_TESTS_OBJS)) 270 + TEST_DEP_FILES += $(patsubst %.o, %.d, $(SPLIT_TEST_GEN_OBJ)) 278 271 -include $(TEST_DEP_FILES) 279 272 280 - $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): %: %.o 273 + $(shell mkdir -p $(sort $(OUTPUT)/$(ARCH_DIR) $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) 274 + 275 + $(filter-out $(SPLIT_TEST_GEN_PROGS), $(TEST_GEN_PROGS)) \ 276 + $(TEST_GEN_PROGS_EXTENDED): %: %.o 281 277 $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@ 282 278 $(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c 283 279 $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ 284 280 285 - $(SPLIT_TESTS_TARGETS): %: %.o $(SPLIT_TESTS_OBJS) 281 + $(SPLIT_TEST_GEN_PROGS): $(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/$(ARCH_DIR)/%.o 286 282 $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $^ $(LDLIBS) -o $@ 283 + $(SPLIT_TEST_GEN_OBJ): $(OUTPUT)/$(ARCH_DIR)/%.o: $(ARCH_DIR)/%.c 284 + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ 287 285 288 286 EXTRA_CLEAN += $(GEN_HDRS) \ 289 287 $(LIBKVM_OBJS) \ 290 - $(SPLIT_TESTS_OBJS) \ 288 + $(SPLIT_TEST_GEN_OBJ) \ 291 289 $(TEST_DEP_FILES) \ 292 290 $(TEST_GEN_OBJ) \ 293 291 cscope.* 294 292 295 - $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) 296 293 $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c $(GEN_HDRS) 297 294 $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ 298 295 ··· 310 299 $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@ 311 300 312 301 $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) 313 - $(SPLIT_TESTS_OBJS): $(GEN_HDRS) 302 + $(SPLIT_TEST_GEN_OBJ): $(GEN_HDRS) 314 303 $(TEST_GEN_PROGS): $(LIBKVM_OBJS) 315 304 $(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS) 316 305 $(TEST_GEN_OBJ): $(GEN_HDRS)
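The SPLIT_TESTS plumbing above maps each shared test name to a program in `$(OUTPUT)` that links a common object with a per-arch object in `$(OUTPUT)/$(ARCH_DIR)`. A rough Python model of those `$(patsubst %, …)` substitutions; the directory names are illustrative, not the build's actual paths:

```python
# Model of the Makefile's patsubst rules for SPLIT_TESTS. Only the
# single-'%' pattern used here is supported.
def patsubst(replacement: str, names):
    """Model $(patsubst %, replacement, names): '%' expands to each name."""
    return [replacement.replace("%", n) for n in names]

OUTPUT, ARCH_DIR = "out", "aarch64"     # illustrative values
SPLIT_TESTS = ["arch_timer", "get-reg-list"]

# SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
progs = patsubst(f"{OUTPUT}/%", SPLIT_TESTS)
# SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH_DIR)/%.o, $(SPLIT_TESTS))
objs = patsubst(f"{OUTPUT}/{ARCH_DIR}/%.o", SPLIT_TESTS)

assert progs == ["out/arch_timer", "out/get-reg-list"]
assert objs == ["out/aarch64/arch_timer.o", "out/aarch64/get-reg-list.o"]
```

Keeping the per-arch objects under `$(OUTPUT)/$(ARCH_DIR)` (rather than in the source tree, as the old `SPLIT_TESTS_OBJS` did) is what lets out-of-tree builds of the split tests work.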
+22 -277
tools/testing/selftests/kvm/aarch64/arch_timer.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * arch_timer.c - Tests the aarch64 timer IRQ functionality 4 - * 5 3 * The test validates both the virtual and physical timer IRQs using 6 - * CVAL and TVAL registers. This consitutes the four stages in the test. 7 - * The guest's main thread configures the timer interrupt for a stage 8 - * and waits for it to fire, with a timeout equal to the timer period. 9 - * It asserts that the timeout doesn't exceed the timer period. 10 - * 11 - * On the other hand, upon receipt of an interrupt, the guest's interrupt 12 - * handler validates the interrupt by checking if the architectural state 13 - * is in compliance with the specifications. 14 - * 15 - * The test provides command-line options to configure the timer's 16 - * period (-p), number of vCPUs (-n), and iterations per stage (-i). 17 - * To stress-test the timer stack even more, an option to migrate the 18 - * vCPUs across pCPUs (-m), at a particular rate, is also provided. 4 + * CVAL and TVAL registers. 19 5 * 20 6 * Copyright (c) 2021, Google LLC. 
21 7 */ 22 8 #define _GNU_SOURCE 23 9 24 - #include <stdlib.h> 25 - #include <pthread.h> 26 - #include <linux/kvm.h> 27 - #include <linux/sizes.h> 28 - #include <linux/bitmap.h> 29 - #include <sys/sysinfo.h> 30 - 31 - #include "kvm_util.h" 32 - #include "processor.h" 33 - #include "delay.h" 34 10 #include "arch_timer.h" 11 + #include "delay.h" 35 12 #include "gic.h" 13 + #include "processor.h" 14 + #include "timer_test.h" 36 15 #include "vgic.h" 37 - 38 - #define NR_VCPUS_DEF 4 39 - #define NR_TEST_ITERS_DEF 5 40 - #define TIMER_TEST_PERIOD_MS_DEF 10 41 - #define TIMER_TEST_ERR_MARGIN_US 100 42 - #define TIMER_TEST_MIGRATION_FREQ_MS 2 43 - 44 - struct test_args { 45 - int nr_vcpus; 46 - int nr_iter; 47 - int timer_period_ms; 48 - int migration_freq_ms; 49 - struct kvm_arm_counter_offset offset; 50 - }; 51 - 52 - static struct test_args test_args = { 53 - .nr_vcpus = NR_VCPUS_DEF, 54 - .nr_iter = NR_TEST_ITERS_DEF, 55 - .timer_period_ms = TIMER_TEST_PERIOD_MS_DEF, 56 - .migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS, 57 - .offset = { .reserved = 1 }, 58 - }; 59 - 60 - #define msecs_to_usecs(msec) ((msec) * 1000LL) 61 16 62 17 #define GICD_BASE_GPA 0x8000000ULL 63 18 #define GICR_BASE_GPA 0x80A0000ULL ··· 25 70 GUEST_STAGE_MAX, 26 71 }; 27 72 28 - /* Shared variables between host and guest */ 29 - struct test_vcpu_shared_data { 30 - int nr_iter; 31 - enum guest_stage guest_stage; 32 - uint64_t xcnt; 33 - }; 34 - 35 - static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; 36 - static pthread_t pt_vcpu_run[KVM_MAX_VCPUS]; 37 - static struct test_vcpu_shared_data vcpu_shared_data[KVM_MAX_VCPUS]; 38 - 39 73 static int vtimer_irq, ptimer_irq; 40 - 41 - static unsigned long *vcpu_done_map; 42 - static pthread_mutex_t vcpu_done_map_lock; 43 74 44 75 static void 45 76 guest_configure_timer_action(struct test_vcpu_shared_data *shared_data) ··· 99 158 100 159 /* Basic 'timer condition met' check */ 101 160 __GUEST_ASSERT(xcnt >= cval, 102 - "xcnt = 0x%llx, cval = 0x%llx, xcnt_diff_us 
= 0x%llx", 161 + "xcnt = 0x%lx, cval = 0x%lx, xcnt_diff_us = 0x%lx", 103 162 xcnt, cval, xcnt_diff_us); 104 - __GUEST_ASSERT(xctl & CTL_ISTATUS, "xcnt = 0x%llx", xcnt); 163 + __GUEST_ASSERT(xctl & CTL_ISTATUS, "xctl = 0x%lx", xctl); 105 164 106 165 WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); 107 166 } ··· 131 190 132 191 /* Setup a timeout for the interrupt to arrive */ 133 192 udelay(msecs_to_usecs(test_args.timer_period_ms) + 134 - TIMER_TEST_ERR_MARGIN_US); 193 + test_args.timer_err_margin_us); 135 194 136 195 irq_iter = READ_ONCE(shared_data->nr_iter); 137 - GUEST_ASSERT_EQ(config_iter + 1, irq_iter); 196 + __GUEST_ASSERT(config_iter + 1 == irq_iter, 197 + "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" 198 + " Guest timer interrupt was not triggered within the specified\n" 199 + " interval, try to increase the error margin by [-e] option.\n", 200 + config_iter + 1, irq_iter); 138 201 } 139 202 } 140 203 ··· 167 222 GUEST_DONE(); 168 223 } 169 224 170 - static void *test_vcpu_run(void *arg) 171 - { 172 - unsigned int vcpu_idx = (unsigned long)arg; 173 - struct ucall uc; 174 - struct kvm_vcpu *vcpu = vcpus[vcpu_idx]; 175 - struct kvm_vm *vm = vcpu->vm; 176 - struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpu_idx]; 177 - 178 - vcpu_run(vcpu); 179 - 180 - /* Currently, any exit from guest is an indication of completion */ 181 - pthread_mutex_lock(&vcpu_done_map_lock); 182 - __set_bit(vcpu_idx, vcpu_done_map); 183 - pthread_mutex_unlock(&vcpu_done_map_lock); 184 - 185 - switch (get_ucall(vcpu, &uc)) { 186 - case UCALL_SYNC: 187 - case UCALL_DONE: 188 - break; 189 - case UCALL_ABORT: 190 - sync_global_from_guest(vm, *shared_data); 191 - fprintf(stderr, "Guest assert failed, vcpu %u; stage; %u; iter: %u\n", 192 - vcpu_idx, shared_data->guest_stage, shared_data->nr_iter); 193 - REPORT_GUEST_ASSERT(uc); 194 - break; 195 - default: 196 - TEST_FAIL("Unexpected guest exit"); 197 - } 198 - 199 - return NULL; 200 - } 201 - 202 - static uint32_t
test_get_pcpu(void) 203 - { 204 - uint32_t pcpu; 205 - unsigned int nproc_conf; 206 - cpu_set_t online_cpuset; 207 - 208 - nproc_conf = get_nprocs_conf(); 209 - sched_getaffinity(0, sizeof(cpu_set_t), &online_cpuset); 210 - 211 - /* Randomly find an available pCPU to place a vCPU on */ 212 - do { 213 - pcpu = rand() % nproc_conf; 214 - } while (!CPU_ISSET(pcpu, &online_cpuset)); 215 - 216 - return pcpu; 217 - } 218 - 219 - static int test_migrate_vcpu(unsigned int vcpu_idx) 220 - { 221 - int ret; 222 - cpu_set_t cpuset; 223 - uint32_t new_pcpu = test_get_pcpu(); 224 - 225 - CPU_ZERO(&cpuset); 226 - CPU_SET(new_pcpu, &cpuset); 227 - 228 - pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu_idx, new_pcpu); 229 - 230 - ret = pthread_setaffinity_np(pt_vcpu_run[vcpu_idx], 231 - sizeof(cpuset), &cpuset); 232 - 233 - /* Allow the error where the vCPU thread is already finished */ 234 - TEST_ASSERT(ret == 0 || ret == ESRCH, 235 - "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d", 236 - vcpu_idx, new_pcpu, ret); 237 - 238 - return ret; 239 - } 240 - 241 - static void *test_vcpu_migration(void *arg) 242 - { 243 - unsigned int i, n_done; 244 - bool vcpu_done; 245 - 246 - do { 247 - usleep(msecs_to_usecs(test_args.migration_freq_ms)); 248 - 249 - for (n_done = 0, i = 0; i < test_args.nr_vcpus; i++) { 250 - pthread_mutex_lock(&vcpu_done_map_lock); 251 - vcpu_done = test_bit(i, vcpu_done_map); 252 - pthread_mutex_unlock(&vcpu_done_map_lock); 253 - 254 - if (vcpu_done) { 255 - n_done++; 256 - continue; 257 - } 258 - 259 - test_migrate_vcpu(i); 260 - } 261 - } while (test_args.nr_vcpus != n_done); 262 - 263 - return NULL; 264 - } 265 - 266 - static void test_run(struct kvm_vm *vm) 267 - { 268 - pthread_t pt_vcpu_migration; 269 - unsigned int i; 270 - int ret; 271 - 272 - pthread_mutex_init(&vcpu_done_map_lock, NULL); 273 - vcpu_done_map = bitmap_zalloc(test_args.nr_vcpus); 274 - TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap"); 275 - 276 - for (i = 0; i < 
(unsigned long)test_args.nr_vcpus; i++) { 277 - ret = pthread_create(&pt_vcpu_run[i], NULL, test_vcpu_run, 278 - (void *)(unsigned long)i); 279 - TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread", i); 280 - } 281 - 282 - /* Spawn a thread to control the vCPU migrations */ 283 - if (test_args.migration_freq_ms) { 284 - srand(time(NULL)); 285 - 286 - ret = pthread_create(&pt_vcpu_migration, NULL, 287 - test_vcpu_migration, NULL); 288 - TEST_ASSERT(!ret, "Failed to create the migration pthread"); 289 - } 290 - 291 - 292 - for (i = 0; i < test_args.nr_vcpus; i++) 293 - pthread_join(pt_vcpu_run[i], NULL); 294 - 295 - if (test_args.migration_freq_ms) 296 - pthread_join(pt_vcpu_migration, NULL); 297 - 298 - bitmap_free(vcpu_done_map); 299 - } 300 - 301 225 static void test_init_timer_irq(struct kvm_vm *vm) 302 226 { 303 227 /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ ··· 183 369 184 370 static int gic_fd; 185 371 186 - static struct kvm_vm *test_vm_create(void) 372 + struct kvm_vm *test_vm_create(void) 187 373 { 188 374 struct kvm_vm *vm; 189 375 unsigned int i; ··· 194 380 vm_init_descriptor_tables(vm); 195 381 vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); 196 382 197 - if (!test_args.offset.reserved) { 198 - if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET)) 199 - vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &test_args.offset); 200 - else 383 + if (!test_args.reserved) { 384 + if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET)) { 385 + struct kvm_arm_counter_offset offset = { 386 + .counter_offset = test_args.counter_offset, 387 + .reserved = 0, 388 + }; 389 + vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &offset); 390 + } else 201 391 TEST_FAIL("no support for global offset"); 202 392 } 203 393 ··· 218 400 return vm; 219 401 } 220 402 221 - static void test_vm_cleanup(struct kvm_vm *vm) 403 + void test_vm_cleanup(struct kvm_vm *vm) 222 404 { 223 405 close(gic_fd); 224 406 kvm_vm_free(vm); 225 - } 226 - 227 - static void 
test_print_help(char *name) 228 - { 229 - pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n", 230 - name); 231 - pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n", 232 - NR_VCPUS_DEF, KVM_MAX_VCPUS); 233 - pr_info("\t-i: Number of iterations per stage (default: %u)\n", 234 - NR_TEST_ITERS_DEF); 235 - pr_info("\t-p: Periodicity (in ms) of the guest timer (default: %u)\n", 236 - TIMER_TEST_PERIOD_MS_DEF); 237 - pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n", 238 - TIMER_TEST_MIGRATION_FREQ_MS); 239 - pr_info("\t-o: Counter offset (in counter cycles, default: 0)\n"); 240 - pr_info("\t-h: print this help screen\n"); 241 - } 242 - 243 - static bool parse_args(int argc, char *argv[]) 244 - { 245 - int opt; 246 - 247 - while ((opt = getopt(argc, argv, "hn:i:p:m:o:")) != -1) { 248 - switch (opt) { 249 - case 'n': 250 - test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg); 251 - if (test_args.nr_vcpus > KVM_MAX_VCPUS) { 252 - pr_info("Max allowed vCPUs: %u\n", 253 - KVM_MAX_VCPUS); 254 - goto err; 255 - } 256 - break; 257 - case 'i': 258 - test_args.nr_iter = atoi_positive("Number of iterations", optarg); 259 - break; 260 - case 'p': 261 - test_args.timer_period_ms = atoi_positive("Periodicity", optarg); 262 - break; 263 - case 'm': 264 - test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg); 265 - break; 266 - case 'o': 267 - test_args.offset.counter_offset = strtol(optarg, NULL, 0); 268 - test_args.offset.reserved = 0; 269 - break; 270 - case 'h': 271 - default: 272 - goto err; 273 - } 274 - } 275 - 276 - return true; 277 - 278 - err: 279 - test_print_help(argv[0]); 280 - return false; 281 - } 282 - 283 - int main(int argc, char *argv[]) 284 - { 285 - struct kvm_vm *vm; 286 - 287 - if (!parse_args(argc, argv)) 288 - exit(KSFT_SKIP); 289 - 290 - __TEST_REQUIRE(!test_args.migration_freq_ms || get_nprocs() >= 2, 291 - "At least two physical CPUs 
needed for vCPU migration"); 292 - 293 - vm = test_vm_create(); 294 - test_run(vm); 295 - test_vm_cleanup(vm); 296 - 297 - return 0; 298 407 }
+1 -1
tools/testing/selftests/kvm/aarch64/debug-exceptions.c
··· 365 365 366 366 static void guest_ss_handler(struct ex_regs *regs) 367 367 { 368 - __GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%u'", ss_idx); 368 + __GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%lu'", ss_idx); 369 369 ss_addr[ss_idx++] = regs->pc; 370 370 regs->pstate |= SPSR_SS; 371 371 }
+2 -2
tools/testing/selftests/kvm/aarch64/hypercalls.c
··· 105 105 case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: 106 106 case TEST_STAGE_HVC_IFACE_FALSE_INFO: 107 107 __GUEST_ASSERT(res.a0 == SMCCC_RET_NOT_SUPPORTED, 108 - "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%llx, stage = %u", 108 + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", 109 109 res.a0, hc_info->func_id, hc_info->arg1, stage); 110 110 break; 111 111 case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: 112 112 __GUEST_ASSERT(res.a0 != SMCCC_RET_NOT_SUPPORTED, 113 - "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%llx, stage = %u", 113 + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", 114 114 res.a0, hc_info->func_id, hc_info->arg1, stage); 115 115 break; 116 116 default:
+1 -1
tools/testing/selftests/kvm/aarch64/page_fault_test.c
··· 292 292 293 293 static void no_dabt_handler(struct ex_regs *regs) 294 294 { 295 - GUEST_FAIL("Unexpected dabt, far_el1 = 0x%llx", read_sysreg(far_el1)); 295 + GUEST_FAIL("Unexpected dabt, far_el1 = 0x%lx", read_sysreg(far_el1)); 296 296 } 297 297 298 298 static void no_iabt_handler(struct ex_regs *regs)
+11 -7
tools/testing/selftests/kvm/aarch64/set_id_regs.c
··· 32 32 enum ftr_type type; 33 33 uint8_t shift; 34 34 uint64_t mask; 35 + /* 36 + * For FTR_EXACT, safe_val is used as the exact safe value. 37 + * For FTR_LOWER_SAFE, safe_val is used as the minimal safe value. 38 + */ 35 39 int64_t safe_val; 36 40 }; 37 41 ··· 69 65 70 66 static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { 71 67 S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), 72 - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, 0), 68 + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), 73 69 REG_FTR_END, 74 70 }; 75 71 76 72 static const struct reg_ftr_bits ftr_id_dfr0_el1[] = { 77 - S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, 0), 78 - REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, 0), 73 + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, ID_DFR0_EL1_PerfMon_PMUv3), 74 + REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, ID_DFR0_EL1_CopDbg_Armv8), 79 75 REG_FTR_END, 80 76 }; 81 77 ··· 228 224 { 229 225 uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); 230 226 231 - if (ftr_bits->type == FTR_UNSIGNED) { 227 + if (ftr_bits->sign == FTR_UNSIGNED) { 232 228 switch (ftr_bits->type) { 233 229 case FTR_EXACT: 234 230 ftr = ftr_bits->safe_val; 235 231 break; 236 232 case FTR_LOWER_SAFE: 237 - if (ftr > 0) 233 + if (ftr > ftr_bits->safe_val) 238 234 ftr--; 239 235 break; 240 236 case FTR_HIGHER_SAFE: ··· 256 252 ftr = ftr_bits->safe_val; 257 253 break; 258 254 case FTR_LOWER_SAFE: 259 - if (ftr > 0) 255 + if (ftr > ftr_bits->safe_val) 260 256 ftr--; 261 257 break; 262 258 case FTR_HIGHER_SAFE: ··· 280 276 { 281 277 uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); 282 278 283 - if (ftr_bits->type == FTR_UNSIGNED) { 279 + if (ftr_bits->sign == FTR_UNSIGNED) { 284 280 switch (ftr_bits->type) { 285 281 case FTR_EXACT: 286 282 ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1);
+6 -22
tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
··· 93 93 isb(); 94 94 } 95 95 96 - static inline void enable_counter(int idx) 97 - { 98 - uint64_t v = read_sysreg(pmcntenset_el0); 99 - 100 - write_sysreg(BIT(idx) | v, pmcntenset_el0); 101 - isb(); 102 - } 103 - 104 - static inline void disable_counter(int idx) 105 - { 106 - uint64_t v = read_sysreg(pmcntenset_el0); 107 - 108 - write_sysreg(BIT(idx) | v, pmcntenclr_el0); 109 - isb(); 110 - } 111 - 112 96 static void pmu_disable_reset(void) 113 97 { 114 98 uint64_t pmcr = read_sysreg(pmcr_el0); ··· 179 195 \ 180 196 if (set_expected) \ 181 197 __GUEST_ASSERT((_tval & mask), \ 182 - "tval: 0x%lx; mask: 0x%lx; set_expected: 0x%lx", \ 198 + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ 183 199 _tval, mask, set_expected); \ 184 200 else \ 185 201 __GUEST_ASSERT(!(_tval & mask), \ 186 - "tval: 0x%lx; mask: 0x%lx; set_expected: 0x%lx", \ 202 + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ 187 203 _tval, mask, set_expected); \ 188 204 } 189 205 ··· 270 286 acc->write_typer(pmc_idx, write_data); 271 287 read_data = acc->read_typer(pmc_idx); 272 288 __GUEST_ASSERT(read_data == write_data, 273 - "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", 289 + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", 274 290 pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); 275 291 276 292 /* ··· 281 297 282 298 /* The count value must be 0, as it is disabled and reset */ 283 299 __GUEST_ASSERT(read_data == 0, 284 - "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx", 300 + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx", 285 301 pmc_idx, PMC_ACC_TO_IDX(acc), read_data); 286 302 287 303 write_data = read_data + pmc_idx + 0x12345; 288 304 acc->write_cntr(pmc_idx, write_data); 289 305 read_data = acc->read_cntr(pmc_idx); 290 306 __GUEST_ASSERT(read_data == write_data, 291 - "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", 307 + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", 292 308 pmc_idx, 
PMC_ACC_TO_IDX(acc), read_data, write_data); 293 309 } 294 310 ··· 363 379 int i, pmc; 364 380 365 381 __GUEST_ASSERT(expected_pmcr_n <= ARMV8_PMU_MAX_GENERAL_COUNTERS, 366 - "Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%lx", 382 + "Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%x", 367 383 expected_pmcr_n, ARMV8_PMU_MAX_GENERAL_COUNTERS); 368 384 369 385 pmcr = read_sysreg(pmcr_el0);
+259
tools/testing/selftests/kvm/arch_timer.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * arch_timer.c - Tests the arch timer IRQ functionality 4 + * 5 + * The guest's main thread configures the timer interrupt and waits 6 + * for it to fire, with a timeout equal to the timer period. 7 + * It asserts that the timeout doesn't exceed the timer period plus 8 + * a user-configurable error margin (defaults to 100us). 9 + * 10 + * On the other hand, upon receipt of an interrupt, the guest's interrupt 11 + * handler validates the interrupt by checking if the architectural state 12 + * is in compliance with the specifications. 13 + * 14 + * The test provides command-line options to configure the timer's 15 + * period (-p), number of vCPUs (-n), iterations per stage (-i) and timer 16 + * interrupt arrival error margin (-e). To stress-test the timer stack 17 + * even more, an option to migrate the vCPUs across pCPUs (-m), at a 18 + * particular rate, is also provided. 19 + * 20 + * Copyright (c) 2021, Google LLC. 21 + */ 22 + 23 + #define _GNU_SOURCE 24 + 25 + #include <stdlib.h> 26 + #include <pthread.h> 27 + #include <linux/sizes.h> 28 + #include <linux/bitmap.h> 29 + #include <sys/sysinfo.h> 30 + 31 + #include "timer_test.h" 32 + 33 + struct test_args test_args = { 34 + .nr_vcpus = NR_VCPUS_DEF, 35 + .nr_iter = NR_TEST_ITERS_DEF, 36 + .timer_period_ms = TIMER_TEST_PERIOD_MS_DEF, 37 + .migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS, 38 + .timer_err_margin_us = TIMER_TEST_ERR_MARGIN_US, 39 + .reserved = 1, 40 + }; 41 + 42 + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; 43 + struct test_vcpu_shared_data vcpu_shared_data[KVM_MAX_VCPUS]; 44 + 45 + static pthread_t pt_vcpu_run[KVM_MAX_VCPUS]; 46 + static unsigned long *vcpu_done_map; 47 + static pthread_mutex_t vcpu_done_map_lock; 48 + 49 + static void *test_vcpu_run(void *arg) 50 + { 51 + unsigned int vcpu_idx = (unsigned long)arg; 52 + struct ucall uc; 53 + struct kvm_vcpu *vcpu = vcpus[vcpu_idx]; 54 + struct kvm_vm *vm = vcpu->vm; 55 + struct
test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpu_idx]; 56 + 57 + vcpu_run(vcpu); 58 + 59 + /* Currently, any exit from guest is an indication of completion */ 60 + pthread_mutex_lock(&vcpu_done_map_lock); 61 + __set_bit(vcpu_idx, vcpu_done_map); 62 + pthread_mutex_unlock(&vcpu_done_map_lock); 63 + 64 + switch (get_ucall(vcpu, &uc)) { 65 + case UCALL_SYNC: 66 + case UCALL_DONE: 67 + break; 68 + case UCALL_ABORT: 69 + sync_global_from_guest(vm, *shared_data); 70 + fprintf(stderr, "Guest assert failed, vcpu %u; stage; %u; iter: %u\n", 71 + vcpu_idx, shared_data->guest_stage, shared_data->nr_iter); 72 + REPORT_GUEST_ASSERT(uc); 73 + break; 74 + default: 75 + TEST_FAIL("Unexpected guest exit"); 76 + } 77 + 78 + pr_info("PASS(vCPU-%d).\n", vcpu_idx); 79 + 80 + return NULL; 81 + } 82 + 83 + static uint32_t test_get_pcpu(void) 84 + { 85 + uint32_t pcpu; 86 + unsigned int nproc_conf; 87 + cpu_set_t online_cpuset; 88 + 89 + nproc_conf = get_nprocs_conf(); 90 + sched_getaffinity(0, sizeof(cpu_set_t), &online_cpuset); 91 + 92 + /* Randomly find an available pCPU to place a vCPU on */ 93 + do { 94 + pcpu = rand() % nproc_conf; 95 + } while (!CPU_ISSET(pcpu, &online_cpuset)); 96 + 97 + return pcpu; 98 + } 99 + 100 + static int test_migrate_vcpu(unsigned int vcpu_idx) 101 + { 102 + int ret; 103 + cpu_set_t cpuset; 104 + uint32_t new_pcpu = test_get_pcpu(); 105 + 106 + CPU_ZERO(&cpuset); 107 + CPU_SET(new_pcpu, &cpuset); 108 + 109 + pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu_idx, new_pcpu); 110 + 111 + ret = pthread_setaffinity_np(pt_vcpu_run[vcpu_idx], 112 + sizeof(cpuset), &cpuset); 113 + 114 + /* Allow the error where the vCPU thread is already finished */ 115 + TEST_ASSERT(ret == 0 || ret == ESRCH, 116 + "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d", 117 + vcpu_idx, new_pcpu, ret); 118 + 119 + return ret; 120 + } 121 + 122 + static void *test_vcpu_migration(void *arg) 123 + { 124 + unsigned int i, n_done; 125 + bool vcpu_done; 126 + 127 + do { 128 + 
usleep(msecs_to_usecs(test_args.migration_freq_ms)); 129 + 130 + for (n_done = 0, i = 0; i < test_args.nr_vcpus; i++) { 131 + pthread_mutex_lock(&vcpu_done_map_lock); 132 + vcpu_done = test_bit(i, vcpu_done_map); 133 + pthread_mutex_unlock(&vcpu_done_map_lock); 134 + 135 + if (vcpu_done) { 136 + n_done++; 137 + continue; 138 + } 139 + 140 + test_migrate_vcpu(i); 141 + } 142 + } while (test_args.nr_vcpus != n_done); 143 + 144 + return NULL; 145 + } 146 + 147 + static void test_run(struct kvm_vm *vm) 148 + { 149 + pthread_t pt_vcpu_migration; 150 + unsigned int i; 151 + int ret; 152 + 153 + pthread_mutex_init(&vcpu_done_map_lock, NULL); 154 + vcpu_done_map = bitmap_zalloc(test_args.nr_vcpus); 155 + TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap"); 156 + 157 + for (i = 0; i < (unsigned long)test_args.nr_vcpus; i++) { 158 + ret = pthread_create(&pt_vcpu_run[i], NULL, test_vcpu_run, 159 + (void *)(unsigned long)i); 160 + TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread", i); 161 + } 162 + 163 + /* Spawn a thread to control the vCPU migrations */ 164 + if (test_args.migration_freq_ms) { 165 + srand(time(NULL)); 166 + 167 + ret = pthread_create(&pt_vcpu_migration, NULL, 168 + test_vcpu_migration, NULL); 169 + TEST_ASSERT(!ret, "Failed to create the migration pthread"); 170 + } 171 + 172 + 173 + for (i = 0; i < test_args.nr_vcpus; i++) 174 + pthread_join(pt_vcpu_run[i], NULL); 175 + 176 + if (test_args.migration_freq_ms) 177 + pthread_join(pt_vcpu_migration, NULL); 178 + 179 + bitmap_free(vcpu_done_map); 180 + } 181 + 182 + static void test_print_help(char *name) 183 + { 184 + pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n" 185 + "\t\t [-m migration_freq_ms] [-o counter_offset]\n" 186 + "\t\t [-e timer_err_margin_us]\n", name); 187 + pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n", 188 + NR_VCPUS_DEF, KVM_MAX_VCPUS); 189 + pr_info("\t-i: Number of iterations per stage (default: %u)\n", 190 + 
NR_TEST_ITERS_DEF); 191 + pr_info("\t-p: Periodicity (in ms) of the guest timer (default: %u)\n", 192 + TIMER_TEST_PERIOD_MS_DEF); 193 + pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n", 194 + TIMER_TEST_MIGRATION_FREQ_MS); 195 + pr_info("\t-o: Counter offset (in counter cycles, default: 0) [aarch64-only]\n"); 196 + pr_info("\t-e: Interrupt arrival error margin (in us) of the guest timer (default: %u)\n", 197 + TIMER_TEST_ERR_MARGIN_US); 198 + pr_info("\t-h: print this help screen\n"); 199 + } 200 + 201 + static bool parse_args(int argc, char *argv[]) 202 + { 203 + int opt; 204 + 205 + while ((opt = getopt(argc, argv, "hn:i:p:m:o:e:")) != -1) { 206 + switch (opt) { 207 + case 'n': 208 + test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg); 209 + if (test_args.nr_vcpus > KVM_MAX_VCPUS) { 210 + pr_info("Max allowed vCPUs: %u\n", 211 + KVM_MAX_VCPUS); 212 + goto err; 213 + } 214 + break; 215 + case 'i': 216 + test_args.nr_iter = atoi_positive("Number of iterations", optarg); 217 + break; 218 + case 'p': 219 + test_args.timer_period_ms = atoi_positive("Periodicity", optarg); 220 + break; 221 + case 'm': 222 + test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg); 223 + break; 224 + case 'e': 225 + test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg); 226 + break; 227 + case 'o': 228 + test_args.counter_offset = strtol(optarg, NULL, 0); 229 + test_args.reserved = 0; 230 + break; 231 + case 'h': 232 + default: 233 + goto err; 234 + } 235 + } 236 + 237 + return true; 238 + 239 + err: 240 + test_print_help(argv[0]); 241 + return false; 242 + } 243 + 244 + int main(int argc, char *argv[]) 245 + { 246 + struct kvm_vm *vm; 247 + 248 + if (!parse_args(argc, argv)) 249 + exit(KSFT_SKIP); 250 + 251 + __TEST_REQUIRE(!test_args.migration_freq_ms || get_nprocs() >= 2, 252 + "At least two physical CPUs needed for vCPU migration"); 253 + 254 + vm = test_vm_create(); 255 + test_run(vm); 
256 + test_vm_cleanup(vm); 257 + 258 + return 0; 259 + }
+3
tools/testing/selftests/kvm/guest_memfd_test.c
··· 167 167 TEST_ASSERT(ret != -1, "memfd fstat should succeed"); 168 168 TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size"); 169 169 TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers"); 170 + 171 + close(fd2); 172 + close(fd1); 170 173 } 171 174 172 175 int main(int argc, char *argv[])
+7
tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef SELFTEST_KVM_UTIL_ARCH_H 3 + #define SELFTEST_KVM_UTIL_ARCH_H 4 + 5 + struct kvm_vm_arch {}; 6 + 7 + #endif // SELFTEST_KVM_UTIL_ARCH_H
-4
tools/testing/selftests/kvm/include/aarch64/processor.h
··· 226 226 uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, 227 227 uint64_t arg6, struct arm_smccc_res *res); 228 228 229 - 230 - 231 - uint32_t guest_get_vcpuid(void); 232 - 233 229 #endif /* SELFTEST_KVM_PROCESSOR_H */
+36
tools/testing/selftests/kvm/include/kvm_test_harness.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Macros for defining a KVM test 4 + * 5 + * Copyright (C) 2022, Google LLC. 6 + */ 7 + 8 + #ifndef SELFTEST_KVM_TEST_HARNESS_H 9 + #define SELFTEST_KVM_TEST_HARNESS_H 10 + 11 + #include "kselftest_harness.h" 12 + 13 + #define KVM_ONE_VCPU_TEST_SUITE(name) \ 14 + FIXTURE(name) { \ 15 + struct kvm_vcpu *vcpu; \ 16 + }; \ 17 + \ 18 + FIXTURE_SETUP(name) { \ 19 + (void)vm_create_with_one_vcpu(&self->vcpu, NULL); \ 20 + } \ 21 + \ 22 + FIXTURE_TEARDOWN(name) { \ 23 + kvm_vm_free(self->vcpu->vm); \ 24 + } 25 + 26 + #define KVM_ONE_VCPU_TEST(suite, test, guestcode) \ 27 + static void __suite##_##test(struct kvm_vcpu *vcpu); \ 28 + \ 29 + TEST_F(suite, test) \ 30 + { \ 31 + vcpu_arch_set_entry_point(self->vcpu, guestcode); \ 32 + __suite##_##test(self->vcpu); \ 33 + } \ 34 + static void __suite##_##test(struct kvm_vcpu *vcpu) 35 + 36 + #endif /* SELFTEST_KVM_TEST_HARNESS_H */
+59 -8
tools/testing/selftests/kvm/include/kvm_util_base.h
··· 18 18 #include <linux/types.h> 19 19 20 20 #include <asm/atomic.h> 21 + #include <asm/kvm.h> 21 22 22 23 #include <sys/ioctl.h> 23 24 25 + #include "kvm_util_arch.h" 24 26 #include "sparsebit.h" 25 27 26 28 /* ··· 48 46 struct userspace_mem_region { 49 47 struct kvm_userspace_memory_region2 region; 50 48 struct sparsebit *unused_phy_pages; 49 + struct sparsebit *protected_phy_pages; 51 50 int fd; 52 51 off_t offset; 53 52 enum vm_mem_backing_src_type backing_src_type; ··· 93 90 struct kvm_vm { 94 91 int mode; 95 92 unsigned long type; 93 + uint8_t subtype; 96 94 int kvm_fd; 97 95 int fd; 98 96 unsigned int pgtable_levels; ··· 115 111 vm_vaddr_t idt; 116 112 vm_vaddr_t handlers; 117 113 uint32_t dirty_ring_size; 114 + uint64_t gpa_tag_mask; 115 + 116 + struct kvm_vm_arch arch; 118 117 119 118 /* Cache of information for binary stats interface */ 120 119 int stats_fd; ··· 198 191 }; 199 192 200 193 struct vm_shape { 201 - enum vm_guest_mode mode; 202 - unsigned int type; 194 + uint32_t type; 195 + uint8_t mode; 196 + uint8_t subtype; 197 + uint16_t padding; 203 198 }; 199 + 200 + kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); 204 201 205 202 #define VM_TYPE_DEFAULT 0 206 203 ··· 269 258 bool get_kvm_param_bool(const char *param); 270 259 bool get_kvm_intel_param_bool(const char *param); 271 260 bool get_kvm_amd_param_bool(const char *param); 261 + 262 + int get_kvm_param_integer(const char *param); 263 + int get_kvm_intel_param_integer(const char *param); 264 + int get_kvm_amd_param_integer(const char *param); 272 265 273 266 unsigned int kvm_check_cap(long cap); 274 267 ··· 579 564 uint64_t guest_paddr, uint32_t slot, uint64_t npages, 580 565 uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); 581 566 567 + #ifndef vm_arch_has_protected_memory 568 + static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) 569 + { 570 + return false; 571 + } 572 + #endif 573 + 582 574 void vm_mem_region_set_flags(struct kvm_vm *vm, 
uint32_t slot, uint32_t flags); 583 575 void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); 584 576 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); ··· 595 573 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); 596 574 vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, 597 575 enum kvm_mem_region_type type); 576 + vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, 577 + vm_vaddr_t vaddr_min, 578 + enum kvm_mem_region_type type); 598 579 vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); 599 580 vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, 600 581 enum kvm_mem_region_type type); ··· 609 584 void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); 610 585 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); 611 586 void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); 587 + 588 + 589 + static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) 590 + { 591 + return gpa & ~vm->gpa_tag_mask; 592 + } 612 593 613 594 void vcpu_run(struct kvm_vcpu *vcpu); 614 595 int _vcpu_run(struct kvm_vcpu *vcpu); ··· 858 827 859 828 vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, 860 829 uint32_t memslot); 861 - vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, 862 - vm_paddr_t paddr_min, uint32_t memslot); 830 + vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, 831 + vm_paddr_t paddr_min, uint32_t memslot, 832 + bool protected); 863 833 vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); 834 + 835 + static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, 836 + vm_paddr_t paddr_min, uint32_t memslot) 837 + { 838 + /* 839 + * By default, allocate memory as protected for VMs that support 840 + * protected memory, as the majority of memory for such VMs is 841 + * protected, i.e. using shared memory is effectively opt-in. 
842 + */ 843 + return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, 844 + vm_arch_has_protected_memory(vm)); 845 + } 864 846 865 847 /* 866 848 * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also ··· 1013 969 * Input Args: 1014 970 * vm - Virtual Machine 1015 971 * vcpu_id - The id of the VCPU to add to the VM. 1016 - * guest_code - The vCPU's entry point 1017 972 */ 1018 - struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 1019 - void *guest_code); 973 + struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); 974 + void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); 1020 975 1021 976 static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 1022 977 void *guest_code) 1023 978 { 1024 - return vm_arch_vcpu_add(vm, vcpu_id, guest_code); 979 + struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); 980 + 981 + vcpu_arch_set_entry_point(vcpu, guest_code); 982 + 983 + return vcpu; 1025 984 } 1026 985 1027 986 /* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */ ··· 1127 1080 void kvm_selftest_arch_init(void); 1128 1081 1129 1082 void kvm_arch_vm_post_create(struct kvm_vm *vm); 1083 + 1084 + bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); 1085 + 1086 + uint32_t guest_get_vcpuid(void); 1130 1087 1131 1088 #endif /* SELFTEST_KVM_UTIL_BASE_H */
+71
tools/testing/selftests/kvm/include/riscv/arch_timer.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * RISC-V Arch Timer(sstc) specific interface 4 + * 5 + * Copyright (c) 2024 Intel Corporation 6 + */ 7 + 8 + #ifndef SELFTEST_KVM_ARCH_TIMER_H 9 + #define SELFTEST_KVM_ARCH_TIMER_H 10 + 11 + #include <asm/csr.h> 12 + #include <asm/vdso/processor.h> 13 + 14 + static unsigned long timer_freq; 15 + 16 + #define msec_to_cycles(msec) \ 17 + ((timer_freq) * (uint64_t)(msec) / 1000) 18 + 19 + #define usec_to_cycles(usec) \ 20 + ((timer_freq) * (uint64_t)(usec) / 1000000) 21 + 22 + #define cycles_to_usec(cycles) \ 23 + ((uint64_t)(cycles) * 1000000 / (timer_freq)) 24 + 25 + static inline uint64_t timer_get_cycles(void) 26 + { 27 + return csr_read(CSR_TIME); 28 + } 29 + 30 + static inline void timer_set_cmp(uint64_t cval) 31 + { 32 + csr_write(CSR_STIMECMP, cval); 33 + } 34 + 35 + static inline uint64_t timer_get_cmp(void) 36 + { 37 + return csr_read(CSR_STIMECMP); 38 + } 39 + 40 + static inline void timer_irq_enable(void) 41 + { 42 + csr_set(CSR_SIE, IE_TIE); 43 + } 44 + 45 + static inline void timer_irq_disable(void) 46 + { 47 + csr_clear(CSR_SIE, IE_TIE); 48 + } 49 + 50 + static inline void timer_set_next_cmp_ms(uint32_t msec) 51 + { 52 + uint64_t now_ct = timer_get_cycles(); 53 + uint64_t next_ct = now_ct + msec_to_cycles(msec); 54 + 55 + timer_set_cmp(next_ct); 56 + } 57 + 58 + static inline void __delay(uint64_t cycles) 59 + { 60 + uint64_t start = timer_get_cycles(); 61 + 62 + while ((timer_get_cycles() - start) < cycles) 63 + cpu_relax(); 64 + } 65 + 66 + static inline void udelay(unsigned long usec) 67 + { 68 + __delay(usec_to_cycles(usec)); 69 + } 70 + 71 + #endif /* SELFTEST_KVM_ARCH_TIMER_H */
+7
tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef SELFTEST_KVM_UTIL_ARCH_H 3 + #define SELFTEST_KVM_UTIL_ARCH_H 4 + 5 + struct kvm_vm_arch {}; 6 + 7 + #endif // SELFTEST_KVM_UTIL_ARCH_H
+64 -8
tools/testing/selftests/kvm/include/riscv/processor.h
··· 7 7 #ifndef SELFTEST_KVM_PROCESSOR_H 8 8 #define SELFTEST_KVM_PROCESSOR_H 9 9 10 - #include "kvm_util.h" 11 10 #include <linux/stringify.h> 11 + #include <asm/csr.h> 12 + #include "kvm_util.h" 12 13 13 14 static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype, 14 15 uint64_t idx, uint64_t size) ··· 47 46 #define RISCV_SBI_EXT_REG(idx) __kvm_reg_id(KVM_REG_RISCV_SBI_EXT, \ 48 47 KVM_REG_RISCV_SBI_SINGLE, \ 49 48 idx, KVM_REG_SIZE_ULONG) 49 + 50 + bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext); 51 + 52 + struct ex_regs { 53 + unsigned long ra; 54 + unsigned long sp; 55 + unsigned long gp; 56 + unsigned long tp; 57 + unsigned long t0; 58 + unsigned long t1; 59 + unsigned long t2; 60 + unsigned long s0; 61 + unsigned long s1; 62 + unsigned long a0; 63 + unsigned long a1; 64 + unsigned long a2; 65 + unsigned long a3; 66 + unsigned long a4; 67 + unsigned long a5; 68 + unsigned long a6; 69 + unsigned long a7; 70 + unsigned long s2; 71 + unsigned long s3; 72 + unsigned long s4; 73 + unsigned long s5; 74 + unsigned long s6; 75 + unsigned long s7; 76 + unsigned long s8; 77 + unsigned long s9; 78 + unsigned long s10; 79 + unsigned long s11; 80 + unsigned long t3; 81 + unsigned long t4; 82 + unsigned long t5; 83 + unsigned long t6; 84 + unsigned long epc; 85 + unsigned long status; 86 + unsigned long cause; 87 + }; 88 + 89 + #define NR_VECTORS 2 90 + #define NR_EXCEPTIONS 32 91 + #define EC_MASK (NR_EXCEPTIONS - 1) 92 + 93 + typedef void(*exception_handler_fn)(struct ex_regs *); 94 + 95 + void vm_init_vector_tables(struct kvm_vm *vm); 96 + void vcpu_init_vector_tables(struct kvm_vcpu *vcpu); 97 + 98 + void vm_install_exception_handler(struct kvm_vm *vm, int vector, exception_handler_fn handler); 99 + 100 + void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handler); 50 101 51 102 /* L3 index Bit[47:39] */ 52 103 #define PGTBL_L3_INDEX_MASK 0x0000FF8000000000ULL ··· 154 101 #define PGTBL_PAGE_SIZE PGTBL_L0_BLOCK_SIZE 155 102 
#define PGTBL_PAGE_SIZE_SHIFT PGTBL_L0_BLOCK_SHIFT 156 103 157 - #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) 158 - #define SATP_MODE_39 _AC(0x8000000000000000, UL) 159 - #define SATP_MODE_48 _AC(0x9000000000000000, UL) 160 - #define SATP_ASID_BITS 16 161 - #define SATP_ASID_SHIFT 44 162 - #define SATP_ASID_MASK _AC(0xFFFF, UL) 163 - 164 104 /* SBI return error codes */ 165 105 #define SBI_SUCCESS 0 166 106 #define SBI_ERR_FAILURE -1 ··· 192 146 unsigned long arg5); 193 147 194 148 bool guest_sbi_probe_extension(int extid, long *out_val); 149 + 150 + static inline void local_irq_enable(void) 151 + { 152 + csr_set(CSR_SSTATUS, SR_SIE); 153 + } 154 + 155 + static inline void local_irq_disable(void) 156 + { 157 + csr_clear(CSR_SSTATUS, SR_SIE); 158 + } 195 159 196 160 #endif /* SELFTEST_KVM_PROCESSOR_H */
+7
tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef SELFTEST_KVM_UTIL_ARCH_H 3 + #define SELFTEST_KVM_UTIL_ARCH_H 4 + 5 + struct kvm_vm_arch {}; 6 + 7 + #endif // SELFTEST_KVM_UTIL_ARCH_H
+38 -18
tools/testing/selftests/kvm/include/sparsebit.h
··· 30 30 31 31 struct sparsebit *sparsebit_alloc(void); 32 32 void sparsebit_free(struct sparsebit **sbitp); 33 - void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src); 33 + void sparsebit_copy(struct sparsebit *dstp, const struct sparsebit *src); 34 34 35 - bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx); 36 - bool sparsebit_is_set_num(struct sparsebit *sbit, 35 + bool sparsebit_is_set(const struct sparsebit *sbit, sparsebit_idx_t idx); 36 + bool sparsebit_is_set_num(const struct sparsebit *sbit, 37 37 sparsebit_idx_t idx, sparsebit_num_t num); 38 - bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx); 39 - bool sparsebit_is_clear_num(struct sparsebit *sbit, 38 + bool sparsebit_is_clear(const struct sparsebit *sbit, sparsebit_idx_t idx); 39 + bool sparsebit_is_clear_num(const struct sparsebit *sbit, 40 40 sparsebit_idx_t idx, sparsebit_num_t num); 41 - sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit); 42 - bool sparsebit_any_set(struct sparsebit *sbit); 43 - bool sparsebit_any_clear(struct sparsebit *sbit); 44 - bool sparsebit_all_set(struct sparsebit *sbit); 45 - bool sparsebit_all_clear(struct sparsebit *sbit); 46 - sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit); 47 - sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit); 48 - sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev); 49 - sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev); 50 - sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit, 41 + sparsebit_num_t sparsebit_num_set(const struct sparsebit *sbit); 42 + bool sparsebit_any_set(const struct sparsebit *sbit); 43 + bool sparsebit_any_clear(const struct sparsebit *sbit); 44 + bool sparsebit_all_set(const struct sparsebit *sbit); 45 + bool sparsebit_all_clear(const struct sparsebit *sbit); 46 + sparsebit_idx_t sparsebit_first_set(const struct sparsebit *sbit); 47 + sparsebit_idx_t sparsebit_first_clear(const 
struct sparsebit *sbit); 48 + sparsebit_idx_t sparsebit_next_set(const struct sparsebit *sbit, sparsebit_idx_t prev); 49 + sparsebit_idx_t sparsebit_next_clear(const struct sparsebit *sbit, sparsebit_idx_t prev); 50 + sparsebit_idx_t sparsebit_next_set_num(const struct sparsebit *sbit, 51 51 sparsebit_idx_t start, sparsebit_num_t num); 52 - sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit, 52 + sparsebit_idx_t sparsebit_next_clear_num(const struct sparsebit *sbit, 53 53 sparsebit_idx_t start, sparsebit_num_t num); 54 54 55 55 void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx); ··· 62 62 sparsebit_idx_t start, sparsebit_num_t num); 63 63 void sparsebit_clear_all(struct sparsebit *sbitp); 64 64 65 - void sparsebit_dump(FILE *stream, struct sparsebit *sbit, 65 + void sparsebit_dump(FILE *stream, const struct sparsebit *sbit, 66 66 unsigned int indent); 67 - void sparsebit_validate_internal(struct sparsebit *sbit); 67 + void sparsebit_validate_internal(const struct sparsebit *sbit); 68 + 69 + /* 70 + * Iterate over the inclusive set ranges within sparsebit @s. In each iteration, 71 + * @range_begin and @range_end will take the beginning and end of the set 72 + * range, which are of type sparsebit_idx_t. 73 + * 74 + * For example, if the range [3, 7] (inclusive) is set, within the 75 + * iteration, @range_begin will take the value 3 and @range_end will take 76 + * the value 7. 77 + * 78 + * Ensure that there is at least one bit set before using this macro, e.g. 79 + * by checking with sparsebit_any_set(), because sparsebit_first_set() will 80 + * abort if none are set. 81 + */ 82 + #define sparsebit_for_each_set_range(s, range_begin, range_end) \ 83 + for (range_begin = sparsebit_first_set(s), \ 84 + range_end = sparsebit_next_clear(s, range_begin) - 1; \ 85 + range_begin && range_end; \ 86 + range_begin = sparsebit_next_set(s, range_end), \ 87 + range_end = sparsebit_next_clear(s, range_begin) - 1) 68 88 69 89 #ifdef __cplusplus 70 90 }
+2
tools/testing/selftests/kvm/include/test_util.h
··· 20 20 #include <sys/mman.h> 21 21 #include "kselftest.h" 22 22 23 + #define msecs_to_usecs(msec) ((msec) * 1000ULL) 24 + 23 25 static inline int _no_printf(const char *format, ...) { return 0; } 24 26 25 27 #ifdef DEBUG
+45
tools/testing/selftests/kvm/include/timer_test.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * timer test specific header 4 + * 5 + * Copyright (C) 2018, Google LLC 6 + */ 7 + 8 + #ifndef SELFTEST_KVM_TIMER_TEST_H 9 + #define SELFTEST_KVM_TIMER_TEST_H 10 + 11 + #include "kvm_util.h" 12 + 13 + #define NR_VCPUS_DEF 4 14 + #define NR_TEST_ITERS_DEF 5 15 + #define TIMER_TEST_PERIOD_MS_DEF 10 16 + #define TIMER_TEST_ERR_MARGIN_US 100 17 + #define TIMER_TEST_MIGRATION_FREQ_MS 2 18 + 19 + /* Timer test cmdline parameters */ 20 + struct test_args { 21 + uint32_t nr_vcpus; 22 + uint32_t nr_iter; 23 + uint32_t timer_period_ms; 24 + uint32_t migration_freq_ms; 25 + uint32_t timer_err_margin_us; 26 + /* Members of struct kvm_arm_counter_offset */ 27 + uint64_t counter_offset; 28 + uint64_t reserved; 29 + }; 30 + 31 + /* Shared variables between host and guest */ 32 + struct test_vcpu_shared_data { 33 + uint32_t nr_iter; 34 + int guest_stage; 35 + uint64_t xcnt; 36 + }; 37 + 38 + extern struct test_args test_args; 39 + extern struct kvm_vcpu *vcpus[]; 40 + extern struct test_vcpu_shared_data vcpu_shared_data[]; 41 + 42 + struct kvm_vm *test_vm_create(void); 43 + void test_vm_cleanup(struct kvm_vm *vm); 44 + 45 + #endif /* SELFTEST_KVM_TIMER_TEST_H */
+23
tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef SELFTEST_KVM_UTIL_ARCH_H 3 + #define SELFTEST_KVM_UTIL_ARCH_H 4 + 5 + #include <stdbool.h> 6 + #include <stdint.h> 7 + 8 + struct kvm_vm_arch { 9 + uint64_t c_bit; 10 + uint64_t s_bit; 11 + int sev_fd; 12 + bool is_pt_protected; 13 + }; 14 + 15 + static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) 16 + { 17 + return arch->c_bit || arch->s_bit; 18 + } 19 + 20 + #define vm_arch_has_protected_memory(vm) \ 21 + __vm_arch_has_protected_memory(&(vm)->arch) 22 + 23 + #endif // SELFTEST_KVM_UTIL_ARCH_H
+97
tools/testing/selftests/kvm/include/x86_64/pmu.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (C) 2023, Tencent, Inc. 4 + */ 5 + #ifndef SELFTEST_KVM_PMU_H 6 + #define SELFTEST_KVM_PMU_H 7 + 8 + #include <stdint.h> 9 + 10 + #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 11 + 12 + /* 13 + * Encode an eventsel+umask pair into event-select MSR format. Note, this is 14 + * technically AMD's format, as Intel's format only supports 8 bits for the 15 + * event selector, i.e. doesn't use bits 24:16 for the selector. But, OR-ing 16 + * in '0' is a nop and won't clobber the CMASK. 17 + */ 18 + #define RAW_EVENT(eventsel, umask) (((eventsel & 0xf00UL) << 24) | \ 19 + ((eventsel) & 0xff) | \ 20 + ((umask) & 0xff) << 8) 21 + 22 + /* 23 + * These are technically Intel's definitions, but except for CMASK (see above), 24 + * AMD's layout is compatible with Intel's. 25 + */ 26 + #define ARCH_PERFMON_EVENTSEL_EVENT GENMASK_ULL(7, 0) 27 + #define ARCH_PERFMON_EVENTSEL_UMASK GENMASK_ULL(15, 8) 28 + #define ARCH_PERFMON_EVENTSEL_USR BIT_ULL(16) 29 + #define ARCH_PERFMON_EVENTSEL_OS BIT_ULL(17) 30 + #define ARCH_PERFMON_EVENTSEL_EDGE BIT_ULL(18) 31 + #define ARCH_PERFMON_EVENTSEL_PIN_CONTROL BIT_ULL(19) 32 + #define ARCH_PERFMON_EVENTSEL_INT BIT_ULL(20) 33 + #define ARCH_PERFMON_EVENTSEL_ANY BIT_ULL(21) 34 + #define ARCH_PERFMON_EVENTSEL_ENABLE BIT_ULL(22) 35 + #define ARCH_PERFMON_EVENTSEL_INV BIT_ULL(23) 36 + #define ARCH_PERFMON_EVENTSEL_CMASK GENMASK_ULL(31, 24) 37 + 38 + /* RDPMC control flags, Intel only. */ 39 + #define INTEL_RDPMC_METRICS BIT_ULL(29) 40 + #define INTEL_RDPMC_FIXED BIT_ULL(30) 41 + #define INTEL_RDPMC_FAST BIT_ULL(31) 42 + 43 + /* Fixed PMC controls, Intel only. 
*/ 44 + #define FIXED_PMC_GLOBAL_CTRL_ENABLE(_idx) BIT_ULL((32 + (_idx))) 45 + 46 + #define FIXED_PMC_KERNEL BIT_ULL(0) 47 + #define FIXED_PMC_USER BIT_ULL(1) 48 + #define FIXED_PMC_ANYTHREAD BIT_ULL(2) 49 + #define FIXED_PMC_ENABLE_PMI BIT_ULL(3) 50 + #define FIXED_PMC_NR_BITS 4 51 + #define FIXED_PMC_CTRL(_idx, _val) ((_val) << ((_idx) * FIXED_PMC_NR_BITS)) 52 + 53 + #define PMU_CAP_FW_WRITES BIT_ULL(13) 54 + #define PMU_CAP_LBR_FMT 0x3f 55 + 56 + #define INTEL_ARCH_CPU_CYCLES RAW_EVENT(0x3c, 0x00) 57 + #define INTEL_ARCH_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) 58 + #define INTEL_ARCH_REFERENCE_CYCLES RAW_EVENT(0x3c, 0x01) 59 + #define INTEL_ARCH_LLC_REFERENCES RAW_EVENT(0x2e, 0x4f) 60 + #define INTEL_ARCH_LLC_MISSES RAW_EVENT(0x2e, 0x41) 61 + #define INTEL_ARCH_BRANCHES_RETIRED RAW_EVENT(0xc4, 0x00) 62 + #define INTEL_ARCH_BRANCHES_MISPREDICTED RAW_EVENT(0xc5, 0x00) 63 + #define INTEL_ARCH_TOPDOWN_SLOTS RAW_EVENT(0xa4, 0x01) 64 + 65 + #define AMD_ZEN_CORE_CYCLES RAW_EVENT(0x76, 0x00) 66 + #define AMD_ZEN_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) 67 + #define AMD_ZEN_BRANCHES_RETIRED RAW_EVENT(0xc2, 0x00) 68 + #define AMD_ZEN_BRANCHES_MISPREDICTED RAW_EVENT(0xc3, 0x00) 69 + 70 + /* 71 + * Note! The order and thus the index of the architectural events matters as 72 + * support for each event is enumerated via CPUID using the index of the event. 
73 + */ 74 + enum intel_pmu_architectural_events { 75 + INTEL_ARCH_CPU_CYCLES_INDEX, 76 + INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX, 77 + INTEL_ARCH_REFERENCE_CYCLES_INDEX, 78 + INTEL_ARCH_LLC_REFERENCES_INDEX, 79 + INTEL_ARCH_LLC_MISSES_INDEX, 80 + INTEL_ARCH_BRANCHES_RETIRED_INDEX, 81 + INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX, 82 + INTEL_ARCH_TOPDOWN_SLOTS_INDEX, 83 + NR_INTEL_ARCH_EVENTS, 84 + }; 85 + 86 + enum amd_pmu_zen_events { 87 + AMD_ZEN_CORE_CYCLES_INDEX, 88 + AMD_ZEN_INSTRUCTIONS_INDEX, 89 + AMD_ZEN_BRANCHES_INDEX, 90 + AMD_ZEN_BRANCH_MISSES_INDEX, 91 + NR_AMD_ZEN_EVENTS, 92 + }; 93 + 94 + extern const uint64_t intel_pmu_arch_events[]; 95 + extern const uint64_t amd_pmu_zen_events[]; 96 + 97 + #endif /* SELFTEST_KVM_PMU_H */
+124 -32
tools/testing/selftests/kvm/include/x86_64/processor.h
··· 23 23 extern bool host_cpu_is_intel; 24 24 extern bool host_cpu_is_amd; 25 25 26 + enum vm_guest_x86_subtype { 27 + VM_SUBTYPE_NONE = 0, 28 + VM_SUBTYPE_SEV, 29 + VM_SUBTYPE_SEV_ES, 30 + }; 31 + 32 + /* Forced emulation prefix, used to invoke the emulator unconditionally. */ 33 + #define KVM_FEP "ud2; .byte 'k', 'v', 'm';" 34 + 26 35 #define NMI_VECTOR 0x02 27 36 28 37 #define X86_EFLAGS_FIXED (1u << 1) ··· 282 273 #define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) 283 274 #define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) 284 275 #define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) 276 + #define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) 285 277 #define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) 286 278 287 279 #define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) ··· 292 282 * that indicates the feature is _not_ supported, and a property that states 293 283 * the length of the bit mask of unsupported features. A feature is supported 294 284 * if the size of the bit mask is larger than the "unavailable" bit, and said 295 - * bit is not set. 285 + * bit is not set. Fixed counters also have bizarre enumeration, but inverted from 286 + * arch events for general purpose counters. Fixed counters are supported if a 287 + * feature flag is set **OR** the total number of fixed counters is greater 288 + * than the index of the counter. 296 289 * 297 - * Wrap the "unavailable" feature to simplify checking whether or not a given 298 - * architectural event is supported. 290 + * Wrap the events for general purpose and fixed counters to simplify checking 291 + * whether or not a given architectural event is supported. 
299 292 */ 300 293 struct kvm_x86_pmu_feature { 301 - struct kvm_x86_cpu_feature anti_feature; 294 + struct kvm_x86_cpu_feature f; 302 295 }; 303 - #define KVM_X86_PMU_FEATURE(name, __bit) \ 304 - ({ \ 305 - struct kvm_x86_pmu_feature feature = { \ 306 - .anti_feature = KVM_X86_CPU_FEATURE(0xa, 0, EBX, __bit), \ 307 - }; \ 308 - \ 309 - feature; \ 296 + #define KVM_X86_PMU_FEATURE(__reg, __bit) \ 297 + ({ \ 298 + struct kvm_x86_pmu_feature feature = { \ 299 + .f = KVM_X86_CPU_FEATURE(0xa, 0, __reg, __bit), \ 300 + }; \ 301 + \ 302 + kvm_static_assert(KVM_CPUID_##__reg == KVM_CPUID_EBX || \ 303 + KVM_CPUID_##__reg == KVM_CPUID_ECX); \ 304 + feature; \ 310 305 }) 311 306 312 - #define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(BRANCH_INSNS_RETIRED, 5) 307 + #define X86_PMU_FEATURE_CPU_CYCLES KVM_X86_PMU_FEATURE(EBX, 0) 308 + #define X86_PMU_FEATURE_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 1) 309 + #define X86_PMU_FEATURE_REFERENCE_CYCLES KVM_X86_PMU_FEATURE(EBX, 2) 310 + #define X86_PMU_FEATURE_LLC_REFERENCES KVM_X86_PMU_FEATURE(EBX, 3) 311 + #define X86_PMU_FEATURE_LLC_MISSES KVM_X86_PMU_FEATURE(EBX, 4) 312 + #define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 5) 313 + #define X86_PMU_FEATURE_BRANCHES_MISPREDICTED KVM_X86_PMU_FEATURE(EBX, 6) 314 + #define X86_PMU_FEATURE_TOPDOWN_SLOTS KVM_X86_PMU_FEATURE(EBX, 7) 315 + 316 + #define X86_PMU_FEATURE_INSNS_RETIRED_FIXED KVM_X86_PMU_FEATURE(ECX, 0) 317 + #define X86_PMU_FEATURE_CPU_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 1) 318 + #define X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 2) 319 + #define X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED KVM_X86_PMU_FEATURE(ECX, 3) 313 320 314 321 static inline unsigned int x86_family(unsigned int eax) 315 322 { ··· 725 698 726 699 static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature) 727 700 { 728 - uint32_t nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); 701 + uint32_t nr_bits; 729 702 730 - return 
nr_bits > feature.anti_feature.bit && 731 - !this_cpu_has(feature.anti_feature); 703 + if (feature.f.reg == KVM_CPUID_EBX) { 704 + nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); 705 + return nr_bits > feature.f.bit && !this_cpu_has(feature.f); 706 + } 707 + 708 + GUEST_ASSERT(feature.f.reg == KVM_CPUID_ECX); 709 + nr_bits = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); 710 + return nr_bits > feature.f.bit || this_cpu_has(feature.f); 732 711 } 733 712 734 713 static __always_inline uint64_t this_cpu_supported_xcr0(void) ··· 950 917 951 918 static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature) 952 919 { 953 - uint32_t nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); 920 + uint32_t nr_bits; 954 921 955 - return nr_bits > feature.anti_feature.bit && 956 - !kvm_cpu_has(feature.anti_feature); 922 + if (feature.f.reg == KVM_CPUID_EBX) { 923 + nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); 924 + return nr_bits > feature.f.bit && !kvm_cpu_has(feature.f); 925 + } 926 + 927 + TEST_ASSERT_EQ(feature.f.reg, KVM_CPUID_ECX); 928 + nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); 929 + return nr_bits > feature.f.bit || kvm_cpu_has(feature.f); 957 930 } 958 931 959 932 static __always_inline uint64_t kvm_cpu_supported_xcr0(void) ··· 1034 995 vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); 1035 996 } 1036 997 1037 - void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); 998 + void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, 999 + struct kvm_x86_cpu_property property, 1000 + uint32_t value); 1038 1001 1039 1002 void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); 1040 1003 void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, ··· 1100 1059 } while (0) 1101 1060 1102 1061 void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); 1062 + void kvm_init_vm_address_properties(struct kvm_vm *vm); 1103 1063 bool 
vm_is_unrestricted_guest(struct kvm_vm *vm); 1104 1064 1105 1065 struct ex_regs { ··· 1162 1120 * r9 = exception vector (non-zero) 1163 1121 * r10 = error code 1164 1122 */ 1165 - #define KVM_ASM_SAFE(insn) \ 1123 + #define __KVM_ASM_SAFE(insn, fep) \ 1166 1124 "mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \ 1167 1125 "lea 1f(%%rip), %%r10\n\t" \ 1168 1126 "lea 2f(%%rip), %%r11\n\t" \ 1169 - "1: " insn "\n\t" \ 1127 + fep "1: " insn "\n\t" \ 1170 1128 "xor %%r9, %%r9\n\t" \ 1171 1129 "2:\n\t" \ 1172 1130 "mov %%r9b, %[vector]\n\t" \ 1173 1131 "mov %%r10, %[error_code]\n\t" 1132 + 1133 + #define KVM_ASM_SAFE(insn) __KVM_ASM_SAFE(insn, "") 1134 + #define KVM_ASM_SAFE_FEP(insn) __KVM_ASM_SAFE(insn, KVM_FEP) 1174 1135 1175 1136 #define KVM_ASM_SAFE_OUTPUTS(v, ec) [vector] "=qm"(v), [error_code] "=rm"(ec) 1176 1137 #define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11" ··· 1201 1156 vector; \ 1202 1157 }) 1203 1158 1204 - static inline uint8_t rdmsr_safe(uint32_t msr, uint64_t *val) 1205 - { 1206 - uint64_t error_code; 1207 - uint8_t vector; 1208 - uint32_t a, d; 1159 + #define kvm_asm_safe_fep(insn, inputs...) \ 1160 + ({ \ 1161 + uint64_t ign_error_code; \ 1162 + uint8_t vector; \ 1163 + \ 1164 + asm volatile(KVM_ASM_SAFE_FEP(insn) \ 1165 + : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ 1166 + : inputs \ 1167 + : KVM_ASM_SAFE_CLOBBERS); \ 1168 + vector; \ 1169 + }) 1209 1170 1210 - asm volatile(KVM_ASM_SAFE("rdmsr") 1211 - : "=a"(a), "=d"(d), KVM_ASM_SAFE_OUTPUTS(vector, error_code) 1212 - : "c"(msr) 1213 - : KVM_ASM_SAFE_CLOBBERS); 1171 + #define kvm_asm_safe_ec_fep(insn, error_code, inputs...) 
\ 1172 + ({ \ 1173 + uint8_t vector; \ 1174 + \ 1175 + asm volatile(KVM_ASM_SAFE_FEP(insn) \ 1176 + : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ 1177 + : inputs \ 1178 + : KVM_ASM_SAFE_CLOBBERS); \ 1179 + vector; \ 1180 + }) 1214 1181 1215 - *val = (uint64_t)a | ((uint64_t)d << 32); 1216 - return vector; 1182 + #define BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ 1183 + static inline uint8_t insn##_safe ##_fep(uint32_t idx, uint64_t *val) \ 1184 + { \ 1185 + uint64_t error_code; \ 1186 + uint8_t vector; \ 1187 + uint32_t a, d; \ 1188 + \ 1189 + asm volatile(KVM_ASM_SAFE##_FEP(#insn) \ 1190 + : "=a"(a), "=d"(d), \ 1191 + KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ 1192 + : "c"(idx) \ 1193 + : KVM_ASM_SAFE_CLOBBERS); \ 1194 + \ 1195 + *val = (uint64_t)a | ((uint64_t)d << 32); \ 1196 + return vector; \ 1217 1197 } 1198 + 1199 + /* 1200 + * Generate {insn}_safe() and {insn}_safe_fep() helpers for instructions that 1201 + * use ECX as an input index, and EDX:EAX as a 64-bit output. 1202 + */ 1203 + #define BUILD_READ_U64_SAFE_HELPERS(insn) \ 1204 + BUILD_READ_U64_SAFE_HELPER(insn, , ) \ 1205 + BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ 1206 + 1207 + BUILD_READ_U64_SAFE_HELPERS(rdmsr) 1208 + BUILD_READ_U64_SAFE_HELPERS(rdpmc) 1209 + BUILD_READ_U64_SAFE_HELPERS(xgetbv) 1218 1210 1219 1211 static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) 1220 1212 { ··· 1267 1185 } 1268 1186 1269 1187 bool kvm_is_tdp_enabled(void); 1188 + 1189 + static inline bool kvm_is_pmu_enabled(void) 1190 + { 1191 + return get_kvm_param_bool("enable_pmu"); 1192 + } 1193 + 1194 + static inline bool kvm_is_forced_emulation_enabled(void) 1195 + { 1196 + return !!get_kvm_param_integer("force_emulation_prefix"); 1197 + } 1270 1198 1271 1199 uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, 1272 1200 int *level);
+107
tools/testing/selftests/kvm/include/x86_64/sev.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Helpers used for SEV guests 4 + * 5 + */ 6 + #ifndef SELFTEST_KVM_SEV_H 7 + #define SELFTEST_KVM_SEV_H 8 + 9 + #include <stdint.h> 10 + #include <stdbool.h> 11 + 12 + #include "linux/psp-sev.h" 13 + 14 + #include "kvm_util.h" 15 + #include "svm_util.h" 16 + #include "processor.h" 17 + 18 + enum sev_guest_state { 19 + SEV_GUEST_STATE_UNINITIALIZED = 0, 20 + SEV_GUEST_STATE_LAUNCH_UPDATE, 21 + SEV_GUEST_STATE_LAUNCH_SECRET, 22 + SEV_GUEST_STATE_RUNNING, 23 + }; 24 + 25 + #define SEV_POLICY_NO_DBG (1UL << 0) 26 + #define SEV_POLICY_ES (1UL << 2) 27 + 28 + #define GHCB_MSR_TERM_REQ 0x100 29 + 30 + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); 31 + void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); 32 + void sev_vm_launch_finish(struct kvm_vm *vm); 33 + 34 + struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, 35 + struct kvm_vcpu **cpu); 36 + 37 + kvm_static_assert(SEV_RET_SUCCESS == 0); 38 + 39 + /* 40 + * The KVM_MEMORY_ENCRYPT_OP uAPI is utter garbage and takes an "unsigned long" 41 + * instead of a proper struct. The size of the parameter is embedded in the 42 + * ioctl number, i.e. is ABI and thus immutable. Hack around the mess by 43 + * creating an overlay to pass in an "unsigned long" without a cast (casting 44 + * will make the compiler unhappy due to dereferencing an aliased pointer). 
45 + */ 46 + #define __vm_sev_ioctl(vm, cmd, arg) \ 47 + ({ \ 48 + int r; \ 49 + \ 50 + union { \ 51 + struct kvm_sev_cmd c; \ 52 + unsigned long raw; \ 53 + } sev_cmd = { .c = { \ 54 + .id = (cmd), \ 55 + .data = (uint64_t)(arg), \ 56 + .sev_fd = (vm)->arch.sev_fd, \ 57 + } }; \ 58 + \ 59 + r = __vm_ioctl(vm, KVM_MEMORY_ENCRYPT_OP, &sev_cmd.raw); \ 60 + r ?: sev_cmd.c.error; \ 61 + }) 62 + 63 + #define vm_sev_ioctl(vm, cmd, arg) \ 64 + ({ \ 65 + int ret = __vm_sev_ioctl(vm, cmd, arg); \ 66 + \ 67 + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ 68 + }) 69 + 70 + static inline void sev_vm_init(struct kvm_vm *vm) 71 + { 72 + vm->arch.sev_fd = open_sev_dev_path_or_exit(); 73 + 74 + vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); 75 + } 76 + 77 + 78 + static inline void sev_es_vm_init(struct kvm_vm *vm) 79 + { 80 + vm->arch.sev_fd = open_sev_dev_path_or_exit(); 81 + 82 + vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); 83 + } 84 + 85 + static inline void sev_register_encrypted_memory(struct kvm_vm *vm, 86 + struct userspace_mem_region *region) 87 + { 88 + struct kvm_enc_region range = { 89 + .addr = region->region.userspace_addr, 90 + .size = region->region.memory_size, 91 + }; 92 + 93 + vm_ioctl(vm, KVM_MEMORY_ENCRYPT_REG_REGION, &range); 94 + } 95 + 96 + static inline void sev_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa, 97 + uint64_t size) 98 + { 99 + struct kvm_sev_launch_update_data update_data = { 100 + .uaddr = (unsigned long)addr_gpa2hva(vm, gpa), 101 + .len = size, 102 + }; 103 + 104 + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_DATA, &update_data); 105 + } 106 + 107 + #endif /* SELFTEST_KVM_SEV_H */
+18 -6
tools/testing/selftests/kvm/lib/aarch64/processor.c
··· 365 365 indent, "", pstate, pc); 366 366 } 367 367 368 - struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 369 - struct kvm_vcpu_init *init, void *guest_code) 368 + void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) 369 + { 370 + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); 371 + } 372 + 373 + static struct kvm_vcpu *__aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 374 + struct kvm_vcpu_init *init) 370 375 { 371 376 size_t stack_size; 372 377 uint64_t stack_vaddr; ··· 386 381 aarch64_vcpu_setup(vcpu, init); 387 382 388 383 vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); 389 - vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); 384 + return vcpu; 385 + } 386 + 387 + struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 388 + struct kvm_vcpu_init *init, void *guest_code) 389 + { 390 + struct kvm_vcpu *vcpu = __aarch64_vcpu_add(vm, vcpu_id, init); 391 + 392 + vcpu_arch_set_entry_point(vcpu, guest_code); 390 393 391 394 return vcpu; 392 395 } 393 396 394 - struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 395 - void *guest_code) 397 + struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) 396 398 { 397 - return aarch64_vcpu_add(vm, vcpu_id, NULL, guest_code); 399 + return __aarch64_vcpu_add(vm, vcpu_id, NULL); 398 400 } 399 401 400 402 void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
+114 -15
tools/testing/selftests/kvm/lib/kvm_util.c
··· 52 52 return _open_kvm_dev_path_or_exit(O_RDONLY); 53 53 } 54 54 55 - static bool get_module_param_bool(const char *module_name, const char *param) 55 + static ssize_t get_module_param(const char *module_name, const char *param, 56 + void *buffer, size_t buffer_size) 56 57 { 57 58 const int path_size = 128; 58 59 char path[path_size]; 59 - char value; 60 - ssize_t r; 61 - int fd; 60 + ssize_t bytes_read; 61 + int fd, r; 62 62 63 63 r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", 64 64 module_name, param); ··· 67 67 68 68 fd = open_path_or_exit(path, O_RDONLY); 69 69 70 - r = read(fd, &value, 1); 71 - TEST_ASSERT(r == 1, "read(%s) failed", path); 70 + bytes_read = read(fd, buffer, buffer_size); 71 + TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes", 72 + path, bytes_read, buffer_size); 72 73 73 74 r = close(fd); 74 75 TEST_ASSERT(!r, "close(%s) failed", path); 76 + return bytes_read; 77 + } 78 + 79 + static int get_module_param_integer(const char *module_name, const char *param) 80 + { 81 + /* 82 + * 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the 83 + * NUL char, and 1 byte because the kernel sucks and inserts a newline 84 + * at the end. 85 + */ 86 + char value[16 + 1 + 1]; 87 + ssize_t r; 88 + 89 + memset(value, '\0', sizeof(value)); 90 + 91 + r = get_module_param(module_name, param, value, sizeof(value)); 92 + TEST_ASSERT(value[r - 1] == '\n', 93 + "Expected trailing newline, got char '%c'", value[r - 1]); 94 + 95 + /* 96 + * Squash the newline, otherwise atoi_paranoid() will complain about 97 + * trailing non-NUL characters in the string. 
98 + */ 99 + value[r - 1] = '\0'; 100 + return atoi_paranoid(value); 101 + } 102 + 103 + static bool get_module_param_bool(const char *module_name, const char *param) 104 + { 105 + char value; 106 + ssize_t r; 107 + 108 + r = get_module_param(module_name, param, &value, sizeof(value)); 109 + TEST_ASSERT_EQ(r, 1); 75 110 76 111 if (value == 'Y') 77 112 return true; ··· 129 94 bool get_kvm_amd_param_bool(const char *param) 130 95 { 131 96 return get_module_param_bool("kvm_amd", param); 97 + } 98 + 99 + int get_kvm_param_integer(const char *param) 100 + { 101 + return get_module_param_integer("kvm", param); 102 + } 103 + 104 + int get_kvm_intel_param_integer(const char *param) 105 + { 106 + return get_module_param_integer("kvm_intel", param); 107 + } 108 + 109 + int get_kvm_amd_param_integer(const char *param) 110 + { 111 + return get_module_param_integer("kvm_amd", param); 132 112 } 133 113 134 114 /* ··· 276 226 277 227 vm->mode = shape.mode; 278 228 vm->type = shape.type; 229 + vm->subtype = shape.subtype; 279 230 280 231 vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; 281 232 vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; ··· 317 266 case VM_MODE_PXXV48_4K: 318 267 #ifdef __x86_64__ 319 268 kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); 269 + kvm_init_vm_address_properties(vm); 320 270 /* 321 271 * Ignore KVM support for 5-level paging (vm->va_bits == 57), 322 272 * it doesn't take effect unless a CR4.LA57 is set, which it ··· 718 666 vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region); 719 667 720 668 sparsebit_free(&region->unused_phy_pages); 669 + sparsebit_free(&region->protected_phy_pages); 721 670 ret = munmap(region->mmap_start, region->mmap_size); 722 671 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 723 672 if (region->fd >= 0) { ··· 1100 1047 } 1101 1048 1102 1049 region->unused_phy_pages = sparsebit_alloc(); 1050 + if (vm_arch_has_protected_memory(vm)) 1051 + region->protected_phy_pages = sparsebit_alloc(); 1103 
1052 sparsebit_set_num(region->unused_phy_pages, 1104 1053 guest_paddr >> vm->page_shift, npages); 1105 1054 region->region.slot = slot; ··· 1432 1377 return pgidx_start * vm->page_size; 1433 1378 } 1434 1379 1435 - vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, 1436 - enum kvm_mem_region_type type) 1380 + static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, 1381 + vm_vaddr_t vaddr_min, 1382 + enum kvm_mem_region_type type, 1383 + bool protected) 1437 1384 { 1438 1385 uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); 1439 1386 1440 1387 virt_pgd_alloc(vm); 1441 - vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, 1442 - KVM_UTIL_MIN_PFN * vm->page_size, 1443 - vm->memslots[type]); 1388 + vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages, 1389 + KVM_UTIL_MIN_PFN * vm->page_size, 1390 + vm->memslots[type], protected); 1444 1391 1445 1392 /* 1446 1393 * Find an unused range of virtual page addresses of at least ··· 1460 1403 } 1461 1404 1462 1405 return vaddr_start; 1406 + } 1407 + 1408 + vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, 1409 + enum kvm_mem_region_type type) 1410 + { 1411 + return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, 1412 + vm_arch_has_protected_memory(vm)); 1413 + } 1414 + 1415 + vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, 1416 + vm_vaddr_t vaddr_min, 1417 + enum kvm_mem_region_type type) 1418 + { 1419 + return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false); 1463 1420 } 1464 1421 1465 1422 /* ··· 1597 1526 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) 1598 1527 { 1599 1528 struct userspace_mem_region *region; 1529 + 1530 + gpa = vm_untag_gpa(vm, gpa); 1600 1531 1601 1532 region = userspace_mem_region_find(vm, gpa, gpa); 1602 1533 if (!region) { ··· 1946 1873 region->host_mem); 1947 1874 fprintf(stream, "%*sunused_phy_pages: ", indent + 2, ""); 1948 1875 sparsebit_dump(stream, region->unused_phy_pages, 0); 1876 + if 
(region->protected_phy_pages) { 1877 + fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, ""); 1878 + sparsebit_dump(stream, region->protected_phy_pages, 0); 1879 + } 1949 1880 } 1950 1881 fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); 1951 1882 sparsebit_dump(stream, vm->vpages_mapped, indent + 2); ··· 2051 1974 * num - number of pages 2052 1975 * paddr_min - Physical address minimum 2053 1976 * memslot - Memory region to allocate page from 1977 + * protected - True if the pages will be used as protected/private memory 2054 1978 * 2055 1979 * Output Args: None 2056 1980 * ··· 2063 1985 * and their base address is returned. A TEST_ASSERT failure occurs if 2064 1986 * not enough pages are available at or above paddr_min. 2065 1987 */ 2066 - vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, 2067 - vm_paddr_t paddr_min, uint32_t memslot) 1988 + vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, 1989 + vm_paddr_t paddr_min, uint32_t memslot, 1990 + bool protected) 2068 1991 { 2069 1992 struct userspace_mem_region *region; 2070 1993 sparsebit_idx_t pg, base; ··· 2078 1999 paddr_min, vm->page_size); 2079 2000 2080 2001 region = memslot2region(vm, memslot); 2081 - base = pg = paddr_min >> vm->page_shift; 2002 + TEST_ASSERT(!protected || region->protected_phy_pages, 2003 + "Region doesn't support protected memory"); 2082 2004 2005 + base = pg = paddr_min >> vm->page_shift; 2083 2006 do { 2084 2007 for (; pg < base + num; ++pg) { 2085 2008 if (!sparsebit_is_set(region->unused_phy_pages, pg)) { ··· 2100 2019 abort(); 2101 2020 } 2102 2021 2103 - for (pg = base; pg < base + num; ++pg) 2022 + for (pg = base; pg < base + num; ++pg) { 2104 2023 sparsebit_clear(region->unused_phy_pages, pg); 2024 + if (protected) 2025 + sparsebit_set(region->protected_phy_pages, pg); 2026 + } 2105 2027 2106 2028 return base * vm->page_size; 2107 2029 } ··· 2307 2223 setbuf(stdout, NULL); 2308 2224 2309 2225 kvm_selftest_arch_init(); 2226 + } 2227 + 2228 + 
bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr) 2229 + { 2230 + sparsebit_idx_t pg = 0; 2231 + struct userspace_mem_region *region; 2232 + 2233 + if (!vm_arch_has_protected_memory(vm)) 2234 + return false; 2235 + 2236 + region = userspace_mem_region_find(vm, paddr, paddr); 2237 + TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr); 2238 + 2239 + pg = paddr >> vm->page_shift; 2240 + return sparsebit_is_set(region->protected_phy_pages, pg); 2310 2241 }
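The kvm_util.c hunks above track guest-private pages in a per-region `protected_phy_pages` sparsebit alongside the existing `unused_phy_pages`: `__vm_phy_pages_alloc()` clears a page from the free set and, when allocating protected memory, records it in the protected set, which `vm_is_gpa_protected()` later queries. A minimal sketch of that bookkeeping, using a plain 64-bit bitmap as a stand-in for the selftests' sparsebit (the `toy_*` names are illustrative, not from the series):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Toy stand-in for the selftests' sparsebit: one 64-page region. */
struct toy_region {
	uint64_t unused_phy_pages;	/* bit set => page is free */
	uint64_t protected_phy_pages;	/* bit set => page is guest-private */
};

/*
 * Mirrors the __vm_phy_pages_alloc() bookkeeping from the diff: clear
 * pages from the free set and, when @protected, record them in the
 * protected set so later launch/encrypt steps can find them.
 */
static void toy_alloc_pages(struct toy_region *r, unsigned int base,
			    unsigned int num, bool protected)
{
	for (unsigned int pg = base; pg < base + num; pg++) {
		assert(r->unused_phy_pages & (1ull << pg));
		r->unused_phy_pages &= ~(1ull << pg);
		if (protected)
			r->protected_phy_pages |= 1ull << pg;
	}
}

/* Mirrors vm_is_gpa_protected(), at page granularity. */
static bool toy_is_page_protected(const struct toy_region *r, unsigned int pg)
{
	return (r->protected_phy_pages >> pg) & 1;
}
```

The same shape explains why `__vm_vaddr_alloc()` threads a `protected` flag down: a shared allocation (`vm_vaddr_alloc_shared()`) simply passes `false` so the pages never land in the protected set.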
+101
tools/testing/selftests/kvm/lib/riscv/handlers.S
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2023 Intel Corporation 4 + */ 5 + 6 + #ifndef __ASSEMBLY__ 7 + #define __ASSEMBLY__ 8 + #endif 9 + 10 + #include <asm/csr.h> 11 + 12 + .macro save_context 13 + addi sp, sp, (-8*34) 14 + sd x1, 0(sp) 15 + sd x2, 8(sp) 16 + sd x3, 16(sp) 17 + sd x4, 24(sp) 18 + sd x5, 32(sp) 19 + sd x6, 40(sp) 20 + sd x7, 48(sp) 21 + sd x8, 56(sp) 22 + sd x9, 64(sp) 23 + sd x10, 72(sp) 24 + sd x11, 80(sp) 25 + sd x12, 88(sp) 26 + sd x13, 96(sp) 27 + sd x14, 104(sp) 28 + sd x15, 112(sp) 29 + sd x16, 120(sp) 30 + sd x17, 128(sp) 31 + sd x18, 136(sp) 32 + sd x19, 144(sp) 33 + sd x20, 152(sp) 34 + sd x21, 160(sp) 35 + sd x22, 168(sp) 36 + sd x23, 176(sp) 37 + sd x24, 184(sp) 38 + sd x25, 192(sp) 39 + sd x26, 200(sp) 40 + sd x27, 208(sp) 41 + sd x28, 216(sp) 42 + sd x29, 224(sp) 43 + sd x30, 232(sp) 44 + sd x31, 240(sp) 45 + csrr s0, CSR_SEPC 46 + csrr s1, CSR_SSTATUS 47 + csrr s2, CSR_SCAUSE 48 + sd s0, 248(sp) 49 + sd s1, 256(sp) 50 + sd s2, 264(sp) 51 + .endm 52 + 53 + .macro restore_context 54 + ld s2, 264(sp) 55 + ld s1, 256(sp) 56 + ld s0, 248(sp) 57 + csrw CSR_SCAUSE, s2 58 + csrw CSR_SSTATUS, s1 59 + csrw CSR_SEPC, s0 60 + ld x31, 240(sp) 61 + ld x30, 232(sp) 62 + ld x29, 224(sp) 63 + ld x28, 216(sp) 64 + ld x27, 208(sp) 65 + ld x26, 200(sp) 66 + ld x25, 192(sp) 67 + ld x24, 184(sp) 68 + ld x23, 176(sp) 69 + ld x22, 168(sp) 70 + ld x21, 160(sp) 71 + ld x20, 152(sp) 72 + ld x19, 144(sp) 73 + ld x18, 136(sp) 74 + ld x17, 128(sp) 75 + ld x16, 120(sp) 76 + ld x15, 112(sp) 77 + ld x14, 104(sp) 78 + ld x13, 96(sp) 79 + ld x12, 88(sp) 80 + ld x11, 80(sp) 81 + ld x10, 72(sp) 82 + ld x9, 64(sp) 83 + ld x8, 56(sp) 84 + ld x7, 48(sp) 85 + ld x6, 40(sp) 86 + ld x5, 32(sp) 87 + ld x4, 24(sp) 88 + ld x3, 16(sp) 89 + ld x2, 8(sp) 90 + ld x1, 0(sp) 91 + addi sp, sp, (8*34) 92 + .endm 93 + 94 + .balign 4 95 + .global exception_vectors 96 + exception_vectors: 97 + save_context 98 + move a0, sp 99 + call route_exception 100 + 
restore_context 101 + sret
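The `save_context` macro above builds a 34-slot stack frame: x1..x31 at offsets 0..240, then SEPC, SSTATUS, and SCAUSE at 248, 256, and 264, matching the `addi sp, sp, (-8*34)` adjustment. A C mirror of that layout, with compile-time checks against the assembly offsets (field names here are illustrative; the real definition is the selftests' `struct ex_regs`):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/*
 * C mirror of the frame built by save_context: 31 GPR slots (x1..x31)
 * followed by the three saved CSRs.  34 slots * 8 bytes matches the
 * "addi sp, sp, (-8*34)" in the assembly.
 */
struct toy_ex_regs {
	uint64_t gprs[31];	/* x1 .. x31 at offsets 0 .. 240 */
	uint64_t sepc;		/* offset 248, saved from CSR_SEPC */
	uint64_t sstatus;	/* offset 256, saved from CSR_SSTATUS */
	uint64_t scause;	/* offset 264, saved from CSR_SCAUSE */
};

_Static_assert(sizeof(struct toy_ex_regs) == 8 * 34,
	       "frame size must match the assembly");
_Static_assert(offsetof(struct toy_ex_regs, sepc) == 248, "sepc slot");
_Static_assert(offsetof(struct toy_ex_regs, sstatus) == 256, "sstatus slot");
_Static_assert(offsetof(struct toy_ex_regs, scause) == 264, "scause slot");
```

Because `route_exception` receives `sp` in `a0` (the `move a0, sp` before the call), the handler sees this frame as its `struct ex_regs *` argument.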
+93 -3
tools/testing/selftests/kvm/lib/riscv/processor.c
··· 13 13 14 14 #define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN 0xac0000 15 15 16 + static vm_vaddr_t exception_handlers; 17 + 18 + bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) 19 + { 20 + unsigned long value = 0; 21 + int ret; 22 + 23 + ret = __vcpu_get_reg(vcpu, ext, &value); 24 + 25 + return !ret && !!value; 26 + } 27 + 16 28 static uint64_t page_align(struct kvm_vm *vm, uint64_t v) 17 29 { 18 30 return (v + vm->page_size) & ~(vm->page_size - 1); ··· 289 277 0, 0, 0, 0, 0, 0); 290 278 } 291 279 292 - struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 293 - void *guest_code) 280 + void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) 281 + { 282 + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code); 283 + } 284 + 285 + struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) 294 286 { 295 287 int r; 296 288 size_t stack_size; ··· 328 312 329 313 /* Setup stack pointer and program counter of guest */ 330 314 vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.sp), stack_vaddr + stack_size); 331 - vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code); 315 + 316 + /* Setup sscratch for guest_get_vcpuid() */ 317 + vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(sscratch), vcpu_id); 332 318 333 319 /* Setup default exception vector of guest */ 334 320 vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(stvec), (unsigned long)guest_unexp_trap); ··· 382 364 va_end(ap); 383 365 } 384 366 367 + void kvm_exit_unexpected_exception(int vector, int ec) 368 + { 369 + ucall(UCALL_UNHANDLED, 2, vector, ec); 370 + } 371 + 385 372 void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) 386 373 { 374 + struct ucall uc; 375 + 376 + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) { 377 + TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", 378 + uc.args[0], uc.args[1]); 379 + } 380 + } 381 + 382 + struct handlers { 383 + exception_handler_fn exception_handlers[NR_VECTORS][NR_EXCEPTIONS]; 384 + }; 385 + 386 + void 
route_exception(struct ex_regs *regs) 387 + { 388 + struct handlers *handlers = (struct handlers *)exception_handlers; 389 + int vector = 0, ec; 390 + 391 + ec = regs->cause & ~CAUSE_IRQ_FLAG; 392 + if (ec >= NR_EXCEPTIONS) 393 + goto unexpected_exception; 394 + 395 + /* Use the same handler for all the interrupts */ 396 + if (regs->cause & CAUSE_IRQ_FLAG) { 397 + vector = 1; 398 + ec = 0; 399 + } 400 + 401 + if (handlers && handlers->exception_handlers[vector][ec]) 402 + return handlers->exception_handlers[vector][ec](regs); 403 + 404 + unexpected_exception: 405 + return kvm_exit_unexpected_exception(vector, ec); 406 + } 407 + 408 + void vcpu_init_vector_tables(struct kvm_vcpu *vcpu) 409 + { 410 + extern char exception_vectors; 411 + 412 + vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(stvec), (unsigned long)&exception_vectors); 413 + } 414 + 415 + void vm_init_vector_tables(struct kvm_vm *vm) 416 + { 417 + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), 418 + vm->page_size, MEM_REGION_DATA); 419 + 420 + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; 421 + } 422 + 423 + void vm_install_exception_handler(struct kvm_vm *vm, int vector, exception_handler_fn handler) 424 + { 425 + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); 426 + 427 + assert(vector < NR_EXCEPTIONS); 428 + handlers->exception_handlers[0][vector] = handler; 429 + } 430 + 431 + void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handler) 432 + { 433 + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); 434 + 435 + handlers->exception_handlers[1][0] = handler; 436 + } 437 + 438 + uint32_t guest_get_vcpuid(void) 439 + { 440 + return csr_read(CSR_SSCRATCH); 387 441 } 388 442 389 443 struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
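The new `route_exception()` above dispatches through a two-row table: row 0 is indexed by the exception cause code, while every interrupt (cause with the IRQ flag set) funnels into the single handler at `[1][0]`. A minimal sketch of that index computation (the `TOY_*` constants are illustrative; on RV64 the IRQ flag is the top bit of `scause`):

```c
#include <assert.h>
#include <stdint.h>

#define TOY_CAUSE_IRQ_FLAG	(1ull << 63)
#define TOY_NR_EXCEPTIONS	32

/*
 * Mirrors route_exception()'s table indexing: synchronous exceptions
 * dispatch on their cause code in row 0; all interrupts share the
 * handler at row 1, slot 0.  Returns 0 on a routable cause, -1 when
 * the exception code is out of range (the "unexpected" path).
 */
static int toy_route(uint64_t cause, int *vector, int *ec)
{
	*vector = 0;
	*ec = (int)(cause & ~TOY_CAUSE_IRQ_FLAG);

	if (cause & TOY_CAUSE_IRQ_FLAG) {
		*vector = 1;
		*ec = 0;
		return 0;
	}

	return *ec < TOY_NR_EXCEPTIONS ? 0 : -1;
}
```

This is also why `vm_install_exception_handler()` writes into `handlers->exception_handlers[0][vector]` while `vm_install_interrupt_handler()` writes `[1][0]`: one slot per exception code, one shared slot for all interrupts.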
+7 -6
tools/testing/selftests/kvm/lib/s390x/processor.c
··· 155 155 virt_dump_region(stream, vm, indent, vm->pgd); 156 156 } 157 157 158 - struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 159 - void *guest_code) 158 + void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) 159 + { 160 + vcpu->run->psw_addr = (uintptr_t)guest_code; 161 + } 162 + 163 + struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) 160 164 { 161 165 size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); 162 166 uint64_t stack_vaddr; 163 167 struct kvm_regs regs; 164 168 struct kvm_sregs sregs; 165 169 struct kvm_vcpu *vcpu; 166 - struct kvm_run *run; 167 170 168 171 TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", 169 172 vm->page_size); ··· 187 184 sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ 188 185 vcpu_sregs_set(vcpu, &sregs); 189 186 190 - run = vcpu->run; 191 - run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ 192 - run->psw_addr = (uintptr_t)guest_code; 187 + vcpu->run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ 193 188 194 189 return vcpu; 195 190 }
+24 -24
tools/testing/selftests/kvm/lib/sparsebit.c
··· 202 202 /* Returns a pointer to the node that describes the 203 203 * lowest bit index. 204 204 */ 205 - static struct node *node_first(struct sparsebit *s) 205 + static struct node *node_first(const struct sparsebit *s) 206 206 { 207 207 struct node *nodep; 208 208 ··· 216 216 * lowest bit index > the index of the node pointed to by np. 217 217 * Returns NULL if no node with a higher index exists. 218 218 */ 219 - static struct node *node_next(struct sparsebit *s, struct node *np) 219 + static struct node *node_next(const struct sparsebit *s, struct node *np) 220 220 { 221 221 struct node *nodep = np; 222 222 ··· 244 244 * highest index < the index of the node pointed to by np. 245 245 * Returns NULL if no node with a lower index exists. 246 246 */ 247 - static struct node *node_prev(struct sparsebit *s, struct node *np) 247 + static struct node *node_prev(const struct sparsebit *s, struct node *np) 248 248 { 249 249 struct node *nodep = np; 250 250 ··· 273 273 * subtree and duplicates the bit settings to the newly allocated nodes. 274 274 * Returns the newly allocated copy of subtree. 275 275 */ 276 - static struct node *node_copy_subtree(struct node *subtree) 276 + static struct node *node_copy_subtree(const struct node *subtree) 277 277 { 278 278 struct node *root; 279 279 ··· 307 307 * index is within the bits described by the mask bits or the number of 308 308 * contiguous bits set after the mask. Returns NULL if there is no such node. 309 309 */ 310 - static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx) 310 + static struct node *node_find(const struct sparsebit *s, sparsebit_idx_t idx) 311 311 { 312 312 struct node *nodep; 313 313 ··· 393 393 } 394 394 395 395 /* Returns whether all the bits in the sparsebit array are set. */ 396 - bool sparsebit_all_set(struct sparsebit *s) 396 + bool sparsebit_all_set(const struct sparsebit *s) 397 397 { 398 398 /* 399 399 * If any nodes there must be at least one bit set. 
Only case ··· 775 775 /* Returns whether the bit at the index given by idx, within the 776 776 * sparsebit array is set or not. 777 777 */ 778 - bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx) 778 + bool sparsebit_is_set(const struct sparsebit *s, sparsebit_idx_t idx) 779 779 { 780 780 struct node *nodep; 781 781 ··· 921 921 * used by test cases after they detect an unexpected condition, as a means 922 922 * to capture diagnostic information. 923 923 */ 924 - static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s, 924 + static void sparsebit_dump_internal(FILE *stream, const struct sparsebit *s, 925 925 unsigned int indent) 926 926 { 927 927 /* Dump the contents of s */ ··· 969 969 * sparsebit_alloc(). It can though already have bits set, which 970 970 * if different from src will be cleared. 971 971 */ 972 - void sparsebit_copy(struct sparsebit *d, struct sparsebit *s) 972 + void sparsebit_copy(struct sparsebit *d, const struct sparsebit *s) 973 973 { 974 974 /* First clear any bits already set in the destination */ 975 975 sparsebit_clear_all(d); ··· 981 981 } 982 982 983 983 /* Returns whether num consecutive bits starting at idx are all set. */ 984 - bool sparsebit_is_set_num(struct sparsebit *s, 984 + bool sparsebit_is_set_num(const struct sparsebit *s, 985 985 sparsebit_idx_t idx, sparsebit_num_t num) 986 986 { 987 987 sparsebit_idx_t next_cleared; ··· 1005 1005 } 1006 1006 1007 1007 /* Returns whether the bit at the index given by idx. */ 1008 - bool sparsebit_is_clear(struct sparsebit *s, 1008 + bool sparsebit_is_clear(const struct sparsebit *s, 1009 1009 sparsebit_idx_t idx) 1010 1010 { 1011 1011 return !sparsebit_is_set(s, idx); 1012 1012 } 1013 1013 1014 1014 /* Returns whether num consecutive bits starting at idx are all cleared. 
*/ 1015 - bool sparsebit_is_clear_num(struct sparsebit *s, 1015 + bool sparsebit_is_clear_num(const struct sparsebit *s, 1016 1016 sparsebit_idx_t idx, sparsebit_num_t num) 1017 1017 { 1018 1018 sparsebit_idx_t next_set; ··· 1041 1041 * value. Use sparsebit_any_set(), instead of sparsebit_num_set() > 0, 1042 1042 * to determine if the sparsebit array has any bits set. 1043 1043 */ 1044 - sparsebit_num_t sparsebit_num_set(struct sparsebit *s) 1044 + sparsebit_num_t sparsebit_num_set(const struct sparsebit *s) 1045 1045 { 1046 1046 return s->num_set; 1047 1047 } 1048 1048 1049 1049 /* Returns whether any bit is set in the sparsebit array. */ 1050 - bool sparsebit_any_set(struct sparsebit *s) 1050 + bool sparsebit_any_set(const struct sparsebit *s) 1051 1051 { 1052 1052 /* 1053 1053 * Nodes only describe set bits. If any nodes then there ··· 1070 1070 } 1071 1071 1072 1072 /* Returns whether all the bits in the sparsebit array are cleared. */ 1073 - bool sparsebit_all_clear(struct sparsebit *s) 1073 + bool sparsebit_all_clear(const struct sparsebit *s) 1074 1074 { 1075 1075 return !sparsebit_any_set(s); 1076 1076 } 1077 1077 1078 1078 /* Returns whether all the bits in the sparsebit array are set. */ 1079 - bool sparsebit_any_clear(struct sparsebit *s) 1079 + bool sparsebit_any_clear(const struct sparsebit *s) 1080 1080 { 1081 1081 return !sparsebit_all_set(s); 1082 1082 } 1083 1083 1084 1084 /* Returns the index of the first set bit. Abort if no bits are set. 1085 1085 */ 1086 - sparsebit_idx_t sparsebit_first_set(struct sparsebit *s) 1086 + sparsebit_idx_t sparsebit_first_set(const struct sparsebit *s) 1087 1087 { 1088 1088 struct node *nodep; 1089 1089 ··· 1097 1097 /* Returns the index of the first cleared bit. Abort if 1098 1098 * no bits are cleared. 
1099 1099 */ 1100 - sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s) 1100 + sparsebit_idx_t sparsebit_first_clear(const struct sparsebit *s) 1101 1101 { 1102 1102 struct node *nodep1, *nodep2; 1103 1103 ··· 1151 1151 /* Returns index of next bit set within s after the index given by prev. 1152 1152 * Returns 0 if there are no bits after prev that are set. 1153 1153 */ 1154 - sparsebit_idx_t sparsebit_next_set(struct sparsebit *s, 1154 + sparsebit_idx_t sparsebit_next_set(const struct sparsebit *s, 1155 1155 sparsebit_idx_t prev) 1156 1156 { 1157 1157 sparsebit_idx_t lowest_possible = prev + 1; ··· 1244 1244 /* Returns index of next bit cleared within s after the index given by prev. 1245 1245 * Returns 0 if there are no bits after prev that are cleared. 1246 1246 */ 1247 - sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s, 1247 + sparsebit_idx_t sparsebit_next_clear(const struct sparsebit *s, 1248 1248 sparsebit_idx_t prev) 1249 1249 { 1250 1250 sparsebit_idx_t lowest_possible = prev + 1; ··· 1300 1300 * and returns the index of the first sequence of num consecutively set 1301 1301 * bits. Returns a value of 0 of no such sequence exists. 1302 1302 */ 1303 - sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s, 1303 + sparsebit_idx_t sparsebit_next_set_num(const struct sparsebit *s, 1304 1304 sparsebit_idx_t start, sparsebit_num_t num) 1305 1305 { 1306 1306 sparsebit_idx_t idx; ··· 1335 1335 * and returns the index of the first sequence of num consecutively cleared 1336 1336 * bits. Returns a value of 0 of no such sequence exists. 1337 1337 */ 1338 - sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s, 1338 + sparsebit_idx_t sparsebit_next_clear_num(const struct sparsebit *s, 1339 1339 sparsebit_idx_t start, sparsebit_num_t num) 1340 1340 { 1341 1341 sparsebit_idx_t idx; ··· 1583 1583 * contiguous bits. 
This is done because '-' is used to specify command-line 1584 1584 * options, and sometimes ranges are specified as command-line arguments. 1585 1585 */ 1586 - void sparsebit_dump(FILE *stream, struct sparsebit *s, 1586 + void sparsebit_dump(FILE *stream, const struct sparsebit *s, 1587 1587 unsigned int indent) 1588 1588 { 1589 1589 size_t current_line_len = 0; ··· 1681 1681 * s. On error, diagnostic information is printed to stderr and 1682 1682 * abort is called. 1683 1683 */ 1684 - void sparsebit_validate_internal(struct sparsebit *s) 1684 + void sparsebit_validate_internal(const struct sparsebit *s) 1685 1685 { 1686 1686 bool error_detected = false; 1687 1687 struct node *nodep, *prev = NULL;
+2 -1
tools/testing/selftests/kvm/lib/ucall_common.c
··· 29 29 vm_vaddr_t vaddr; 30 30 int i; 31 31 32 - vaddr = __vm_vaddr_alloc(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR, MEM_REGION_DATA); 32 + vaddr = vm_vaddr_alloc_shared(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR, 33 + MEM_REGION_DATA); 33 34 hdr = (struct ucall_header *)addr_gva2hva(vm, vaddr); 34 35 memset(hdr, 0, sizeof(*hdr)); 35 36
+31
tools/testing/selftests/kvm/lib/x86_64/pmu.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2023, Tencent, Inc. 4 + */ 5 + 6 + #include <stdint.h> 7 + 8 + #include <linux/kernel.h> 9 + 10 + #include "kvm_util.h" 11 + #include "pmu.h" 12 + 13 + const uint64_t intel_pmu_arch_events[] = { 14 + INTEL_ARCH_CPU_CYCLES, 15 + INTEL_ARCH_INSTRUCTIONS_RETIRED, 16 + INTEL_ARCH_REFERENCE_CYCLES, 17 + INTEL_ARCH_LLC_REFERENCES, 18 + INTEL_ARCH_LLC_MISSES, 19 + INTEL_ARCH_BRANCHES_RETIRED, 20 + INTEL_ARCH_BRANCHES_MISPREDICTED, 21 + INTEL_ARCH_TOPDOWN_SLOTS, 22 + }; 23 + kvm_static_assert(ARRAY_SIZE(intel_pmu_arch_events) == NR_INTEL_ARCH_EVENTS); 24 + 25 + const uint64_t amd_pmu_zen_events[] = { 26 + AMD_ZEN_CORE_CYCLES, 27 + AMD_ZEN_INSTRUCTIONS_RETIRED, 28 + AMD_ZEN_BRANCHES_RETIRED, 29 + AMD_ZEN_BRANCHES_MISPREDICTED, 30 + }; 31 + kvm_static_assert(ARRAY_SIZE(amd_pmu_zen_events) == NR_AMD_ZEN_EVENTS);
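The `INTEL_ARCH_*` and `AMD_ZEN_*` constants above (defined in the selftests' pmu.h, not shown in this hunk) are packed IA32_PERFEVTSELx-style encodings. As a sketch of how such an architectural event selector is composed, assuming the conventional layout of event select in bits 7:0 and unit mask in bits 15:8:

```c
#include <assert.h>
#include <stdint.h>

/*
 * IA32_PERFEVTSELx packs the event select in bits 7:0 and the unit
 * mask in bits 15:8; arrays like intel_pmu_arch_events hold such
 * packed encodings (this helper is illustrative, not from the series).
 */
static uint64_t toy_pmu_event(uint8_t eventsel, uint8_t umask)
{
	return (uint64_t)eventsel | ((uint64_t)umask << 8);
}
```

For example, the architectural "unhalted core cycles" event is event select 0x3C with unit mask 0x00, and "LLC references" is 0x2E with unit mask 0x4F.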
+53 -7
tools/testing/selftests/kvm/lib/x86_64/processor.c
··· 9 9 #include "test_util.h" 10 10 #include "kvm_util.h" 11 11 #include "processor.h" 12 + #include "sev.h" 12 13 13 14 #ifndef NUM_INTERRUPTS 14 15 #define NUM_INTERRUPTS 256 ··· 158 157 { 159 158 uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); 160 159 160 + paddr = vm_untag_gpa(vm, paddr); 161 + 161 162 if (!(*pte & PTE_PRESENT_MASK)) { 162 163 *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; 163 164 if (current_level == target_level) ··· 203 200 "Physical address beyond maximum supported,\n" 204 201 " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", 205 202 paddr, vm->max_gfn, vm->page_size); 203 + TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr, 204 + "Unexpected bits in paddr: %lx", paddr); 206 205 207 206 /* 208 207 * Allocate upper level page tables, if not already present. Return ··· 227 222 TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), 228 223 "PTE already present for 4k page at vaddr: 0x%lx", vaddr); 229 224 *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); 225 + 226 + /* 227 + * Neither SEV nor TDX supports shared page tables, so only the final 228 + * leaf PTE needs manually set the C/S-bit. 229 + */ 230 + if (vm_is_gpa_protected(vm, paddr)) 231 + *pte |= vm->arch.c_bit; 232 + else 233 + *pte |= vm->arch.s_bit; 230 234 } 231 235 232 236 void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) ··· 278 264 int *level) 279 265 { 280 266 uint64_t *pml4e, *pdpe, *pde; 267 + 268 + TEST_ASSERT(!vm->arch.is_pt_protected, 269 + "Walking page tables of protected guests is impossible"); 281 270 282 271 TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, 283 272 "Invalid PG_LEVEL_* '%d'", *level); ··· 513 496 * No need for a hugepage mask on the PTE, x86-64 requires the "unused" 514 497 * address bits to be zero. 
515 498 */ 516 - return PTE_GET_PA(*pte) | (gva & ~HUGEPAGE_MASK(level)); 499 + return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); 517 500 } 518 501 519 502 static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) ··· 577 560 vm_create_irqchip(vm); 578 561 sync_global_to_guest(vm, host_cpu_is_intel); 579 562 sync_global_to_guest(vm, host_cpu_is_amd); 563 + 564 + if (vm->subtype == VM_SUBTYPE_SEV) 565 + sev_vm_init(vm); 566 + else if (vm->subtype == VM_SUBTYPE_SEV_ES) 567 + sev_es_vm_init(vm); 580 568 } 581 569 582 - struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, 583 - void *guest_code) 570 + void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) 571 + { 572 + struct kvm_regs regs; 573 + 574 + vcpu_regs_get(vcpu, &regs); 575 + regs.rip = (unsigned long) guest_code; 576 + vcpu_regs_set(vcpu, &regs); 577 + } 578 + 579 + struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) 584 580 { 585 581 struct kvm_mp_state mp_state; 586 582 struct kvm_regs regs; ··· 627 597 vcpu_regs_get(vcpu, &regs); 628 598 regs.rflags = regs.rflags | 0x2; 629 599 regs.rsp = stack_vaddr; 630 - regs.rip = (unsigned long) guest_code; 631 600 vcpu_regs_set(vcpu, &regs); 632 601 633 602 /* Setup the MP state */ ··· 781 752 vcpu_set_cpuid(vcpu); 782 753 } 783 754 784 - void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr) 755 + void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, 756 + struct kvm_x86_cpu_property property, 757 + uint32_t value) 785 758 { 786 - struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, 0x80000008); 759 + struct kvm_cpuid_entry2 *entry; 787 760 788 - entry->eax = (entry->eax & ~0xff) | maxphyaddr; 761 + entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index); 762 + 763 + (&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit); 764 + (&entry->eax)[property.reg] |= value << property.lo_bit; 765 + 789 766 vcpu_set_cpuid(vcpu); 
767 + 768 + /* Sanity check that @value doesn't exceed the bounds in any way. */ 769 + TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value); 790 770 } 791 771 792 772 void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function) ··· 1076 1038 } else { 1077 1039 *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); 1078 1040 *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR); 1041 + } 1042 + } 1043 + 1044 + void kvm_init_vm_address_properties(struct kvm_vm *vm) 1045 + { 1046 + if (vm->subtype == VM_SUBTYPE_SEV || vm->subtype == VM_SUBTYPE_SEV_ES) { 1047 + vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); 1048 + vm->gpa_tag_mask = vm->arch.c_bit; 1079 1049 } 1080 1050 } 1081 1051
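The new `vcpu_set_cpuid_property()` above generalizes the old maxphyaddr-only helper into a read-modify-write of an arbitrary CPUID bitfield: clear `GENMASK(hi_bit, lo_bit)` in the target register, then OR in the new value shifted into place. A self-contained sketch of that field update (`TOY_GENMASK` stands in for the kernel's `GENMASK`):

```c
#include <assert.h>
#include <stdint.h>

/* 32-bit contiguous bitmask from bit l through bit h, inclusive. */
#define TOY_GENMASK(h, l) \
	((uint32_t)(((~0u) >> (31 - (h))) & ((~0u) << (l))))

/*
 * Mirrors the vcpu_set_cpuid_property() update: clear the property's
 * bitfield [hi:lo] in the CPUID register word, then OR in @value.
 */
static uint32_t toy_set_property(uint32_t reg, unsigned int hi,
				 unsigned int lo, uint32_t value)
{
	reg &= ~TOY_GENMASK(hi, lo);
	reg |= value << lo;
	return reg;
}
```

For instance, MAXPHYADDR lives in CPUID.80000008H:EAX[7:0], so the old `vcpu_set_cpuid_maxphyaddr()` behavior is recovered with `hi = 7, lo = 0`.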
+114
tools/testing/selftests/kvm/lib/x86_64/sev.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #define _GNU_SOURCE /* for program_invocation_short_name */ 3 + #include <stdint.h> 4 + #include <stdbool.h> 5 + 6 + #include "sev.h" 7 + 8 + /* 9 + * sparsebit_next_clear() can return 0 if [x, 2**64-1] are all set, and the 10 + * -1 would then cause an underflow back to 2**64 - 1. This is expected and 11 + * correct. 12 + * 13 + * If the last range in the sparsebit is [x, y] and we try to iterate, 14 + * sparsebit_next_set() will return 0, and sparsebit_next_clear() will try 15 + * and find the first range, but that's correct because the condition 16 + * expression would cause us to quit the loop. 17 + */ 18 + static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region) 19 + { 20 + const struct sparsebit *protected_phy_pages = region->protected_phy_pages; 21 + const vm_paddr_t gpa_base = region->region.guest_phys_addr; 22 + const sparsebit_idx_t lowest_page_in_region = gpa_base >> vm->page_shift; 23 + sparsebit_idx_t i, j; 24 + 25 + if (!sparsebit_any_set(protected_phy_pages)) 26 + return; 27 + 28 + sev_register_encrypted_memory(vm, region); 29 + 30 + sparsebit_for_each_set_range(protected_phy_pages, i, j) { 31 + const uint64_t size = (j - i + 1) * vm->page_size; 32 + const uint64_t offset = (i - lowest_page_in_region) * vm->page_size; 33 + 34 + sev_launch_update_data(vm, gpa_base + offset, size); 35 + } 36 + } 37 + 38 + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) 39 + { 40 + struct kvm_sev_launch_start launch_start = { 41 + .policy = policy, 42 + }; 43 + struct userspace_mem_region *region; 44 + struct kvm_sev_guest_status status; 45 + int ctr; 46 + 47 + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_START, &launch_start); 48 + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); 49 + 50 + TEST_ASSERT_EQ(status.policy, policy); 51 + TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_LAUNCH_UPDATE); 52 + 53 + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) 54 + encrypt_region(vm, region); 55 
+ 56 + if (policy & SEV_POLICY_ES) 57 + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); 58 + 59 + vm->arch.is_pt_protected = true; 60 + } 61 + 62 + void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement) 63 + { 64 + struct kvm_sev_launch_measure launch_measure; 65 + struct kvm_sev_guest_status guest_status; 66 + 67 + launch_measure.len = 256; 68 + launch_measure.uaddr = (__u64)measurement; 69 + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_MEASURE, &launch_measure); 70 + 71 + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &guest_status); 72 + TEST_ASSERT_EQ(guest_status.state, SEV_GUEST_STATE_LAUNCH_SECRET); 73 + } 74 + 75 + void sev_vm_launch_finish(struct kvm_vm *vm) 76 + { 77 + struct kvm_sev_guest_status status; 78 + 79 + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); 80 + TEST_ASSERT(status.state == SEV_GUEST_STATE_LAUNCH_UPDATE || 81 + status.state == SEV_GUEST_STATE_LAUNCH_SECRET, 82 + "Unexpected guest state: %d", status.state); 83 + 84 + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_FINISH, NULL); 85 + 86 + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); 87 + TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); 88 + } 89 + 90 + struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code, 91 + struct kvm_vcpu **cpu) 92 + { 93 + struct vm_shape shape = { 94 + .type = VM_TYPE_DEFAULT, 95 + .mode = VM_MODE_DEFAULT, 96 + .subtype = policy & SEV_POLICY_ES ? VM_SUBTYPE_SEV_ES : 97 + VM_SUBTYPE_SEV, 98 + }; 99 + struct kvm_vm *vm; 100 + struct kvm_vcpu *cpus[1]; 101 + uint8_t measurement[512]; 102 + 103 + vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); 104 + *cpu = cpus[0]; 105 + 106 + sev_vm_launch(vm, policy); 107 + 108 + /* TODO: Validate the measurement is as expected. */ 109 + sev_vm_launch_measure(vm, measurement); 110 + 111 + sev_vm_launch_finish(vm); 112 + 113 + return vm; 114 + }
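`encrypt_region()` above walks each run of set bits `[i, j]` in `protected_phy_pages` and converts the page-index range into a `(gpa, size)` span for KVM_SEV_LAUNCH_UPDATE_DATA. The arithmetic is easy to get off-by-one wrong, so here it is isolated as a sketch (helper name illustrative):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Mirrors the span arithmetic in encrypt_region(): a run of protected
 * page indices [i, j] (absolute physical page numbers) within a region
 * starting at @gpa_base becomes a guest-physical (gpa, size) span.
 */
static void toy_range_to_span(uint64_t gpa_base, unsigned int page_shift,
			      uint64_t i, uint64_t j,
			      uint64_t *gpa, uint64_t *size)
{
	uint64_t page_size = 1ull << page_shift;
	uint64_t first_page = gpa_base >> page_shift;	/* lowest page in region */

	*size = (j - i + 1) * page_size;		/* inclusive range */
	*gpa = gpa_base + (i - first_page) * page_size;
}
```

Note the inclusive `j - i + 1`: `sparsebit_for_each_set_range()` yields both endpoints set, so a single protected page produces a one-page span.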
+111
tools/testing/selftests/kvm/riscv/arch_timer.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * arch_timer.c - Tests the riscv64 sstc timer IRQ functionality 4 + * 5 + * The test validates the sstc timer IRQs using vstimecmp registers. 6 + * It's ported from the aarch64 arch_timer test. 7 + * 8 + * Copyright (c) 2024, Intel Corporation. 9 + */ 10 + 11 + #define _GNU_SOURCE 12 + 13 + #include "arch_timer.h" 14 + #include "kvm_util.h" 15 + #include "processor.h" 16 + #include "timer_test.h" 17 + 18 + static int timer_irq = IRQ_S_TIMER; 19 + 20 + static void guest_irq_handler(struct ex_regs *regs) 21 + { 22 + uint64_t xcnt, xcnt_diff_us, cmp; 23 + unsigned int intid = regs->cause & ~CAUSE_IRQ_FLAG; 24 + uint32_t cpu = guest_get_vcpuid(); 25 + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; 26 + 27 + timer_irq_disable(); 28 + 29 + xcnt = timer_get_cycles(); 30 + cmp = timer_get_cmp(); 31 + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); 32 + 33 + /* Make sure we are dealing with the correct timer IRQ */ 34 + GUEST_ASSERT_EQ(intid, timer_irq); 35 + 36 + __GUEST_ASSERT(xcnt >= cmp, 37 + "xcnt = 0x%"PRIx64", cmp = 0x%"PRIx64", xcnt_diff_us = 0x%" PRIx64, 38 + xcnt, cmp, xcnt_diff_us); 39 + 40 + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); 41 + } 42 + 43 + static void guest_run(struct test_vcpu_shared_data *shared_data) 44 + { 45 + uint32_t irq_iter, config_iter; 46 + 47 + shared_data->nr_iter = 0; 48 + shared_data->guest_stage = 0; 49 + 50 + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { 51 + /* Setup the next interrupt */ 52 + timer_set_next_cmp_ms(test_args.timer_period_ms); 53 + shared_data->xcnt = timer_get_cycles(); 54 + timer_irq_enable(); 55 + 56 + /* Setup a timeout for the interrupt to arrive */ 57 + udelay(msecs_to_usecs(test_args.timer_period_ms) + 58 + test_args.timer_err_margin_us); 59 + 60 + irq_iter = READ_ONCE(shared_data->nr_iter); 61 + __GUEST_ASSERT(config_iter + 1 == irq_iter, 62 + "config_iter + 1 = 0x%x, irq_iter = 
0x%x.\n" 63 + " Guest timer interrupt was not trigged within the specified\n" 64 + " interval, try to increase the error margin by [-e] option.\n", 65 + config_iter + 1, irq_iter); 66 + } 67 + } 68 + 69 + static void guest_code(void) 70 + { 71 + uint32_t cpu = guest_get_vcpuid(); 72 + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; 73 + 74 + timer_irq_disable(); 75 + local_irq_enable(); 76 + 77 + guest_run(shared_data); 78 + 79 + GUEST_DONE(); 80 + } 81 + 82 + struct kvm_vm *test_vm_create(void) 83 + { 84 + struct kvm_vm *vm; 85 + int nr_vcpus = test_args.nr_vcpus; 86 + 87 + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); 88 + __TEST_REQUIRE(__vcpu_has_ext(vcpus[0], RISCV_ISA_EXT_REG(KVM_RISCV_ISA_EXT_SSTC)), 89 + "SSTC not available, skipping test\n"); 90 + 91 + vm_init_vector_tables(vm); 92 + vm_install_interrupt_handler(vm, guest_irq_handler); 93 + 94 + for (int i = 0; i < nr_vcpus; i++) 95 + vcpu_init_vector_tables(vcpus[i]); 96 + 97 + /* Initialize guest timer frequency. */ 98 + vcpu_get_reg(vcpus[0], RISCV_TIMER_REG(frequency), &timer_freq); 99 + sync_global_to_guest(vm, timer_freq); 100 + pr_debug("timer_freq: %lu\n", timer_freq); 101 + 102 + /* Make all the test's cmdline args visible to the guest */ 103 + sync_global_to_guest(vm, test_args); 104 + 105 + return vm; 106 + } 107 + 108 + void test_vm_cleanup(struct kvm_vm *vm) 109 + { 110 + kvm_vm_free(vm); 111 + }
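The timer test above converts elapsed counter cycles to microseconds via the guest timer frequency read from `RISCV_TIMER_REG(frequency)`. A sketch of that conversion, assuming the usual `cycles * USEC_PER_SEC / freq` form (helper name illustrative; the real one lives in the selftests' arch_timer.h, and a production version would guard against multiplication overflow):

```c
#include <assert.h>
#include <stdint.h>

#define TOY_USEC_PER_SEC 1000000ull

/*
 * Convert a cycle delta to microseconds given the timer frequency in
 * Hz.  Sketch only: for very large deltas the multiplication can
 * overflow 64 bits and would need 128-bit or split arithmetic.
 */
static uint64_t toy_cycles_to_usec(uint64_t cycles, uint64_t freq_hz)
{
	return cycles * TOY_USEC_PER_SEC / freq_hz;
}
```

With a 10 MHz timebase, 100 cycles is 10 microseconds, which is the kind of delta the IRQ handler reports when asserting `xcnt >= cmp`.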
+9 -10
tools/testing/selftests/kvm/riscv/get-reg-list.c
··· 47 47 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL: 48 48 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT: 49 49 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVPBMT: 50 + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZACAS: 50 51 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBA: 51 52 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBB: 52 53 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBC: ··· 74 73 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKSED: 75 74 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKSH: 76 75 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKT: 76 + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZTSO: 77 77 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVBB: 78 78 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVBC: 79 79 case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVFH: ··· 125 123 return err == EINVAL; 126 124 } 127 125 128 - static bool vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext_id) 129 - { 130 - int ret; 131 - unsigned long value; 132 - 133 - ret = __vcpu_get_reg(vcpu, ext_id, &value); 134 - return (ret) ? 
false : !!value; 135 - } 136 - 137 126 void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) 138 127 { 139 128 unsigned long isa_ext_state[KVM_RISCV_ISA_EXT_MAX] = { 0 }; ··· 169 176 __vcpu_set_reg(vcpu, feature, 1); 170 177 171 178 /* Double check whether the desired extension was enabled */ 172 - __TEST_REQUIRE(vcpu_has_ext(vcpu, feature), 179 + __TEST_REQUIRE(__vcpu_has_ext(vcpu, feature), 173 180 "%s not available, skipping tests", s->name); 174 181 } 175 182 } ··· 412 419 KVM_ISA_EXT_ARR(SVINVAL), 413 420 KVM_ISA_EXT_ARR(SVNAPOT), 414 421 KVM_ISA_EXT_ARR(SVPBMT), 422 + KVM_ISA_EXT_ARR(ZACAS), 415 423 KVM_ISA_EXT_ARR(ZBA), 416 424 KVM_ISA_EXT_ARR(ZBB), 417 425 KVM_ISA_EXT_ARR(ZBC), ··· 439 445 KVM_ISA_EXT_ARR(ZKSED), 440 446 KVM_ISA_EXT_ARR(ZKSH), 441 447 KVM_ISA_EXT_ARR(ZKT), 448 + KVM_ISA_EXT_ARR(ZTSO), 442 449 KVM_ISA_EXT_ARR(ZVBB), 443 450 KVM_ISA_EXT_ARR(ZVBC), 444 451 KVM_ISA_EXT_ARR(ZVFH), ··· 935 940 KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL); 936 941 KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT); 937 942 KVM_ISA_EXT_SIMPLE_CONFIG(svpbmt, SVPBMT); 943 + KVM_ISA_EXT_SIMPLE_CONFIG(zacas, ZACAS); 938 944 KVM_ISA_EXT_SIMPLE_CONFIG(zba, ZBA); 939 945 KVM_ISA_EXT_SIMPLE_CONFIG(zbb, ZBB); 940 946 KVM_ISA_EXT_SIMPLE_CONFIG(zbc, ZBC); ··· 962 966 KVM_ISA_EXT_SIMPLE_CONFIG(zksed, ZKSED); 963 967 KVM_ISA_EXT_SIMPLE_CONFIG(zksh, ZKSH); 964 968 KVM_ISA_EXT_SIMPLE_CONFIG(zkt, ZKT); 969 + KVM_ISA_EXT_SIMPLE_CONFIG(ztso, ZTSO); 965 970 KVM_ISA_EXT_SIMPLE_CONFIG(zvbb, ZVBB); 966 971 KVM_ISA_EXT_SIMPLE_CONFIG(zvbc, ZVBC); 967 972 KVM_ISA_EXT_SIMPLE_CONFIG(zvfh, ZVFH); ··· 990 993 &config_svinval, 991 994 &config_svnapot, 992 995 &config_svpbmt, 996 + &config_zacas, 993 997 &config_zba, 994 998 &config_zbb, 995 999 &config_zbc, ··· 1017 1019 &config_zksed, 1018 1020 &config_zksh, 1019 1021 &config_zkt, 1022 + &config_ztso, 1020 1023 &config_zvbb, 1021 1024 &config_zvbc, 1022 1025 &config_zvfh,
+2
tools/testing/selftests/kvm/s390x/memop.c
···
 
 	amount = (amount + bits) % bits;
 	val = cut_to_size(size, val);
+	if (!amount)
+		return val;
 	return (val << (bits - amount)) | (val >> amount);
 }
 
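The memop hunk above fixes undefined behavior in the selftest's rotate helper: when `amount % bits` is 0, the old code computed `val << (bits - amount)`, i.e. a shift by the full operand width, which C leaves undefined. A standalone sketch of the guarded rotate (the function name and the fixed 64-bit width are illustrative, not the selftest's):

```c
#include <stdint.h>

/* Rotate right by amount, guarding the amount == 0 case: without the early
 * return, val << (64 - 0) would shift by the full operand width, which is
 * undefined behavior in C. */
static uint64_t rotr64(uint64_t val, unsigned int amount)
{
	amount %= 64;
	if (!amount)
		return val;
	return (val << (64 - amount)) | (val >> amount);
}
```

Compilers may do anything with the unguarded form; on x86 the hardware typically masks a 64-bit shift count mod 64, so the buggy expression happens to yield the right value and the UB goes unnoticed until a different compiler or target exposes it.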
+18 -9
tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
···
 #include <linux/stringify.h>
 #include <stdint.h>
 
+#include "kvm_test_harness.h"
 #include "apic.h"
 #include "test_util.h"
 #include "kvm_util.h"
···
 	GUEST_DONE();
 }
 
+KVM_ONE_VCPU_TEST_SUITE(fix_hypercall);
+
 static void enter_guest(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
···
 	}
 }
 
-static void test_fix_hypercall(bool disable_quirk)
+static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
+	struct kvm_vm *vm = vcpu->vm;
 
-	vm = vm_create_with_one_vcpu(&vcpu, guest_main);
-
-	vm_init_descriptor_tables(vcpu->vm);
+	vm_init_descriptor_tables(vm);
 	vcpu_init_descriptor_tables(vcpu);
 	vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler);
···
 	enter_guest(vcpu);
 }
 
-int main(void)
+KVM_ONE_VCPU_TEST(fix_hypercall, enable_quirk, guest_main)
+{
+	test_fix_hypercall(vcpu, false);
+}
+
+KVM_ONE_VCPU_TEST(fix_hypercall, disable_quirk, guest_main)
+{
+	test_fix_hypercall(vcpu, true);
+}
+
+int main(int argc, char *argv[])
 {
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN);
 
-	test_fix_hypercall(false);
-	test_fix_hypercall(true);
+	return test_harness_run(argc, argv);
 }
+620
tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
···
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023, Tencent, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <x86intrin.h>
+
+#include "pmu.h"
+#include "processor.h"
+
+/* Number of LOOP instructions for the guest measurement payload. */
+#define NUM_BRANCHES 10
+/*
+ * Number of "extra" instructions that will be counted, i.e. the number of
+ * instructions that are needed to set up the loop and then disable the
+ * counter.  1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
+ */
+#define NUM_EXTRA_INSNS 7
+#define NUM_INSNS_RETIRED (NUM_BRANCHES + NUM_EXTRA_INSNS)
+
+static uint8_t kvm_pmu_version;
+static bool kvm_has_perf_caps;
+static bool is_forced_emulation_enabled;
+
+static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
+						  void *guest_code,
+						  uint8_t pmu_version,
+						  uint64_t perf_capabilities)
+{
+	struct kvm_vm *vm;
+
+	vm = vm_create_with_one_vcpu(vcpu, guest_code);
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(*vcpu);
+
+	sync_global_to_guest(vm, kvm_pmu_version);
+	sync_global_to_guest(vm, is_forced_emulation_enabled);
+
+	/*
+	 * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling
+	 * features via PERF_CAPABILITIES if the guest doesn't have a vPMU.
+	 */
+	if (kvm_has_perf_caps)
+		vcpu_set_msr(*vcpu, MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+	vcpu_set_cpuid_property(*vcpu, X86_PROPERTY_PMU_VERSION, pmu_version);
+	return vm;
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	do {
+		vcpu_run(vcpu);
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_SYNC:
+			break;
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		case UCALL_PRINTF:
+			pr_info("%s", uc.buffer);
+			break;
+		case UCALL_DONE:
+			break;
+		default:
+			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+		}
+	} while (uc.cmd != UCALL_DONE);
+}
+
+static uint8_t guest_get_pmu_version(void)
+{
+	/*
+	 * Return the effective PMU version, i.e. the minimum between what KVM
+	 * supports and what is enumerated to the guest.  The host deliberately
+	 * advertises a PMU version to the guest beyond what is actually
+	 * supported by KVM to verify KVM doesn't freak out and do something
+	 * bizarre with an architecturally valid, but unsupported, version.
+	 */
+	return min_t(uint8_t, kvm_pmu_version, this_cpu_property(X86_PROPERTY_PMU_VERSION));
+}
+
+/*
+ * If an architectural event is supported and guaranteed to generate at least
+ * one "hit", assert that its count is non-zero.  If an event isn't supported
+ * or the test can't guarantee the associated action will occur, then all bets
+ * are off regarding the count, i.e. no checks can be done.
+ *
+ * Sanity check that in all cases, the event doesn't count when it's disabled,
+ * and that KVM correctly emulates the write of an arbitrary value.
+ */
+static void guest_assert_event_count(uint8_t idx,
+				     struct kvm_x86_pmu_feature event,
+				     uint32_t pmc, uint32_t pmc_msr)
+{
+	uint64_t count;
+
+	count = _rdpmc(pmc);
+	if (!this_pmu_has(event))
+		goto sanity_checks;
+
+	switch (idx) {
+	case INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX:
+		GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED);
+		break;
+	case INTEL_ARCH_BRANCHES_RETIRED_INDEX:
+		GUEST_ASSERT_EQ(count, NUM_BRANCHES);
+		break;
+	case INTEL_ARCH_LLC_REFERENCES_INDEX:
+	case INTEL_ARCH_LLC_MISSES_INDEX:
+		if (!this_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+		    !this_cpu_has(X86_FEATURE_CLFLUSH))
+			break;
+		fallthrough;
+	case INTEL_ARCH_CPU_CYCLES_INDEX:
+	case INTEL_ARCH_REFERENCE_CYCLES_INDEX:
+		GUEST_ASSERT_NE(count, 0);
+		break;
+	case INTEL_ARCH_TOPDOWN_SLOTS_INDEX:
+		GUEST_ASSERT(count >= NUM_INSNS_RETIRED);
+		break;
+	default:
+		break;
+	}
+
+sanity_checks:
+	__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+	GUEST_ASSERT_EQ(_rdpmc(pmc), count);
+
+	wrmsr(pmc_msr, 0xdead);
+	GUEST_ASSERT_EQ(_rdpmc(pmc), 0xdead);
+}
+
+/*
+ * Enable and disable the PMC in a monolithic asm blob to ensure that the
+ * compiler can't insert _any_ code into the measured sequence.  Note, ECX
+ * doesn't need to be clobbered as the input value, @pmc_msr, is restored
+ * before the end of the sequence.
+ *
+ * If CLFLUSH{,OPT} is supported, flush the cacheline containing (at least) the
+ * start of the loop to force LLC references and misses, i.e. to allow testing
+ * that those events actually count.
+ *
+ * If forced emulation is enabled (and specified), force emulation on a subset
+ * of the measured code to verify that KVM correctly emulates instructions and
+ * branches retired events in conjunction with hardware also counting said
+ * events.
+ */
+#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
+do {										\
+	__asm__ __volatile__("wrmsr\n\t"					\
+			     clflush "\n\t"					\
+			     "mfence\n\t"					\
+			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
+			     FEP "loop .\n\t"					\
+			     FEP "mov %%edi, %%ecx\n\t"				\
+			     FEP "xor %%eax, %%eax\n\t"				\
+			     FEP "xor %%edx, %%edx\n\t"				\
+			     "wrmsr\n\t"					\
+			     :: "a"((uint32_t)_value), "d"(_value >> 32),	\
+				"c"(_msr), "D"(_msr)				\
+	);									\
+} while (0)
+
+#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP)	\
+do {										\
+	wrmsr(_pmc_msr, 0);							\
+										\
+	if (this_cpu_has(X86_FEATURE_CLFLUSHOPT))				\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP);	\
+	else if (this_cpu_has(X86_FEATURE_CLFLUSH))				\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP);	\
+	else									\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
+										\
+	guest_assert_event_count(_idx, _event, _pmc, _pmc_msr);			\
+} while (0)
+
+static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event,
+				    uint32_t pmc, uint32_t pmc_msr,
+				    uint32_t ctrl_msr, uint64_t ctrl_msr_value)
+{
+	GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, "");
+
+	if (is_forced_emulation_enabled)
+		GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP);
+}
+
+#define X86_PMU_FEATURE_NULL						\
+({									\
+	struct kvm_x86_pmu_feature feature = {};			\
+									\
+	feature;							\
+})
+
+static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event)
+{
+	return !(*(u64 *)&event);
+}
+
+static void guest_test_arch_event(uint8_t idx)
+{
+	const struct {
+		struct kvm_x86_pmu_feature gp_event;
+		struct kvm_x86_pmu_feature fixed_event;
+	} intel_event_to_feature[] = {
+		[INTEL_ARCH_CPU_CYCLES_INDEX]		 = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED },
+		[INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX]	 = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED },
+		/*
+		 * Note, the fixed counter for reference cycles is NOT the same
+		 * as the general purpose architectural event.  The fixed
+		 * counter explicitly counts at the same frequency as the TSC,
+		 * whereas the GP event counts at a fixed, but uarch specific,
+		 * frequency.  Bundle them here for simplicity.
+		 */
+		[INTEL_ARCH_REFERENCE_CYCLES_INDEX]	 = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED },
+		[INTEL_ARCH_LLC_REFERENCES_INDEX]	 = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_LLC_MISSES_INDEX]		 = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_BRANCHES_RETIRED_INDEX]	 = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_TOPDOWN_SLOTS_INDEX]	 = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED },
+	};
+
+	uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+	uint32_t pmu_version = guest_get_pmu_version();
+	/* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */
+	bool guest_has_perf_global_ctrl = pmu_version >= 2;
+	struct kvm_x86_pmu_feature gp_event, fixed_event;
+	uint32_t base_pmc_msr;
+	unsigned int i;
+
+	/* The host side shouldn't invoke this without a guest PMU. */
+	GUEST_ASSERT(pmu_version);
+
+	if (this_cpu_has(X86_FEATURE_PDCM) &&
+	    rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
+		base_pmc_msr = MSR_IA32_PMC0;
+	else
+		base_pmc_msr = MSR_IA32_PERFCTR0;
+
+	gp_event = intel_event_to_feature[idx].gp_event;
+	GUEST_ASSERT_EQ(idx, gp_event.f.bit);
+
+	GUEST_ASSERT(nr_gp_counters);
+
+	for (i = 0; i < nr_gp_counters; i++) {
+		uint64_t eventsel = ARCH_PERFMON_EVENTSEL_OS |
+				    ARCH_PERFMON_EVENTSEL_ENABLE |
+				    intel_pmu_arch_events[idx];
+
+		wrmsr(MSR_P6_EVNTSEL0 + i, 0);
+		if (guest_has_perf_global_ctrl)
+			wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i));
+
+		__guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i,
+					MSR_P6_EVNTSEL0 + i, eventsel);
+	}
+
+	if (!guest_has_perf_global_ctrl)
+		return;
+
+	fixed_event = intel_event_to_feature[idx].fixed_event;
+	if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event))
+		return;
+
+	i = fixed_event.f.bit;
+
+	wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+
+	__guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED,
+				MSR_CORE_PERF_FIXED_CTR0 + i,
+				MSR_CORE_PERF_GLOBAL_CTRL,
+				FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+}
+
+static void guest_test_arch_events(void)
+{
+	uint8_t i;
+
+	for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++)
+		guest_test_arch_event(i);
+
+	GUEST_DONE();
+}
+
+static void test_arch_events(uint8_t pmu_version, uint64_t perf_capabilities,
+			     uint8_t length, uint8_t unavailable_mask)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	/* Testing arch events requires a vPMU (there are no negative tests). */
+	if (!pmu_version)
+		return;
+
+	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_arch_events,
+					 pmu_version, perf_capabilities);
+
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH,
+				length);
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK,
+				unavailable_mask);
+
+	run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+/*
+ * Limit testing to MSRs that are actually defined by Intel (in the SDM).  MSRs
+ * that aren't defined counter MSRs *probably* don't exist, but there's no
+ * guarantee that currently undefined MSR indices won't be used for something
+ * other than PMCs in the future.
+ */
+#define MAX_NR_GP_COUNTERS 8
+#define MAX_NR_FIXED_COUNTERS 3
+
+#define GUEST_ASSERT_PMC_MSR_ACCESS(insn, msr, expect_gp, vector)		\
+	__GUEST_ASSERT(expect_gp ? vector == GP_VECTOR : !vector,		\
+		       "Expected %s on " #insn "(0x%x), got vector %u",		\
+		       expect_gp ? "#GP" : "no fault", msr, vector)
+
+#define GUEST_ASSERT_PMC_VALUE(insn, msr, val, expected)			\
+	__GUEST_ASSERT(val == expected,						\
+		       "Expected " #insn "(0x%x) to yield 0x%lx, got 0x%lx",	\
+		       msr, expected, val);
+
+static void guest_test_rdpmc(uint32_t rdpmc_idx, bool expect_success,
+			     uint64_t expected_val)
+{
+	uint8_t vector;
+	uint64_t val;
+
+	vector = rdpmc_safe(rdpmc_idx, &val);
+	GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector);
+	if (expect_success)
+		GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val);
+
+	if (!is_forced_emulation_enabled)
+		return;
+
+	vector = rdpmc_safe_fep(rdpmc_idx, &val);
+	GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector);
+	if (expect_success)
+		GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val);
+}
+
+static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters,
+				 uint8_t nr_counters, uint32_t or_mask)
+{
+	const bool pmu_has_fast_mode = !guest_get_pmu_version();
+	uint8_t i;
+
+	for (i = 0; i < nr_possible_counters; i++) {
+		/*
+		 * TODO: Test a value that validates full-width writes and the
+		 * width of the counters.
+		 */
+		const uint64_t test_val = 0xffff;
+		const uint32_t msr = base_msr + i;
+
+		/*
+		 * Fixed counters are supported if the counter is less than the
+		 * number of enumerated contiguous counters *or* the counter is
+		 * explicitly enumerated in the supported counters mask.
+		 */
+		const bool expect_success = i < nr_counters || (or_mask & BIT(i));
+
+		/*
+		 * KVM drops writes to MSR_P6_PERFCTR[0|1] if the counters are
+		 * unsupported, i.e. doesn't #GP and reads back '0'.
+		 */
+		const uint64_t expected_val = expect_success ? test_val : 0;
+		const bool expect_gp = !expect_success && msr != MSR_P6_PERFCTR0 &&
+				       msr != MSR_P6_PERFCTR1;
+		uint32_t rdpmc_idx;
+		uint8_t vector;
+		uint64_t val;
+
+		vector = wrmsr_safe(msr, test_val);
+		GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector);
+
+		vector = rdmsr_safe(msr, &val);
+		GUEST_ASSERT_PMC_MSR_ACCESS(RDMSR, msr, expect_gp, vector);
+
+		/* On #GP, the result of RDMSR is undefined. */
+		if (!expect_gp)
+			GUEST_ASSERT_PMC_VALUE(RDMSR, msr, val, expected_val);
+
+		/*
+		 * Redo the read tests with RDPMC, which has different indexing
+		 * semantics and additional capabilities.
+		 */
+		rdpmc_idx = i;
+		if (base_msr == MSR_CORE_PERF_FIXED_CTR0)
+			rdpmc_idx |= INTEL_RDPMC_FIXED;
+
+		guest_test_rdpmc(rdpmc_idx, expect_success, expected_val);
+
+		/*
+		 * KVM doesn't support non-architectural PMUs, i.e. it should be
+		 * impossible to have fast mode RDPMC.  Verify that attempting
+		 * to use fast RDPMC always #GPs.
+		 */
+		GUEST_ASSERT(!expect_success || !pmu_has_fast_mode);
+		rdpmc_idx |= INTEL_RDPMC_FAST;
+		guest_test_rdpmc(rdpmc_idx, false, -1ull);
+
+		vector = wrmsr_safe(msr, 0);
+		GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector);
+	}
+}
+
+static void guest_test_gp_counters(void)
+{
+	uint8_t nr_gp_counters = 0;
+	uint32_t base_msr;
+
+	if (guest_get_pmu_version())
+		nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+
+	if (this_cpu_has(X86_FEATURE_PDCM) &&
+	    rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
+		base_msr = MSR_IA32_PMC0;
+	else
+		base_msr = MSR_IA32_PERFCTR0;
+
+	guest_rd_wr_counters(base_msr, MAX_NR_GP_COUNTERS, nr_gp_counters, 0);
+	GUEST_DONE();
+}
+
+static void test_gp_counters(uint8_t pmu_version, uint64_t perf_capabilities,
+			     uint8_t nr_gp_counters)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_gp_counters,
+					 pmu_version, perf_capabilities);
+
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_GP_COUNTERS,
+				nr_gp_counters);
+
+	run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+static void guest_test_fixed_counters(void)
+{
+	uint64_t supported_bitmask = 0;
+	uint8_t nr_fixed_counters = 0;
+	uint8_t i;
+
+	/* Fixed counters require Architectural vPMU Version 2+. */
+	if (guest_get_pmu_version() >= 2)
+		nr_fixed_counters = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+
+	/*
+	 * The supported bitmask for fixed counters was introduced in PMU
+	 * version 5.
+	 */
+	if (guest_get_pmu_version() >= 5)
+		supported_bitmask = this_cpu_property(X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK);
+
+	guest_rd_wr_counters(MSR_CORE_PERF_FIXED_CTR0, MAX_NR_FIXED_COUNTERS,
+			     nr_fixed_counters, supported_bitmask);
+
+	for (i = 0; i < MAX_NR_FIXED_COUNTERS; i++) {
+		uint8_t vector;
+		uint64_t val;
+
+		if (i >= nr_fixed_counters && !(supported_bitmask & BIT_ULL(i))) {
+			vector = wrmsr_safe(MSR_CORE_PERF_FIXED_CTR_CTRL,
+					    FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+			__GUEST_ASSERT(vector == GP_VECTOR,
+				       "Expected #GP for counter %u in FIXED_CTR_CTRL", i);
+
+			vector = wrmsr_safe(MSR_CORE_PERF_GLOBAL_CTRL,
+					    FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+			__GUEST_ASSERT(vector == GP_VECTOR,
+				       "Expected #GP for counter %u in PERF_GLOBAL_CTRL", i);
+			continue;
+		}
+
+		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
+		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+		val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i);
+
+		GUEST_ASSERT_NE(val, 0);
+	}
+	GUEST_DONE();
+}
+
+static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities,
+				uint8_t nr_fixed_counters,
+				uint32_t supported_bitmask)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_fixed_counters,
+					 pmu_version, perf_capabilities);
+
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK,
+				supported_bitmask);
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_FIXED_COUNTERS,
+				nr_fixed_counters);
+
+	run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+static void test_intel_counters(void)
+{
+	uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+	uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+	uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+	uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
+	unsigned int i;
+	uint8_t v, j;
+	uint32_t k;
+
+	const uint64_t perf_caps[] = {
+		0,
+		PMU_CAP_FW_WRITES,
+	};
+
+	/*
+	 * Test up to PMU v5, which is the current maximum version defined by
+	 * Intel, i.e. is the last version that is guaranteed to be backwards
+	 * compatible with KVM's existing behavior.
+	 */
+	uint8_t max_pmu_version = max_t(typeof(pmu_version), pmu_version, 5);
+
+	/*
+	 * Detect the existence of events that aren't supported by selftests.
+	 * This will (obviously) fail any time the kernel adds support for a
+	 * new event, but it's worth paying that price to keep the test fresh.
+	 */
+	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
+		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
+		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
+
+	/*
+	 * Force iterating over known arch events regardless of whether or not
+	 * KVM/hardware supports a given event.
+	 */
+	nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS);
+
+	for (v = 0; v <= max_pmu_version; v++) {
+		for (i = 0; i < ARRAY_SIZE(perf_caps); i++) {
+			if (!kvm_has_perf_caps && perf_caps[i])
+				continue;
+
+			pr_info("Testing arch events, PMU version %u, perf_caps = %lx\n",
+				v, perf_caps[i]);
+			/*
+			 * To keep the total runtime reasonable, test every
+			 * possible non-zero, non-reserved bitmap combination
+			 * only with the native PMU version and the full bit
+			 * vector length.
+			 */
+			if (v == pmu_version) {
+				for (k = 1; k < (BIT(nr_arch_events) - 1); k++)
+					test_arch_events(v, perf_caps[i], nr_arch_events, k);
+			}
+			/*
+			 * Test single bits for all PMU versions and lengths up
+			 * to the number of events +1 (to verify KVM doesn't do
+			 * weird things if the guest length is greater than the
+			 * host length).  Explicitly test a mask of '0' and all
+			 * ones, i.e. all events being available and unavailable.
+			 */
+			for (j = 0; j <= nr_arch_events + 1; j++) {
+				test_arch_events(v, perf_caps[i], j, 0);
+				test_arch_events(v, perf_caps[i], j, 0xff);
+
+				for (k = 0; k < nr_arch_events; k++)
+					test_arch_events(v, perf_caps[i], j, BIT(k));
+			}
+
+			pr_info("Testing GP counters, PMU version %u, perf_caps = %lx\n",
+				v, perf_caps[i]);
+			for (j = 0; j <= nr_gp_counters; j++)
+				test_gp_counters(v, perf_caps[i], j);
+
+			pr_info("Testing fixed counters, PMU version %u, perf_caps = %lx\n",
+				v, perf_caps[i]);
+			for (j = 0; j <= nr_fixed_counters; j++) {
+				for (k = 0; k <= (BIT(nr_fixed_counters) - 1); k++)
+					test_fixed_counters(v, perf_caps[i], j, k);
+			}
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	TEST_REQUIRE(kvm_is_pmu_enabled());
+
+	TEST_REQUIRE(host_cpu_is_intel);
+	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
+	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
+
+	kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
+	kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM);
+	is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();
+
+	test_intel_counters();
+
+	return 0;
+}
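The RDPMC paths in this new test lean on the instruction's index encoding: per the SDM, ECX bit 30 selects the fixed-counter class and bit 31 requests the 32-bit "fast" read. A minimal sketch of that layout (the macro values below are my assumption of the encoding the selftest headers use, not copied from them):

```c
#include <stdint.h>

/* Assumed values matching the SDM's RDPMC index layout:
 * ECX[30] = fixed-counter class, ECX[31] = fast (32-bit) read. */
#define INTEL_RDPMC_FIXED (1u << 30)
#define INTEL_RDPMC_FAST  (1u << 31)

/* Build the ECX value for RDPMC of fixed counter @i, mirroring the test's
 * "rdpmc_idx = i | INTEL_RDPMC_FIXED". */
static uint32_t rdpmc_fixed_idx(unsigned int i)
{
	return (uint32_t)i | INTEL_RDPMC_FIXED;
}
```

Setting the fast bit on top of any index is how guest_rd_wr_counters() produces the always-#GP case, since KVM never exposes a PMU that supports fast-mode reads.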
+45 -98
tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
···
  */
 
 #define _GNU_SOURCE /* for program_invocation_short_name */
-#include "test_util.h"
+
 #include "kvm_util.h"
+#include "pmu.h"
 #include "processor.h"
-
-/*
- * In lieu of copying perf_event.h into tools...
- */
-#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
-#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
-
-/* End of stuff taken from perf_event.h. */
-
-/* Oddly, this isn't in perf_event.h. */
-#define ARCH_PERFMON_BRANCHES_RETIRED 5
+#include "test_util.h"
 
 #define NUM_BRANCHES 42
-#define INTEL_PMC_IDX_FIXED 32
-
-/* Matches KVM_PMU_EVENT_FILTER_MAX_EVENTS in pmu.c */
-#define MAX_FILTER_EVENTS 300
 #define MAX_TEST_EVENTS 10
 
 #define PMU_EVENT_FILTER_INVALID_ACTION (KVM_PMU_EVENT_DENY + 1)
 #define PMU_EVENT_FILTER_INVALID_FLAGS (KVM_PMU_EVENT_FLAGS_VALID_MASK << 1)
-#define PMU_EVENT_FILTER_INVALID_NEVENTS (MAX_FILTER_EVENTS + 1)
-
-/*
- * This is how the event selector and unit mask are stored in an AMD
- * core performance event-select register.  Intel's format is similar,
- * but the event selector is only 8 bits.
- */
-#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
-			      (umask & 0xff) << 8)
-
-/*
- * "Branch instructions retired", from the Intel SDM, volume 3,
- * "Pre-defined Architectural Performance Events."
- */
-
-#define INTEL_BR_RETIRED EVENT(0xc4, 0)
-
-/*
- * "Retired branch instructions", from Processor Programming Reference
- * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
- * Preliminary Processor Programming Reference (PPR) for AMD Family
- * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
- * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
- * B1 Processors Volume 1 of 2.
- */
-
-#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
-
-
-/*
- * "Retired instructions", from Processor Programming Reference
- * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
- * Preliminary Processor Programming Reference (PPR) for AMD Family
- * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
- * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
- * B1 Processors Volume 1 of 2.
- * --- and ---
- * "Instructions retired", from the Intel SDM, volume 3,
- * "Pre-defined Architectural Performance Events."
- */
-
-#define INST_RETIRED EVENT(0xc0, 0)
+#define PMU_EVENT_FILTER_INVALID_NEVENTS (KVM_PMU_EVENT_FILTER_MAX_EVENTS + 1)
 
 struct __kvm_pmu_event_filter {
 	__u32 action;
···
 	__u32 fixed_counter_bitmap;
 	__u32 flags;
 	__u32 pad[4];
-	__u64 events[MAX_FILTER_EVENTS];
+	__u64 events[KVM_PMU_EVENT_FILTER_MAX_EVENTS];
 };
 
 /*
- * This event list comprises Intel's eight architectural events plus
- * AMD's "retired branch instructions" for Zen[123] (and possibly
- * other AMD CPUs).
+ * This event list comprises Intel's known architectural events, plus AMD's
+ * "retired branch instructions" for Zen1-Zen3 (and possibly other AMD CPUs).
+ * Note, AMD and Intel use the same encoding for instructions retired.
  */
+kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED);
+
 static const struct __kvm_pmu_event_filter base_event_filter = {
 	.nevents = ARRAY_SIZE(base_event_filter.events),
 	.events = {
-		EVENT(0x3c, 0),
-		INST_RETIRED,
-		EVENT(0x3c, 1),
-		EVENT(0x2e, 0x4f),
-		EVENT(0x2e, 0x41),
-		EVENT(0xc4, 0),
-		EVENT(0xc5, 0),
-		EVENT(0xa4, 1),
-		AMD_ZEN_BR_RETIRED,
+		INTEL_ARCH_CPU_CYCLES,
+		INTEL_ARCH_INSTRUCTIONS_RETIRED,
+		INTEL_ARCH_REFERENCE_CYCLES,
+		INTEL_ARCH_LLC_REFERENCES,
+		INTEL_ARCH_LLC_MISSES,
+		INTEL_ARCH_BRANCHES_RETIRED,
+		INTEL_ARCH_BRANCHES_MISPREDICTED,
+		INTEL_ARCH_TOPDOWN_SLOTS,
+		AMD_ZEN_BRANCHES_RETIRED,
 	},
 };
···
 	for (;;) {
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 		wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_BRANCHES_RETIRED);
 		wrmsr(MSR_P6_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | INST_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_INSTRUCTIONS_RETIRED);
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
 
 		run_and_measure_loop(MSR_IA32_PMC0);
···
 	for (;;) {
 		wrmsr(MSR_K7_EVNTSEL0, 0);
 		wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BRANCHES_RETIRED);
 		wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | INST_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_INSTRUCTIONS_RETIRED);
 
 		run_and_measure_loop(MSR_K7_PERFCTR0);
 		GUEST_SYNC(0);
···
 		.action = KVM_PMU_EVENT_DENY,
 		.nevents = 1,
 		.events = {
-			EVENT(0x1C2, 0),
+			RAW_EVENT(0x1C2, 0),
 		},
 	};
···
 
 	f.action = KVM_PMU_EVENT_DENY;
 
-	remove_event(&f, INST_RETIRED);
-	remove_event(&f, INTEL_BR_RETIRED);
-	remove_event(&f, AMD_ZEN_BR_RETIRED);
+	remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED);
+	remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED);
+	remove_event(&f, AMD_ZEN_BRANCHES_RETIRED);
 	test_with_filter(vcpu, &f);
 
 	ASSERT_PMC_COUNTING_INSTRUCTIONS();
···
 
 	f.action = KVM_PMU_EVENT_ALLOW;
 
-	remove_event(&f, INST_RETIRED);
-	remove_event(&f, INTEL_BR_RETIRED);
-	remove_event(&f, AMD_ZEN_BR_RETIRED);
+	remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED);
+	remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED);
+	remove_event(&f, AMD_ZEN_BRANCHES_RETIRED);
 	test_with_filter(vcpu, &f);
 
 	ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS();
···
  * - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake.
  */
 #define MEM_INST_RETIRED 0xD0
-#define MEM_INST_RETIRED_LOAD EVENT(MEM_INST_RETIRED, 0x81)
-#define MEM_INST_RETIRED_STORE EVENT(MEM_INST_RETIRED, 0x82)
-#define MEM_INST_RETIRED_LOAD_STORE EVENT(MEM_INST_RETIRED, 0x83)
+#define MEM_INST_RETIRED_LOAD RAW_EVENT(MEM_INST_RETIRED, 0x81)
+#define MEM_INST_RETIRED_STORE RAW_EVENT(MEM_INST_RETIRED, 0x82)
+#define MEM_INST_RETIRED_LOAD_STORE RAW_EVENT(MEM_INST_RETIRED, 0x83)
 
 static bool supports_event_mem_inst_retired(void)
 {
···
  * B1 Processors Volume 1 of 2.
  */
 #define LS_DISPATCH 0x29
-#define LS_DISPATCH_LOAD EVENT(LS_DISPATCH, BIT(0))
-#define LS_DISPATCH_STORE EVENT(LS_DISPATCH, BIT(1))
-#define LS_DISPATCH_LOAD_STORE EVENT(LS_DISPATCH, BIT(2))
+#define LS_DISPATCH_LOAD RAW_EVENT(LS_DISPATCH, BIT(0))
+#define LS_DISPATCH_STORE RAW_EVENT(LS_DISPATCH, BIT(1))
+#define LS_DISPATCH_LOAD_STORE RAW_EVENT(LS_DISPATCH, BIT(2))
 
 #define INCLUDE_MASKED_ENTRY(event_select, mask, match) \
 	KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false)
···
 
 static void test_masked_events(struct kvm_vcpu *vcpu)
 {
-	int nevents = MAX_FILTER_EVENTS - MAX_TEST_EVENTS;
-	uint64_t events[MAX_FILTER_EVENTS];
+	int nevents = KVM_PMU_EVENT_FILTER_MAX_EVENTS - MAX_TEST_EVENTS;
+	uint64_t events[KVM_PMU_EVENT_FILTER_MAX_EVENTS];
 
 	/* Run the test cases against a sparse PMU event filter. */
 	run_masked_events_tests(vcpu, events, 0);
 
 	/* Run the test cases against a dense PMU event filter. */
-	add_dummy_events(events, MAX_FILTER_EVENTS);
+	add_dummy_events(events, KVM_PMU_EVENT_FILTER_MAX_EVENTS);
 	run_masked_events_tests(vcpu, events, nevents);
 }
···
 	TEST_ASSERT(!r, "Masking non-existent fixed counters should be allowed");
 }
 
-static void intel_run_fixed_counter_guest_code(uint8_t fixed_ctr_idx)
+static void intel_run_fixed_counter_guest_code(uint8_t idx)
 {
 	for (;;) {
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + fixed_ctr_idx, 0);
+		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0);
 
 		/* Only OS_EN bit is enabled for fixed counter[idx]. */
-		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, BIT_ULL(4 * fixed_ctr_idx));
-		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL,
-		      BIT_ULL(INTEL_PMC_IDX_FIXED + fixed_ctr_idx));
+		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL));
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(idx));
 		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
-		GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + fixed_ctr_idx));
+		GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + idx));
 	}
 }
···
 	struct kvm_vcpu *vcpu, *vcpu2 = NULL;
 	struct kvm_vm *vm;
 
-	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
+	TEST_REQUIRE(kvm_is_pmu_enabled());
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER));
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS));
 
+2
tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
···
 
 	r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size);
 	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
+
+	close(memfd);
 }
 
 static void usage(const char *cmd)
+18 -42
tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
···
 #include "test_util.h"
 #include "kvm_util.h"
 #include "processor.h"
-#include "svm_util.h"
+#include "sev.h"
 #include "kselftest.h"
-
-#define SEV_POLICY_ES 0b100
 
 #define NR_MIGRATE_TEST_VCPUS 4
 #define NR_MIGRATE_TEST_VMS 3
···
 
 bool have_sev_es;
 
-static int __sev_ioctl(int vm_fd, int cmd_id, void *data, __u32 *fw_error)
-{
-	struct kvm_sev_cmd cmd = {
-		.id = cmd_id,
-		.data = (uint64_t)data,
-		.sev_fd = open_sev_dev_path_or_exit(),
-	};
-	int ret;
-
-	ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
-	*fw_error = cmd.error;
-	return ret;
-}
-
-static void sev_ioctl(int vm_fd, int cmd_id, void *data)
-{
-	int ret;
-	__u32 fw_error;
-
-	ret = __sev_ioctl(vm_fd, cmd_id, data, &fw_error);
-	TEST_ASSERT(ret == 0 && fw_error == SEV_RET_SUCCESS,
-		    "%d failed: return code: %d, errno: %d, fw error: %d",
-		    cmd_id, ret, errno, fw_error);
-}
-
 static struct kvm_vm *sev_vm_create(bool es)
 {
 	struct kvm_vm *vm;
-	struct kvm_sev_launch_start start = { 0 };
 	int i;
 
 	vm = vm_create_barebones();
-	sev_ioctl(vm->fd, es ? KVM_SEV_ES_INIT : KVM_SEV_INIT, NULL);
+	if (!es)
+		sev_vm_init(vm);
+	else
+		sev_es_vm_init(vm);
+
 	for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
 		__vm_vcpu_add(vm, i);
+
+	sev_vm_launch(vm, es ? SEV_POLICY_ES : 0);
+
 	if (es)
-		start.policy |= SEV_POLICY_ES;
-	sev_ioctl(vm->fd, KVM_SEV_LAUNCH_START, &start);
-	if (es)
-		sev_ioctl(vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+		vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
 	return vm;
 }
···
 	sev_vm = sev_vm_create(/* es= */ false);
 	sev_es_vm = sev_vm_create(/* es= */ true);
 	sev_es_vm_no_vmsa = vm_create_barebones();
-	sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL);
+	sev_es_vm_init(sev_es_vm_no_vmsa);
 	__vm_vcpu_add(sev_es_vm_no_vmsa, 1);
 
 	ret = __sev_migrate_from(sev_vm, sev_es_vm);
···
 	TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d", ret, errno);
 }
 
-static void verify_mirror_allowed_cmds(int vm_fd)
+static void verify_mirror_allowed_cmds(struct kvm_vm *vm)
 {
 	struct kvm_sev_guest_status status;
+	int cmd_id;
 
-	for (int cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) {
+	for (cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) {
 		int ret;
-		__u32 fw_error;
 
 		/*
 		 * These commands are allowed for mirror VMs, all others are
···
 		 * These commands should be disallowed before the data
 		 * parameter is examined so NULL is OK here.
 		 */
-		ret = __sev_ioctl(vm_fd, cmd_id, NULL, &fw_error);
+		ret = __vm_sev_ioctl(vm, cmd_id, NULL);
 		TEST_ASSERT(
 			ret == -1 && errno == EINVAL,
 			"Should not be able call command: %d. ret: %d, errno: %d",
 			cmd_id, ret, errno);
 	}
 
-	sev_ioctl(vm_fd, KVM_SEV_GUEST_STATUS, &status);
+	vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status);
 }
 
 static void test_sev_mirror(bool es)
···
 		__vm_vcpu_add(dst_vm, i);
 
 	if (es)
-		sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+		vm_sev_ioctl(dst_vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
 
-	verify_mirror_allowed_cmds(dst_vm->fd);
+	verify_mirror_allowed_cmds(dst_vm);
 
 	kvm_vm_free(src_vm);
 	kvm_vm_free(dst_vm);
+88
tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
+// SPDX-License-Identifier: GPL-2.0-only
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "linux/psp-sev.h"
+#include "sev.h"
+
+
+static void guest_sev_es_code(void)
+{
+	/* TODO: Check CPUID after GHCB-based hypercall support is added. */
+	GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED);
+	GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ES_ENABLED);
+
+	/*
+	 * TODO: Add GHCB and ucall support for SEV-ES guests.  For now, simply
+	 * force "termination" to signal "done" via the GHCB MSR protocol.
+	 */
+	wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ);
+	__asm__ __volatile__("rep; vmmcall");
+}
+
+static void guest_sev_code(void)
+{
+	GUEST_ASSERT(this_cpu_has(X86_FEATURE_SEV));
+	GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED);
+
+	GUEST_DONE();
+}
+
+static void test_sev(void *guest_code, uint64_t policy)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct ucall uc;
+
+	vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu);
+
+	for (;;) {
+		vcpu_run(vcpu);
+
+		if (policy & SEV_POLICY_ES) {
+			TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT,
+				    "Wanted SYSTEM_EVENT, got %s",
+				    exit_reason_str(vcpu->run->exit_reason));
+			TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM);
+			TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1);
+			TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ);
+			break;
+		}
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_SYNC:
+			continue;
+		case UCALL_DONE:
+			return;
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+		default:
+			TEST_FAIL("Unexpected exit: %s",
+				  exit_reason_str(vcpu->run->exit_reason));
+		}
+	}
+
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
+
+	test_sev(guest_sev_code, SEV_POLICY_NO_DBG);
+	test_sev(guest_sev_code, 0);
+
+	if (kvm_cpu_has(X86_FEATURE_SEV_ES)) {
+		test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG);
+		test_sev(guest_sev_es_code, SEV_POLICY_ES);
+	}
+
+	return 0;
+}
+1 -1
tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
···
 	vm_init_descriptor_tables(vm);
 	vcpu_init_descriptor_tables(vcpu);
 
-	vcpu_set_cpuid_maxphyaddr(vcpu, MAXPHYADDR);
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR);
 
 	rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE);
 	TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable");
+90 -31
tools/testing/selftests/kvm/x86_64/sync_regs_test.c
···
 #include <sys/ioctl.h>
 #include <pthread.h>
 
+#include "kvm_test_harness.h"
 #include "test_util.h"
 #include "kvm_util.h"
 #include "processor.h"
···
 		      : : [port] "d" (UCALL_PIO_PORT), "D" (&uc_none)
 		      : "rax", "rbx");
 }
+
+KVM_ONE_VCPU_TEST_SUITE(sync_regs_test);
 
 static void compare_regs(struct kvm_regs *left, struct kvm_regs *right)
 {
···
 	return NULL;
 }
 
-static void race_sync_regs(void *racer)
+static void race_sync_regs(struct kvm_vcpu *vcpu, void *racer)
 {
 	const time_t TIMEOUT = 2; /* seconds, roughly */
 	struct kvm_x86_state *state;
 	struct kvm_translation tr;
-	struct kvm_vcpu *vcpu;
 	struct kvm_run *run;
-	struct kvm_vm *vm;
 	pthread_t thread;
 	time_t t;
 
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 	run = vcpu->run;
 
 	run->kvm_valid_regs = KVM_SYNC_X86_SREGS;
···
 	TEST_ASSERT_EQ(pthread_join(thread, NULL), 0);
 
 	kvm_x86_state_cleanup(state);
-	kvm_vm_free(vm);
 }
 
-int main(int argc, char *argv[])
+KVM_ONE_VCPU_TEST(sync_regs_test, read_invalid, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-	struct kvm_run *run;
-	struct kvm_regs regs;
-	struct kvm_sregs sregs;
-	struct kvm_vcpu_events events;
-	int rv, cap;
-
-	cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
-	TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS);
-	TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD));
-
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-
-	run = vcpu->run;
+	struct kvm_run *run = vcpu->run;
+	int rv;
 
 	/* Request reading invalid register set from VCPU. */
 	run->kvm_valid_regs = INVALID_SYNC_FIELD;
···
 		    "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d",
 		    rv);
 	run->kvm_valid_regs = 0;
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, set_invalid, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	int rv;
 
 	/* Request setting invalid register set into VCPU. */
 	run->kvm_dirty_regs = INVALID_SYNC_FIELD;
···
 		    "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d",
 		    rv);
 	run->kvm_dirty_regs = 0;
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, req_and_verify_all_valid, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_vcpu_events events;
+	struct kvm_sregs sregs;
+	struct kvm_regs regs;
 
 	/* Request and verify all valid register sets. */
 	/* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
 	run->kvm_valid_regs = TEST_SYNC_FIELDS;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	vcpu_regs_get(vcpu, &regs);
···
 
 	vcpu_events_get(vcpu, &events);
 	compare_vcpu_events(&events, &run->s.regs.events);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, set_and_verify_various, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_vcpu_events events;
+	struct kvm_sregs sregs;
+	struct kvm_regs regs;
+
+	/* Run once to get register set */
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	/* Set and verify various register values. */
 	run->s.regs.regs.rbx = 0xBAD1DEA;
···
 
 	run->kvm_valid_regs = TEST_SYNC_FIELDS;
 	run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1,
 		    "rbx sync regs value incorrect 0x%llx.",
···
 
 	vcpu_events_get(vcpu, &events);
 	compare_vcpu_events(&events, &run->s.regs.events);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_dirty_regs_bits, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
 
 	/* Clear kvm_dirty_regs bits, verify new s.regs values are
 	 * overwritten with existing guest values.
···
 	run->kvm_valid_regs = TEST_SYNC_FIELDS;
 	run->kvm_dirty_regs = 0;
 	run->s.regs.regs.rbx = 0xDEADBEEF;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF,
 		    "rbx sync regs value incorrect 0x%llx.",
 		    run->s.regs.regs.rbx);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_and_dirty_regs, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_regs regs;
+
+	/* Run once to get register set */
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	/* Clear kvm_valid_regs bits and kvm_dirty_bits.
 	 * Verify s.regs values are not overwritten with existing guest values
···
 	run->kvm_valid_regs = 0;
 	run->kvm_dirty_regs = 0;
 	run->s.regs.regs.rbx = 0xAAAA;
+	vcpu_regs_get(vcpu, &regs);
 	regs.rbx = 0xBAC0;
 	vcpu_regs_set(vcpu, &regs);
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA,
 		    "rbx sync regs value incorrect 0x%llx.",
···
 	TEST_ASSERT(regs.rbx == 0xBAC0 + 1,
 		    "rbx guest value incorrect 0x%llx.",
 		    regs.rbx);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_regs_bits, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_regs regs;
+
+	/* Run once to get register set */
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	/* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten
 	 * with existing guest values but that guest values are overwritten
···
 	run->kvm_valid_regs = 0;
 	run->kvm_dirty_regs = TEST_SYNC_FIELDS;
 	run->s.regs.regs.rbx = 0xBBBB;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB,
 		    "rbx sync regs value incorrect 0x%llx.",
···
 	TEST_ASSERT(regs.rbx == 0xBBBB + 1,
 		    "rbx guest value incorrect 0x%llx.",
 		    regs.rbx);
+}
 
-	kvm_vm_free(vm);
+KVM_ONE_VCPU_TEST(sync_regs_test, race_cr4, guest_code)
+{
+	race_sync_regs(vcpu, race_sregs_cr4);
+}
 
-	race_sync_regs(race_sregs_cr4);
-	race_sync_regs(race_events_exc);
-	race_sync_regs(race_events_inj_pen);
+KVM_ONE_VCPU_TEST(sync_regs_test, race_exc, guest_code)
+{
+	race_sync_regs(vcpu, race_events_exc);
+}
 
-	return 0;
+KVM_ONE_VCPU_TEST(sync_regs_test, race_inj_pen, guest_code)
+{
+	race_sync_regs(vcpu, race_events_inj_pen);
+}
+
+int main(int argc, char *argv[])
+{
+	int cap;
+
+	cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+	TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS);
+	TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD));
+
+	return test_harness_run(argc, argv);
 }
+21 -57
tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
···
 #define _GNU_SOURCE /* for program_invocation_short_name */
 #include <sys/ioctl.h>
 
+#include "kvm_test_harness.h"
 #include "test_util.h"
 #include "kvm_util.h"
 #include "vmx.h"
 
-/* Forced emulation prefix, used to invoke the emulator unconditionally. */
-#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
-#define KVM_FEP_LENGTH 5
-static int fep_available = 1;
+static bool fep_available;
 
 #define MSR_NON_EXISTENT 0x474f4f00
···
 	GUEST_ASSERT(data == 2);
 	GUEST_ASSERT(guest_exception_count == 0);
 
-	/*
-	 * Test to see if the instruction emulator is available (ie: the module
-	 * parameter 'kvm.force_emulation_prefix=1' is set).  This instruction
-	 * will #UD if it isn't available.
-	 */
-	__asm__ __volatile__(KVM_FEP "nop");
-
 	if (fep_available) {
 		/* Let userspace know we aren't done. */
 		GUEST_SYNC(0);
···
 {
 	__guest_gp_handler(regs, &em_rdmsr_start, &em_rdmsr_end,
 			   &em_wrmsr_start, &em_wrmsr_end);
-}
-
-static void guest_ud_handler(struct ex_regs *regs)
-{
-	fep_available = 0;
-	regs->rip += KVM_FEP_LENGTH;
 }
 
 static void check_for_guest_assert(struct kvm_vcpu *vcpu)
···
 	process_ucall_done(vcpu);
 }
 
-static void test_msr_filter_allow(void)
+KVM_ONE_VCPU_TEST_SUITE(user_msr);
+
+KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
+	struct kvm_vm *vm = vcpu->vm;
+	uint64_t cmd;
 	int rc;
 
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code_filter_allow);
+	sync_global_to_guest(vm, fep_available);
 
 	rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
 	TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
···
 	run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT);
 	run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT);
 
-	vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
 	vcpu_run(vcpu);
-	vm_install_exception_handler(vm, UD_VECTOR, NULL);
+	cmd = process_ucall(vcpu);
 
-	if (process_ucall(vcpu) != UCALL_DONE) {
+	if (fep_available) {
+		TEST_ASSERT_EQ(cmd, UCALL_SYNC);
 		vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler);
 
 		/* Process emulated rdmsr and wrmsr instructions. */
···
 		/* Confirm the guest completed without issues. */
 		run_guest_then_process_ucall_done(vcpu);
 	} else {
+		TEST_ASSERT_EQ(cmd, UCALL_DONE);
 		printf("To run the instruction emulated tests set the module parameter 'kvm.force_emulation_prefix=1'\n");
 	}
-
-	kvm_vm_free(vm);
 }
 
 static int handle_ucall(struct kvm_vcpu *vcpu)
···
 	}
 }
 
-static void test_msr_filter_deny(void)
+KVM_ONE_VCPU_TEST(user_msr, msr_filter_deny, guest_code_filter_deny)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-	struct kvm_run *run;
+	struct kvm_vm *vm = vcpu->vm;
+	struct kvm_run *run = vcpu->run;
 	int rc;
-
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code_filter_deny);
-	run = vcpu->run;
 
 	rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
 	TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
···
 done:
 	TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space");
 	TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space");
-
-	kvm_vm_free(vm);
 }
 
-static void test_msr_permission_bitmap(void)
+KVM_ONE_VCPU_TEST(user_msr, msr_permission_bitmap, guest_code_permission_bitmap)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
+	struct kvm_vm *vm = vcpu->vm;
 	int rc;
-
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code_permission_bitmap);
 
 	rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
 	TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
···
 	vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs);
 	run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE);
 	run_guest_then_process_ucall_done(vcpu);
-
-	kvm_vm_free(vm);
 }
 
 #define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask)	\
···
 }
 
 /* Test that attempts to write to the unused bits in a flag fails. */
-static void test_user_exit_msr_flags(void)
+KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-
-	vm = vm_create_with_one_vcpu(&vcpu, NULL);
+	struct kvm_vm *vm = vcpu->vm;
 
 	/* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */
 	run_user_space_msr_flag_test(vm);
 
 	/* Test flags and range flags for KVM_X86_SET_MSR_FILTER. */
 	run_msr_filter_flag_test(vm);
-
-	kvm_vm_free(vm);
 }
 
 int main(int argc, char *argv[])
 {
-	test_msr_filter_allow();
+	fep_available = kvm_is_forced_emulation_enabled();
 
-	test_msr_filter_deny();
-
-	test_msr_permission_bitmap();
-
-	test_user_exit_msr_flags();
-
-	return 0;
+	return test_harness_run(argc, argv);
 }
+13 -41
tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
···
 
 #include <linux/bitmap.h>
 
+#include "kvm_test_harness.h"
 #include "kvm_util.h"
 #include "vmx.h"
 
-union perf_capabilities {
+static union perf_capabilities {
 	struct {
 		u64	lbr_format:6;
 		u64	pebs_trap:1;
···
 		u64	anythread_deprecated:1;
 	};
 	u64	capabilities;
-};
+} host_cap;
 
 /*
  * The LBR format and most PEBS features are immutable, all other features are
···
 	GUEST_DONE();
 }
 
+KVM_ONE_VCPU_TEST_SUITE(vmx_pmu_caps);
+
 /*
  * Verify that guest WRMSRs to PERF_CAPABILITIES #GP regardless of the value
  * written, that the guest always sees the userspace controlled value, and that
  * PERF_CAPABILITIES is immutable after KVM_RUN.
  */
-static void test_guest_wrmsr_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 	struct ucall uc;
 	int r, i;
 
-	vm_init_descriptor_tables(vm);
+	vm_init_descriptor_tables(vcpu->vm);
 	vcpu_init_descriptor_tables(vcpu);
 
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
···
 		TEST_ASSERT(!r, "Post-KVM_RUN write '0x%llx'didn't fail",
 			    host_cap.capabilities ^ BIT_ULL(i));
 	}
-
-	kvm_vm_free(vm);
 }
 
 /*
  * Verify KVM allows writing PERF_CAPABILITIES with all KVM-supported features
  * enabled, as well as '0' (to disable all features).
  */
-static void test_basic_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, basic_perf_capabilities, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, NULL);
-
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0);
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
-
-	kvm_vm_free(vm);
 }
 
-static void test_fungible_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, fungible_perf_capabilities, guest_code)
 {
 	const uint64_t fungible_caps = host_cap.capabilities & ~immutable_caps.capabilities;
-
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, NULL);
 	int bit;
 
 	for_each_set_bit(bit, &fungible_caps, 64) {
···
 			     host_cap.capabilities & ~BIT_ULL(bit));
 	}
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
-
-	kvm_vm_free(vm);
 }
 
 /*
···
 * separately as they are multi-bit values, e.g. toggling or setting a single
 * bit can generate a false positive without dedicated safeguards.
 */
-static void test_immutable_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, immutable_perf_capabilities, guest_code)
 {
 	const uint64_t reserved_caps = (~host_cap.capabilities |
 					immutable_caps.capabilities) &
 				       ~format_caps.capabilities;
-
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, NULL);
 	union perf_capabilities val = host_cap;
 	int r, bit;
···
 		TEST_ASSERT(!r, "Bad PEBS FMT = 0x%x didn't fail, host = 0x%x",
 			    val.pebs_format, host_cap.pebs_format);
 	}
-
-	kvm_vm_free(vm);
 }
 
 /*
···
 * LBR_TOS as those bits are writable across all uarch implementations (arch
 * LBRs will need to poke a different MSR).
 */
-static void test_lbr_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, lbr_perf_capabilities, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
 	int r;
 
 	if (!host_cap.lbr_format)
 		return;
-
-	vm = vm_create_with_one_vcpu(&vcpu, NULL);
 
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
 	vcpu_set_msr(vcpu, MSR_LBR_TOS, 7);
···
 
 	r = _vcpu_set_msr(vcpu, MSR_LBR_TOS, 7);
 	TEST_ASSERT(!r, "Writing LBR_TOS should fail after disabling vPMU");
-
-	kvm_vm_free(vm);
 }
 
 int main(int argc, char *argv[])
 {
-	union perf_capabilities host_cap;
-
-	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
+	TEST_REQUIRE(kvm_is_pmu_enabled());
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));
 
 	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
···
 	TEST_ASSERT(host_cap.full_width_write,
 		    "Full-width writes should always be supported");
 
-	test_basic_perf_capabilities(host_cap);
-	test_fungible_perf_capabilities(host_cap);
-	test_immutable_perf_capabilities(host_cap);
-	test_guest_wrmsr_perf_capabilities(host_cap);
-	test_lbr_perf_capabilities(host_cap);
+	return test_harness_run(argc, argv);
 }
+50 -9
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
···
 	TEST_POLL_TIMEOUT,
 	TEST_POLL_MASKED,
 	TEST_POLL_WAKE,
+	SET_VCPU_INFO,
 	TEST_TIMER_PAST,
 	TEST_LOCKING_SEND_RACE,
 	TEST_LOCKING_POLL_RACE,
···
 
 	GUEST_SYNC(TEST_POLL_WAKE);
 
+	/* Set the vcpu_info to point at exactly the place it already is to
+	 * make sure the attribute is functional. */
+	GUEST_SYNC(SET_VCPU_INFO);
+
 	/* A timer wake an *unmasked* port which should wake us with an
 	 * actual interrupt, while we're polling on a different port. */
 	ports[0]++;
···
 	return 0;
 }
 
+static struct shared_info *shinfo;
 static struct vcpu_info *vinfo;
 static struct kvm_vcpu *vcpu;
···
 {
 	struct kvm_vm *vm = (struct kvm_vm *)arg;
 
-	struct kvm_xen_hvm_attr cache_activate = {
+	struct kvm_xen_hvm_attr cache_activate_gfn = {
 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
 	};
 
-	struct kvm_xen_hvm_attr cache_deactivate = {
+	struct kvm_xen_hvm_attr cache_deactivate_gfn = {
 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
 		.u.shared_info.gfn = KVM_XEN_INVALID_GFN
 	};
 
+	struct kvm_xen_hvm_attr cache_activate_hva = {
+		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA,
+		.u.shared_info.hva = (unsigned long)shinfo
+	};
+
+	struct kvm_xen_hvm_attr cache_deactivate_hva = {
+		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+		.u.shared_info.hva = 0
+	};
+
+	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+
 	for (;;) {
-		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate);
-		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate);
+		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_gfn);
 		pthread_testcancel();
+		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_gfn);
+
+		if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) {
+			__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_hva);
+			pthread_testcancel();
+			__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_hva);
+		}
 	}
 
 	return NULL;
···
 	bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG);
 	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
 	bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
+	bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA);
 
 	clock_gettime(CLOCK_REALTIME, &min_ts);
···
 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0);
 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3);
 
-	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
+	shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
 
 	int zero_fd = open("/dev/zero", O_RDONLY);
 	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
···
 			    "Failed to read back RUNSTATE_UPDATE_FLAG attr");
 	}
 
-	struct kvm_xen_hvm_attr ha = {
-		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
-		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE,
-	};
+	struct kvm_xen_hvm_attr ha = {};
+
+	if (has_shinfo_hva) {
+		ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA;
+		ha.u.shared_info.hva = (unsigned long)shinfo;
+	} else {
+		ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO;
+		ha.u.shared_info.gfn = SHINFO_ADDR / PAGE_SIZE;
+	}
+
 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
 
 	/*
···
 		TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
 			    "Timer not reported pending");
 		alarm(1);
+		break;
+
+	case SET_VCPU_INFO:
+		if (has_shinfo_hva) {
+			struct kvm_xen_vcpu_attr vih = {
+				.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA,
+				.u.hva = (unsigned long)vinfo
+			};
+			vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vih);
+		}
 		break;
 
 	case TEST_TIMER_PAST:
+4 -3
virt/kvm/Kconfig
···
 # SPDX-License-Identifier: GPL-2.0
 # KVM common configuration items and defaults
 
-config HAVE_KVM
-	bool
-
 config KVM_COMMON
 	bool
 	select EVENTFD
···
 config HAVE_KVM_MSI
 	bool
 
+config HAVE_KVM_READONLY_MEM
+	bool
+
 config HAVE_KVM_CPU_RELAX_INTERCEPT
 	bool
···
 
 config HAVE_KVM_IRQ_BYPASS
 	bool
+	select IRQ_BYPASS_MANAGER
 
 config HAVE_KVM_VCPU_ASYNC_IOCTL
 	bool
+49 -24
virt/kvm/async_pf.c
···
 {
         struct kvm_async_pf *apf =
                 container_of(work, struct kvm_async_pf, work);
-        struct mm_struct *mm = apf->mm;
         struct kvm_vcpu *vcpu = apf->vcpu;
+        struct mm_struct *mm = vcpu->kvm->mm;
         unsigned long addr = apf->addr;
         gpa_t cr2_or_gpa = apf->cr2_or_gpa;
         int locked = 1;
···
         might_sleep();
 
         /*
-         * This work is run asynchronously to the task which owns
-         * mm and might be done in another context, so we must
-         * access remotely.
+         * Attempt to pin the VM's host address space, and simply skip gup() if
+         * acquiring a pin fails, i.e. if the process is exiting.  Note, KVM
+         * holds a reference to its associated mm_struct until the very end of
+         * kvm_destroy_vm(), i.e. the struct itself won't be freed before this
+         * work item is fully processed.
          */
-        mmap_read_lock(mm);
-        get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
-        if (locked)
-                mmap_read_unlock(mm);
+        if (mmget_not_zero(mm)) {
+                mmap_read_lock(mm);
+                get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
+                if (locked)
+                        mmap_read_unlock(mm);
+                mmput(mm);
+        }
 
+        /*
+         * Notify and kick the vCPU even if faulting in the page failed, e.g.
+         * so that the vCPU can retry the fault synchronously.
+         */
         if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
                 kvm_arch_async_page_present(vcpu, apf);
···
         apf->vcpu = NULL;
         spin_unlock(&vcpu->async_pf.lock);
 
+        /*
+         * The apf struct may be freed by kvm_check_async_pf_completion() as
+         * soon as the lock is dropped.  Nullify it to prevent improper usage.
+         */
+        apf = NULL;
+
         if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
                 kvm_arch_async_page_present_queued(vcpu);
-
-        /*
-         * apf may be freed by kvm_check_async_pf_completion() after
-         * this point
-         */
 
         trace_kvm_async_pf_completed(addr, cr2_or_gpa);
 
         __kvm_vcpu_wake_up(vcpu);
+}
 
-        mmput(mm);
-        kvm_put_kvm(vcpu->kvm);
+static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
+{
+        /*
+         * The async #PF is "done", but KVM must wait for the work item itself,
+         * i.e. async_pf_execute(), to run to completion.  If KVM is a module,
+         * KVM must ensure *no* code owned by the KVM (the module) can be run
+         * after the last call to module_put().  Note, flushing the work item
+         * is always required when the item is taken off the completion queue.
+         * E.g. even if the vCPU handles the item in the "normal" path, the VM
+         * could be terminated before async_pf_execute() completes.
+         *
+         * Wake all events skip the queue and go straight to done, i.e. don't
+         * need to be flushed (but sanity check that the work wasn't queued).
+         */
+        if (work->wakeup_all)
+                WARN_ON_ONCE(work->work.func);
+        else
+                flush_work(&work->work);
+        kmem_cache_free(async_pf_cache, work);
 }
 
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
···
 #ifdef CONFIG_KVM_ASYNC_PF_SYNC
         flush_work(&work->work);
 #else
-        if (cancel_work_sync(&work->work)) {
-                mmput(work->mm);
-                kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
+        if (cancel_work_sync(&work->work))
                 kmem_cache_free(async_pf_cache, work);
-        }
 #endif
         spin_lock(&vcpu->async_pf.lock);
 }
···
                 list_first_entry(&vcpu->async_pf.done,
                                  typeof(*work), link);
                 list_del(&work->link);
-                kmem_cache_free(async_pf_cache, work);
+
+                spin_unlock(&vcpu->async_pf.lock);
+                kvm_flush_and_free_async_pf_work(work);
+                spin_lock(&vcpu->async_pf.lock);
         }
         spin_unlock(&vcpu->async_pf.lock);
 
···
 
                 list_del(&work->queue);
                 vcpu->async_pf.queued--;
-                kmem_cache_free(async_pf_cache, work);
+                kvm_flush_and_free_async_pf_work(work);
         }
 }
 
···
         work->cr2_or_gpa = cr2_or_gpa;
         work->addr = hva;
         work->arch = *arch;
-        work->mm = current->mm;
-        mmget(work->mm);
-        kvm_get_kvm(work->vcpu->kvm);
 
         INIT_WORK(&work->work, async_pf_execute);
 
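The new kvm_flush_and_free_async_pf_work() encodes a general rule: a work item must be flushed (allowed to run to completion) before the structure embedding it is freed, even when the operation is already logically "done". A minimal userspace sketch of the same rule, substituting pthreads for the kernel workqueue (illustrative only; struct async_work, run_once(), and friends are invented here, not KVM code):

```c
/*
 * Sketch of the "flush before free" rule: the struct embedding an
 * asynchronous work item must not be freed until the worker function
 * can no longer run, which pthread_join() guarantees here the same way
 * flush_work() does for a kernel work item.
 */
#include <pthread.h>
#include <stdlib.h>

struct async_work {
	pthread_t thread;
	int done;	/* set by the worker, akin to the completion queue */
};

static void *worker(void *arg)
{
	struct async_work *work = arg;

	work->done = 1;	/* the asynchronous part of the job */
	return NULL;
}

/* Analogous to flush_work() + kmem_cache_free(): join, then free. */
static int flush_and_free(struct async_work *work)
{
	int done;

	pthread_join(work->thread, NULL);	/* worker can't run anymore */
	done = work->done;
	free(work);				/* freeing is now safe */
	return done;
}

static int run_once(void)
{
	struct async_work *work = calloc(1, sizeof(*work));

	if (!work || pthread_create(&work->thread, NULL, worker, work))
		abort();
	return flush_and_free(work);
}
```

Freeing before the join would be a use-after-free if the worker were still touching the struct, which is exactly the race the helper above closes for work items pulled off the completion queue.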
+27 -10
virt/kvm/kvm_main.c
···
         if (WARN_ON_ONCE(!capacity))
                 return -EIO;
 
-        mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+        mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
         if (!mc->objects)
                 return -ENOMEM;
 
···
 
         /* Pairs with the increment in range_start(). */
         spin_lock(&kvm->mn_invalidate_lock);
-        wake = (--kvm->mn_active_invalidate_count == 0);
+        if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
+                --kvm->mn_active_invalidate_count;
+        wake = !kvm->mn_active_invalidate_count;
         spin_unlock(&kvm->mn_invalidate_lock);
 
         /*
···
                                     &stat_fops_per_vm);
         }
 
-        ret = kvm_arch_create_vm_debugfs(kvm);
-        if (ret)
-                goto out_err;
-
+        kvm_arch_create_vm_debugfs(kvm);
         return 0;
 out_err:
         kvm_destroy_vm_debugfs(kvm);
···
  * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
  * a per-arch destroy interface is not needed.
  */
-int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
+void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
 {
-        return 0;
 }
 
 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
···
         if (mem->flags & KVM_MEM_GUEST_MEMFD)
                 valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
 
-#ifdef __KVM_HAVE_READONLY_MEM
+#ifdef CONFIG_HAVE_KVM_READONLY_MEM
         /*
          * GUEST_MEMFD is incompatible with read-only memslots, as writes to
          * read-only memslots have emulated MMIO, not page fault, semantics,
···
         return false;
 }
 
+/*
+ * By default, simply query the target vCPU's current mode when checking if a
+ * vCPU was preempted in kernel mode.  All architectures except x86 (or more
+ * specifically, except VMX) allow querying whether or not a vCPU is in kernel
+ * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
+ * directly for cross-vCPU checks is functionally correct and accurate.
+ */
+bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+        return kvm_arch_vcpu_in_kernel(vcpu);
+}
+
 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
         return false;
···
                         continue;
                 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
                         continue;
+
+                /*
+                 * Treat the target vCPU as being in-kernel if it has a
+                 * pending interrupt, as the vCPU trying to yield may
+                 * be spinning waiting on IPI delivery, i.e. the target
+                 * vCPU is in-kernel for the purposes of directed yield.
+                 */
                 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
                     !kvm_arch_dy_has_pending_interrupt(vcpu) &&
-                    !kvm_arch_vcpu_in_kernel(vcpu))
+                    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
                         continue;
                 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                         continue;
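The first hunk above swaps the arguments to kvmalloc_array(), which, like calloc(), takes the element count first and the element size second so that the count participates in the multiplication-overflow check the way the API intends. A userspace sketch of such an overflow-checked array allocation (illustrative only; alloc_array() is an invented name, not the kernel implementation):

```c
/*
 * Overflow-checked array allocation in the calloc()/kvmalloc_array()
 * argument order: (element count, element size).  The check rejects any
 * request where n * size would wrap around SIZE_MAX.
 */
#include <stdint.h>
#include <stdlib.h>

static void *alloc_array(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)
		return NULL;	/* n * size would overflow */
	return malloc(n * size);
}
```

With the arguments reversed the multiplication still yields the same product, so the original code happened to work, but the conventional order keeps the overflow check meaningful and the call self-documenting.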
+142 -107
virt/kvm/pfncache.c
···
 void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                                        unsigned long end, bool may_block)
 {
-        DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
         struct gfn_to_pfn_cache *gpc;
-        bool evict_vcpus = false;
 
         spin_lock(&kvm->gpc_lock);
         list_for_each_entry(gpc, &kvm->gpc_list, list) {
-                write_lock_irq(&gpc->lock);
+                read_lock_irq(&gpc->lock);
 
                 /* Only a single page so no need to care about length */
                 if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
                     gpc->uhva >= start && gpc->uhva < end) {
-                        gpc->valid = false;
+                        read_unlock_irq(&gpc->lock);
 
                         /*
-                         * If a guest vCPU could be using the physical address,
-                         * it needs to be forced out of guest mode.
+                         * There is a small window here where the cache could
+                         * be modified, and invalidation would no longer be
+                         * necessary.  Hence check again whether invalidation
+                         * is still necessary once the write lock has been
+                         * acquired.
                          */
-                        if (gpc->usage & KVM_GUEST_USES_PFN) {
-                                if (!evict_vcpus) {
-                                        evict_vcpus = true;
-                                        bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
-                                }
-                                __set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
-                        }
+
+                        write_lock_irq(&gpc->lock);
+                        if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+                            gpc->uhva >= start && gpc->uhva < end)
+                                gpc->valid = false;
+                        write_unlock_irq(&gpc->lock);
+                        continue;
                 }
-                write_unlock_irq(&gpc->lock);
+
+                read_unlock_irq(&gpc->lock);
         }
         spin_unlock(&kvm->gpc_lock);
-
-        if (evict_vcpus) {
-                /*
-                 * KVM needs to ensure the vCPU is fully out of guest context
-                 * before allowing the invalidation to continue.
-                 */
-                unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE;
-                bool called;
-
-                /*
-                 * If the OOM reaper is active, then all vCPUs should have
-                 * been stopped already, so perform the request without
-                 * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd.
-                 */
-                if (!may_block)
-                        req &= ~KVM_REQUEST_WAIT;
-
-                called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
-
-                WARN_ON_ONCE(called && !may_block);
-        }
 }
 
 bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
···
         if (!gpc->active)
                 return false;
 
-        if ((gpc->gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+        /*
+         * If the page was cached from a memslot, make sure the memslots have
+         * not been re-configured.
+         */
+        if (!kvm_is_error_gpa(gpc->gpa) && gpc->generation != slots->generation)
                 return false;
 
-        if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva))
+        if (kvm_is_error_hva(gpc->uhva))
+                return false;
+
+        if (offset_in_page(gpc->uhva) + len > PAGE_SIZE)
                 return false;
 
         if (!gpc->valid)
···
 
         return true;
 }
-EXPORT_SYMBOL_GPL(kvm_gpc_check);
 
-static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva)
+static void *gpc_map(kvm_pfn_t pfn)
+{
+        if (pfn_valid(pfn))
+                return kmap(pfn_to_page(pfn));
+
+#ifdef CONFIG_HAS_IOMEM
+        return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+#else
+        return NULL;
+#endif
+}
+
+static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 {
         /* Unmap the old pfn/page if it was mapped before. */
-        if (!is_error_noslot_pfn(pfn) && khva) {
-                if (pfn_valid(pfn))
-                        kunmap(pfn_to_page(pfn));
-#ifdef CONFIG_HAS_IOMEM
-                else
-                        memunmap(khva);
-#endif
+        if (is_error_noslot_pfn(pfn) || !khva)
+                return;
+
+        if (pfn_valid(pfn)) {
+                kunmap(pfn_to_page(pfn));
+                return;
         }
+
+#ifdef CONFIG_HAS_IOMEM
+        memunmap(khva);
+#endif
 }
 
 static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
···
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
         /* Note, the new page offset may be different than the old! */
-        void *old_khva = gpc->khva - offset_in_page(gpc->khva);
+        void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
         kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
         void *new_khva = NULL;
         unsigned long mmu_seq;
···
          * the existing mapping and didn't create a new one.
          */
         if (new_khva != old_khva)
-                gpc_unmap_khva(new_pfn, new_khva);
+                gpc_unmap(new_pfn, new_khva);
 
         kvm_release_pfn_clean(new_pfn);
···
          * pfn.  Note, kmap() and memremap() can both sleep, so this
          * too must be done outside of gpc->lock!
          */
-        if (gpc->usage & KVM_HOST_USES_PFN) {
-                if (new_pfn == gpc->pfn) {
-                        new_khva = old_khva;
-                } else if (pfn_valid(new_pfn)) {
-                        new_khva = kmap(pfn_to_page(new_pfn));
-#ifdef CONFIG_HAS_IOMEM
-                } else {
-                        new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
-#endif
-                }
-                if (!new_khva) {
-                        kvm_release_pfn_clean(new_pfn);
-                        goto out_error;
-                }
+        if (new_pfn == gpc->pfn)
+                new_khva = old_khva;
+        else
+                new_khva = gpc_map(new_pfn);
+
+        if (!new_khva) {
+                kvm_release_pfn_clean(new_pfn);
+                goto out_error;
         }
 
         write_lock_irq(&gpc->lock);
···
 
         gpc->valid = true;
         gpc->pfn = new_pfn;
-        gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK);
+        gpc->khva = new_khva + offset_in_page(gpc->uhva);
 
         /*
          * Put the reference to the _new_ pfn.  The pfn is now tracked by the
···
         return -EFAULT;
 }
 
-static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
+static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
                              unsigned long len)
 {
-        struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
-        unsigned long page_offset = gpa & ~PAGE_MASK;
+        unsigned long page_offset;
         bool unmap_old = false;
         unsigned long old_uhva;
         kvm_pfn_t old_pfn;
+        bool hva_change = false;
         void *old_khva;
         int ret;
 
-        /*
-         * If must fit within a single page. The 'len' argument is
-         * only to enforce that.
-         */
-        if (page_offset + len > PAGE_SIZE)
+        /* Either gpa or uhva must be valid, but not both */
+        if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva)))
                 return -EINVAL;
 
         /*
-         * If another task is refreshing the cache, wait for it to complete.
-         * There is no guarantee that concurrent refreshes will see the same
-         * gpa, memslots generation, etc..., so they must be fully serialized.
+         * The cached access must fit within a single page.  The 'len' argument
+         * exists only to enforce that.
          */
-        mutex_lock(&gpc->refresh_lock);
+        page_offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) :
+                                              offset_in_page(gpa);
+        if (page_offset + len > PAGE_SIZE)
+                return -EINVAL;
+
+        lockdep_assert_held(&gpc->refresh_lock);
 
         write_lock_irq(&gpc->lock);
 
···
         }
 
         old_pfn = gpc->pfn;
-        old_khva = gpc->khva - offset_in_page(gpc->khva);
-        old_uhva = gpc->uhva;
+        old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
+        old_uhva = PAGE_ALIGN_DOWN(gpc->uhva);
 
-        /* If the userspace HVA is invalid, refresh that first */
-        if (gpc->gpa != gpa || gpc->generation != slots->generation ||
-            kvm_is_error_hva(gpc->uhva)) {
-                gfn_t gfn = gpa_to_gfn(gpa);
+        if (kvm_is_error_gpa(gpa)) {
+                gpc->gpa = INVALID_GPA;
+                gpc->memslot = NULL;
+                gpc->uhva = PAGE_ALIGN_DOWN(uhva);
 
-                gpc->gpa = gpa;
-                gpc->generation = slots->generation;
-                gpc->memslot = __gfn_to_memslot(slots, gfn);
-                gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
+                if (gpc->uhva != old_uhva)
+                        hva_change = true;
+        } else {
+                struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
 
-                if (kvm_is_error_hva(gpc->uhva)) {
-                        ret = -EFAULT;
-                        goto out;
+                if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+                    kvm_is_error_hva(gpc->uhva)) {
+                        gfn_t gfn = gpa_to_gfn(gpa);
+
+                        gpc->gpa = gpa;
+                        gpc->generation = slots->generation;
+                        gpc->memslot = __gfn_to_memslot(slots, gfn);
+                        gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
+
+                        if (kvm_is_error_hva(gpc->uhva)) {
+                                ret = -EFAULT;
+                                goto out;
+                        }
+
+                        /*
+                         * Even if the GPA and/or the memslot generation changed, the
+                         * HVA may still be the same.
+                         */
+                        if (gpc->uhva != old_uhva)
+                                hva_change = true;
+                } else {
+                        gpc->uhva = old_uhva;
                 }
         }
+
+        /* Note: the offset must be correct before calling hva_to_pfn_retry() */
+        gpc->uhva += page_offset;
 
         /*
          * If the userspace HVA changed or the PFN was already invalid,
          * drop the lock and do the HVA to PFN lookup again.
          */
-        if (!gpc->valid || old_uhva != gpc->uhva) {
+        if (!gpc->valid || hva_change) {
                 ret = hva_to_pfn_retry(gpc);
         } else {
                 /*
···
 out_unlock:
         write_unlock_irq(&gpc->lock);
 
-        mutex_unlock(&gpc->refresh_lock);
-
         if (unmap_old)
-                gpc_unmap_khva(old_pfn, old_khva);
+                gpc_unmap(old_pfn, old_khva);
 
         return ret;
 }
 
 int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len)
 {
-        return __kvm_gpc_refresh(gpc, gpc->gpa, len);
+        unsigned long uhva;
+
+        guard(mutex)(&gpc->refresh_lock);
+
+        /*
+         * If the GPA is valid then ignore the HVA, as a cache can be GPA-based
+         * or HVA-based, not both.  For GPA-based caches, the HVA will be
+         * recomputed during refresh if necessary.
+         */
+        uhva = kvm_is_error_gpa(gpc->gpa) ? gpc->uhva : KVM_HVA_ERR_BAD;
+
+        return __kvm_gpc_refresh(gpc, gpc->gpa, uhva, len);
 }
-EXPORT_SYMBOL_GPL(kvm_gpc_refresh);
 
-void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
-                  struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
 {
-        WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
-        WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu);
-
         rwlock_init(&gpc->lock);
         mutex_init(&gpc->refresh_lock);
 
         gpc->kvm = kvm;
-        gpc->vcpu = vcpu;
-        gpc->usage = usage;
         gpc->pfn = KVM_PFN_ERR_FAULT;
+        gpc->gpa = INVALID_GPA;
         gpc->uhva = KVM_HVA_ERR_BAD;
+        gpc->active = gpc->valid = false;
 }
-EXPORT_SYMBOL_GPL(kvm_gpc_init);
 
-int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
+static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
+                              unsigned long len)
 {
         struct kvm *kvm = gpc->kvm;
+
+        guard(mutex)(&gpc->refresh_lock);
 
         if (!gpc->active) {
                 if (KVM_BUG_ON(gpc->valid, kvm))
···
                 gpc->active = true;
                 write_unlock_irq(&gpc->lock);
         }
-        return __kvm_gpc_refresh(gpc, gpa, len);
+        return __kvm_gpc_refresh(gpc, gpa, uhva, len);
 }
-EXPORT_SYMBOL_GPL(kvm_gpc_activate);
+
+int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
+{
+        return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len);
+}
+
+int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len)
+{
+        return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len);
+}
 
 void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
 {
         struct kvm *kvm = gpc->kvm;
         kvm_pfn_t old_pfn;
         void *old_khva;
+
+        guard(mutex)(&gpc->refresh_lock);
 
         if (gpc->active) {
                 /*
···
                 list_del(&gpc->list);
                 spin_unlock(&kvm->gpc_lock);
 
-                gpc_unmap_khva(old_pfn, old_khva);
+                gpc_unmap(old_pfn, old_khva);
         }
 }
-EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
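Several hunks above replace open-coded `gpc->gpa & ~PAGE_MASK` arithmetic with offset_in_page()/PAGE_ALIGN_DOWN() and derive the page offset from either the GPA or the HVA, rejecting any cached access that would straddle a page boundary. A minimal userspace model of that arithmetic (illustrative only; PAGE_SIZE is hardcoded to 4 KiB and page_align_down()/fits_in_one_page() are invented names):

```c
/*
 * Model of the pfncache bounds logic: offset_in_page() and
 * PAGE_ALIGN_DOWN() as used when validating that a cached access of
 * 'len' bytes fits within a single page.
 */
#include <stdbool.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static unsigned long offset_in_page(unsigned long addr)
{
	return addr & ~PAGE_MASK;	/* byte offset within the page */
}

static unsigned long page_align_down(unsigned long addr)
{
	return addr & PAGE_MASK;	/* start of the containing page */
}

/* Mirrors the single-page check in __kvm_gpc_refresh()/kvm_gpc_check(). */
static bool fits_in_one_page(unsigned long addr, unsigned long len)
{
	return offset_in_page(addr) + len <= PAGE_SIZE;
}
```

This is why the refresh path notes "the offset must be correct before calling hva_to_pfn_retry()": the cache stores a page-aligned HVA plus a page offset, and the two must agree before the kernel mapping (khva) is recomputed from them.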