Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
"ARM fixes:

- Another state update on exit to userspace fix

- Prevent the creation of mixed 32/64 VMs

- Fix regression with irqbypass not restarting the guest on failed
connect

- Fix regression with debug register decoding resulting in
overlapping access

- Commit exception state on exit to userspace

- Fix the MMU notifier return values

- Add missing 'static' qualifiers in the new host stage-2 code

x86 fixes:

- fix guest missed wakeup with assigned devices

- fix WARN reported by syzkaller

- do not use BIT() in UAPI headers

- make the kvm_amd.avic parameter bool

PPC fixes:

- make halt polling heuristics consistent with other architectures

selftests:

- various fixes

- new performance selftest memslot_perf_test

- test UFFD minor faults in demand_paging_test"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (44 commits)
selftests: kvm: fix overlapping addresses in memslot_perf_test
KVM: X86: Kill off ctxt->ud
KVM: X86: Fix warning caused by stale emulation context
KVM: X86: Use kvm_get_linear_rip() in single-step and #DB/#BP interception
KVM: x86/mmu: Fix comment mentioning skip_4k
KVM: VMX: update vcpu posted-interrupt descriptor when assigning device
KVM: rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK
KVM: x86: add start_assignment hook to kvm_x86_ops
KVM: LAPIC: Narrow the timer latency between wait_lapic_expire and world switch
selftests: kvm: do only 1 memslot_perf_test run by default
KVM: X86: Use _BITUL() macro in UAPI headers
KVM: selftests: add shared hugetlbfs backing source type
KVM: selftests: allow using UFFD minor faults for demand paging
KVM: selftests: create alias mappings when using shared memory
KVM: selftests: add shmem backing source type
KVM: selftests: refactor vm_mem_backing_src_type flags
KVM: selftests: allow different backing source types
KVM: selftests: compute correct demand paging size
KVM: selftests: simplify setup_demand_paging error handling
KVM: selftests: Print a message if /dev/kvm is missing
...

+1695 -288
+5 -3
Documentation/virt/kvm/vcpu-requests.rst
··· 118 118 necessary to inform each VCPU to completely refresh the tables. This 119 119 request is used for that. 120 120 121 - KVM_REQ_PENDING_TIMER 121 + KVM_REQ_UNBLOCK 122 122 123 - This request may be made from a timer handler run on the host on behalf 124 - of a VCPU. It informs the VCPU thread to inject a timer interrupt. 123 + This request informs the vCPU to exit kvm_vcpu_block. It is used for 124 + example from timer handlers that run on the host on behalf of a vCPU, 125 + or in order to update the interrupt routing and ensure that assigned 126 + devices will wake up the vCPU. 125 127 126 128 KVM_REQ_UNHALT 127 129
+3
arch/arm64/include/asm/kvm_asm.h
··· 63 63 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 64 64 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 65 65 #define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 66 + #define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 66 67 67 68 #ifndef __ASSEMBLY__ 68 69 ··· 201 200 extern void __kvm_timer_set_cntvoff(u64 cntvoff); 202 201 203 202 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 203 + 204 + extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); 204 205 205 206 extern u64 __vgic_v3_get_gic_config(void); 206 207 extern u64 __vgic_v3_read_vmcr(void);
+5
arch/arm64/include/asm/kvm_emulate.h
··· 463 463 vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; 464 464 } 465 465 466 + static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) 467 + { 468 + return test_bit(feature, vcpu->arch.features); 469 + } 470 + 466 471 #endif /* __ARM64_KVM_EMULATE_H__ */
+17 -3
arch/arm64/kvm/arm.c
··· 720 720 return ret; 721 721 } 722 722 723 - if (run->immediate_exit) 724 - return -EINTR; 725 - 726 723 vcpu_load(vcpu); 724 + 725 + if (run->immediate_exit) { 726 + ret = -EINTR; 727 + goto out; 728 + } 727 729 728 730 kvm_sigset_activate(vcpu); 729 731 ··· 898 896 } 899 897 900 898 kvm_sigset_deactivate(vcpu); 899 + 900 + out: 901 + /* 902 + * In the unlikely event that we are returning to userspace 903 + * with pending exceptions or PC adjustment, commit these 904 + * adjustments in order to give userspace a consistent view of 905 + * the vcpu state. Note that this relies on __kvm_adjust_pc() 906 + * being preempt-safe on VHE. 907 + */ 908 + if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION | 909 + KVM_ARM64_INCREMENT_PC))) 910 + kvm_call_hyp(__kvm_adjust_pc, vcpu); 901 911 902 912 vcpu_put(vcpu); 903 913 return ret;
+17 -1
arch/arm64/kvm/hyp/exception.c
··· 296 296 *vcpu_pc(vcpu) = vect_offset; 297 297 } 298 298 299 - void kvm_inject_exception(struct kvm_vcpu *vcpu) 299 + static void kvm_inject_exception(struct kvm_vcpu *vcpu) 300 300 { 301 301 if (vcpu_el1_is_32bit(vcpu)) { 302 302 switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { ··· 327 327 */ 328 328 break; 329 329 } 330 + } 331 + } 332 + 333 + /* 334 + * Adjust the guest PC (and potentially exception state) depending on 335 + * flags provided by the emulation code. 336 + */ 337 + void __kvm_adjust_pc(struct kvm_vcpu *vcpu) 338 + { 339 + if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { 340 + kvm_inject_exception(vcpu); 341 + vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | 342 + KVM_ARM64_EXCEPT_MASK); 343 + } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { 344 + kvm_skip_instr(vcpu); 345 + vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; 330 346 } 331 347 }
-18
arch/arm64/kvm/hyp/include/hyp/adjust_pc.h
··· 13 13 #include <asm/kvm_emulate.h> 14 14 #include <asm/kvm_host.h> 15 15 16 - void kvm_inject_exception(struct kvm_vcpu *vcpu); 17 - 18 16 static inline void kvm_skip_instr(struct kvm_vcpu *vcpu) 19 17 { 20 18 if (vcpu_mode_is_32bit(vcpu)) { ··· 39 41 40 42 write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR); 41 43 write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); 42 - } 43 - 44 - /* 45 - * Adjust the guest PC on entry, depending on flags provided by EL1 46 - * for the purpose of emulation (MMIO, sysreg) or exception injection. 47 - */ 48 - static inline void __adjust_pc(struct kvm_vcpu *vcpu) 49 - { 50 - if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { 51 - kvm_inject_exception(vcpu); 52 - vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | 53 - KVM_ARM64_EXCEPT_MASK); 54 - } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { 55 - kvm_skip_instr(vcpu); 56 - vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; 57 - } 58 44 } 59 45 60 46 /*
+8
arch/arm64/kvm/hyp/nvhe/hyp-main.c
··· 28 28 cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); 29 29 } 30 30 31 + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) 32 + { 33 + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); 34 + 35 + __kvm_adjust_pc(kern_hyp_va(vcpu)); 36 + } 37 + 31 38 static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) 32 39 { 33 40 __kvm_flush_vm_context(); ··· 177 170 178 171 static const hcall_t host_hcall[] = { 179 172 HANDLE_FUNC(__kvm_vcpu_run), 173 + HANDLE_FUNC(__kvm_adjust_pc), 180 174 HANDLE_FUNC(__kvm_flush_vm_context), 181 175 HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), 182 176 HANDLE_FUNC(__kvm_tlb_flush_vmid),
+2 -2
arch/arm64/kvm/hyp/nvhe/mem_protect.c
··· 23 23 extern unsigned long hyp_nr_cpus; 24 24 struct host_kvm host_kvm; 25 25 26 - struct hyp_pool host_s2_mem; 27 - struct hyp_pool host_s2_dev; 26 + static struct hyp_pool host_s2_mem; 27 + static struct hyp_pool host_s2_dev; 28 28 29 29 /* 30 30 * Copies of the host's CPU features registers holding sanitized values.
+1 -1
arch/arm64/kvm/hyp/nvhe/setup.c
··· 17 17 #include <nvhe/trap_handler.h> 18 18 19 19 struct hyp_pool hpool; 20 - struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; 21 20 unsigned long hyp_nr_cpus; 22 21 23 22 #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ ··· 26 27 static void *hyp_pgt_base; 27 28 static void *host_s2_mem_pgt_base; 28 29 static void *host_s2_dev_pgt_base; 30 + static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; 29 31 30 32 static int divide_memory_pool(void *virt, unsigned long size) 31 33 {
+1 -2
arch/arm64/kvm/hyp/nvhe/switch.c
··· 4 4 * Author: Marc Zyngier <marc.zyngier@arm.com> 5 5 */ 6 6 7 - #include <hyp/adjust_pc.h> 8 7 #include <hyp/switch.h> 9 8 #include <hyp/sysreg-sr.h> 10 9 ··· 200 201 */ 201 202 __debug_save_host_buffers_nvhe(vcpu); 202 203 203 - __adjust_pc(vcpu); 204 + __kvm_adjust_pc(vcpu); 204 205 205 206 /* 206 207 * We must restore the 32-bit state before the sysregs, thanks
+1 -2
arch/arm64/kvm/hyp/vhe/switch.c
··· 4 4 * Author: Marc Zyngier <marc.zyngier@arm.com> 5 5 */ 6 6 7 - #include <hyp/adjust_pc.h> 8 7 #include <hyp/switch.h> 9 8 10 9 #include <linux/arm-smccc.h> ··· 131 132 __load_guest_stage2(vcpu->arch.hw_mmu); 132 133 __activate_traps(vcpu); 133 134 134 - __adjust_pc(vcpu); 135 + __kvm_adjust_pc(vcpu); 135 136 136 137 sysreg_restore_guest_state_vhe(guest_ctxt); 137 138 __debug_switch_to_guest(vcpu);
+6 -6
arch/arm64/kvm/mmu.c
··· 1156 1156 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1157 1157 { 1158 1158 if (!kvm->arch.mmu.pgt) 1159 - return 0; 1159 + return false; 1160 1160 1161 1161 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, 1162 1162 (range->end - range->start) << PAGE_SHIFT, 1163 1163 range->may_block); 1164 1164 1165 - return 0; 1165 + return false; 1166 1166 } 1167 1167 1168 1168 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) ··· 1170 1170 kvm_pfn_t pfn = pte_pfn(range->pte); 1171 1171 1172 1172 if (!kvm->arch.mmu.pgt) 1173 - return 0; 1173 + return false; 1174 1174 1175 1175 WARN_ON(range->end - range->start != 1); 1176 1176 ··· 1190 1190 PAGE_SIZE, __pfn_to_phys(pfn), 1191 1191 KVM_PGTABLE_PROT_R, NULL); 1192 1192 1193 - return 0; 1193 + return false; 1194 1194 } 1195 1195 1196 1196 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) ··· 1200 1200 pte_t pte; 1201 1201 1202 1202 if (!kvm->arch.mmu.pgt) 1203 - return 0; 1203 + return false; 1204 1204 1205 1205 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); 1206 1206 ··· 1213 1213 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1214 1214 { 1215 1215 if (!kvm->arch.mmu.pgt) 1216 - return 0; 1216 + return false; 1217 1217 1218 1218 return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, 1219 1219 range->start << PAGE_SHIFT);
+24 -4
arch/arm64/kvm/reset.c
··· 166 166 return 0; 167 167 } 168 168 169 + static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) 170 + { 171 + struct kvm_vcpu *tmp; 172 + bool is32bit; 173 + int i; 174 + 175 + is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); 176 + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) 177 + return false; 178 + 179 + /* Check that the vcpus are either all 32bit or all 64bit */ 180 + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { 181 + if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit) 182 + return false; 183 + } 184 + 185 + return true; 186 + } 187 + 169 188 /** 170 189 * kvm_reset_vcpu - sets core registers and sys_regs to reset value 171 190 * @vcpu: The VCPU pointer ··· 236 217 } 237 218 } 238 219 220 + if (!vcpu_allowed_register_width(vcpu)) { 221 + ret = -EINVAL; 222 + goto out; 223 + } 224 + 239 225 switch (vcpu->arch.target) { 240 226 default: 241 227 if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { 242 - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) { 243 - ret = -EINVAL; 244 - goto out; 245 - } 246 228 pstate = VCPU_RESET_PSTATE_SVC; 247 229 } else { 248 230 pstate = VCPU_RESET_PSTATE_EL1;
+21 -21
arch/arm64/kvm/sys_regs.c
··· 399 399 struct sys_reg_params *p, 400 400 const struct sys_reg_desc *rd) 401 401 { 402 - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; 402 + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; 403 403 404 404 if (p->is_write) 405 405 reg_to_dbg(vcpu, p, rd, dbg_reg); 406 406 else 407 407 dbg_to_reg(vcpu, p, rd, dbg_reg); 408 408 409 - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); 409 + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); 410 410 411 411 return true; 412 412 } ··· 414 414 static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 415 415 const struct kvm_one_reg *reg, void __user *uaddr) 416 416 { 417 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; 417 + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; 418 418 419 419 if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) 420 420 return -EFAULT; ··· 424 424 static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 425 425 const struct kvm_one_reg *reg, void __user *uaddr) 426 426 { 427 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; 427 + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; 428 428 429 429 if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) 430 430 return -EFAULT; ··· 434 434 static void reset_bvr(struct kvm_vcpu *vcpu, 435 435 const struct sys_reg_desc *rd) 436 436 { 437 - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val; 437 + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; 438 438 } 439 439 440 440 static bool trap_bcr(struct kvm_vcpu *vcpu, 441 441 struct sys_reg_params *p, 442 442 const struct sys_reg_desc *rd) 443 443 { 444 - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; 444 + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; 445 445 446 446 if (p->is_write) 447 447 reg_to_dbg(vcpu, p, rd, dbg_reg); 448 448 else 449 449 dbg_to_reg(vcpu, p, rd, dbg_reg); 450 450 451 - trace_trap_reg(__func__, rd->reg, p->is_write, 
*dbg_reg); 451 + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); 452 452 453 453 return true; 454 454 } ··· 456 456 static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 457 457 const struct kvm_one_reg *reg, void __user *uaddr) 458 458 { 459 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; 459 + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; 460 460 461 461 if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) 462 462 return -EFAULT; ··· 467 467 static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 468 468 const struct kvm_one_reg *reg, void __user *uaddr) 469 469 { 470 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; 470 + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; 471 471 472 472 if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) 473 473 return -EFAULT; ··· 477 477 static void reset_bcr(struct kvm_vcpu *vcpu, 478 478 const struct sys_reg_desc *rd) 479 479 { 480 - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val; 480 + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; 481 481 } 482 482 483 483 static bool trap_wvr(struct kvm_vcpu *vcpu, 484 484 struct sys_reg_params *p, 485 485 const struct sys_reg_desc *rd) 486 486 { 487 - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; 487 + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; 488 488 489 489 if (p->is_write) 490 490 reg_to_dbg(vcpu, p, rd, dbg_reg); 491 491 else 492 492 dbg_to_reg(vcpu, p, rd, dbg_reg); 493 493 494 - trace_trap_reg(__func__, rd->reg, p->is_write, 495 - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]); 494 + trace_trap_reg(__func__, rd->CRm, p->is_write, 495 + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); 496 496 497 497 return true; 498 498 } ··· 500 500 static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 501 501 const struct kvm_one_reg *reg, void __user *uaddr) 502 502 { 503 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; 
503 + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; 504 504 505 505 if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) 506 506 return -EFAULT; ··· 510 510 static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 511 511 const struct kvm_one_reg *reg, void __user *uaddr) 512 512 { 513 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; 513 + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; 514 514 515 515 if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) 516 516 return -EFAULT; ··· 520 520 static void reset_wvr(struct kvm_vcpu *vcpu, 521 521 const struct sys_reg_desc *rd) 522 522 { 523 - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val; 523 + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; 524 524 } 525 525 526 526 static bool trap_wcr(struct kvm_vcpu *vcpu, 527 527 struct sys_reg_params *p, 528 528 const struct sys_reg_desc *rd) 529 529 { 530 - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; 530 + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; 531 531 532 532 if (p->is_write) 533 533 reg_to_dbg(vcpu, p, rd, dbg_reg); 534 534 else 535 535 dbg_to_reg(vcpu, p, rd, dbg_reg); 536 536 537 - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); 537 + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); 538 538 539 539 return true; 540 540 } ··· 542 542 static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 543 543 const struct kvm_one_reg *reg, void __user *uaddr) 544 544 { 545 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; 545 + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; 546 546 547 547 if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) 548 548 return -EFAULT; ··· 552 552 static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 553 553 const struct kvm_one_reg *reg, void __user *uaddr) 554 554 { 555 - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; 555 + __u64 *r = 
&vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; 556 556 557 557 if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) 558 558 return -EFAULT; ··· 562 562 static void reset_wcr(struct kvm_vcpu *vcpu, 563 563 const struct sys_reg_desc *rd) 564 564 { 565 - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val; 565 + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; 566 566 } 567 567 568 568 static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+1
arch/powerpc/include/asm/kvm_host.h
··· 51 51 /* PPC-specific vcpu->requests bit members */ 52 52 #define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0) 53 53 #define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1) 54 + #define KVM_REQ_PENDING_TIMER KVM_ARCH_REQ(2) 54 55 55 56 #include <linux/mmu_notifier.h> 56 57
+1 -1
arch/powerpc/kvm/book3s_hv.c
··· 3936 3936 break; 3937 3937 } 3938 3938 cur = ktime_get(); 3939 - } while (single_task_running() && ktime_before(cur, stop)); 3939 + } while (kvm_vcpu_can_poll(cur, stop)); 3940 3940 3941 3941 spin_lock(&vc->lock); 3942 3942 vc->vcore_state = VCORE_INACTIVE;
+1
arch/x86/include/asm/kvm-x86-ops.h
··· 99 99 KVM_X86_OP_NULL(vcpu_blocking) 100 100 KVM_X86_OP_NULL(vcpu_unblocking) 101 101 KVM_X86_OP_NULL(update_pi_irte) 102 + KVM_X86_OP_NULL(start_assignment) 102 103 KVM_X86_OP_NULL(apicv_post_state_restore) 103 104 KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) 104 105 KVM_X86_OP_NULL(set_hv_timer)
+1
arch/x86/include/asm/kvm_host.h
··· 1352 1352 1353 1353 int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, 1354 1354 uint32_t guest_irq, bool set); 1355 + void (*start_assignment)(struct kvm *kvm); 1355 1356 void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); 1356 1357 bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); 1357 1358
+3 -2
arch/x86/kvm/emulate.c
··· 5111 5111 return rc; 5112 5112 } 5113 5113 5114 - int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) 5114 + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) 5115 5115 { 5116 5116 int rc = X86EMUL_CONTINUE; 5117 5117 int mode = ctxt->mode; ··· 5322 5322 5323 5323 ctxt->execute = opcode.u.execute; 5324 5324 5325 - if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD))) 5325 + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && 5326 + likely(!(ctxt->d & EmulateOnUD))) 5326 5327 return EMULATION_FAILED; 5327 5328 5328 5329 if (unlikely(ctxt->d &
+8
arch/x86/kvm/hyperv.c
··· 1172 1172 { 1173 1173 struct kvm_hv *hv = to_kvm_hv(kvm); 1174 1174 u64 gfn; 1175 + int idx; 1175 1176 1176 1177 if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || 1177 1178 hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || ··· 1191 1190 gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; 1192 1191 1193 1192 hv->tsc_ref.tsc_sequence = 0; 1193 + 1194 + /* 1195 + * Take the srcu lock as memslots will be accessed to check the gfn 1196 + * cache generation against the memslots generation. 1197 + */ 1198 + idx = srcu_read_lock(&kvm->srcu); 1194 1199 if (kvm_write_guest(kvm, gfn_to_gpa(gfn), 1195 1200 &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) 1196 1201 hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; 1202 + srcu_read_unlock(&kvm->srcu, idx); 1197 1203 1198 1204 out_unlock: 1199 1205 mutex_unlock(&hv->hv_lock);
+1 -2
arch/x86/kvm/kvm_emulate.h
··· 314 314 int interruptibility; 315 315 316 316 bool perm_ok; /* do not check permissions if true */ 317 - bool ud; /* inject an #UD if host doesn't support insn */ 318 317 bool tf; /* TF value before instruction (after for syscall/sysret) */ 319 318 320 319 bool have_exception; ··· 490 491 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 491 492 #endif 492 493 493 - int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); 494 + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); 494 495 bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); 495 496 #define EMULATION_FAILED -1 496 497 #define EMULATION_OK 0
+12 -4
arch/x86/kvm/lapic.c
··· 1598 1598 guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); 1599 1599 apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; 1600 1600 1601 + if (lapic_timer_advance_dynamic) { 1602 + adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); 1603 + /* 1604 + * If the timer fired early, reread the TSC to account for the 1605 + * overhead of the above adjustment to avoid waiting longer 1606 + * than is necessary. 1607 + */ 1608 + if (guest_tsc < tsc_deadline) 1609 + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); 1610 + } 1611 + 1601 1612 if (guest_tsc < tsc_deadline) 1602 1613 __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); 1603 - 1604 - if (lapic_timer_advance_dynamic) 1605 - adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); 1606 1614 } 1607 1615 1608 1616 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) ··· 1669 1661 } 1670 1662 1671 1663 atomic_inc(&apic->lapic_timer.pending); 1672 - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); 1664 + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); 1673 1665 if (from_timer_fn) 1674 1666 kvm_vcpu_kick(vcpu); 1675 1667 }
+3 -3
arch/x86/kvm/mmu/tdp_mmu.c
··· 1192 1192 } 1193 1193 1194 1194 /* 1195 - * Remove write access from all the SPTEs mapping GFNs [start, end). If 1196 - * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. 1197 - * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1195 + * Remove write access from all SPTEs at or above min_level that map GFNs 1196 + * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1197 + * be flushed. 1198 1198 */ 1199 1199 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1200 1200 gfn_t start, gfn_t end, int min_level)
+2 -4
arch/x86/kvm/svm/avic.c
··· 28 28 #include "svm.h" 29 29 30 30 /* enable / disable AVIC */ 31 - int avic; 32 - #ifdef CONFIG_X86_LOCAL_APIC 33 - module_param(avic, int, S_IRUGO); 34 - #endif 31 + bool avic; 32 + module_param(avic, bool, S_IRUGO); 35 33 36 34 #define SVM_AVIC_DOORBELL 0xc001011b 37 35
+1 -3
arch/x86/kvm/svm/svm.c
··· 1010 1010 } 1011 1011 1012 1012 if (avic) { 1013 - if (!npt_enabled || 1014 - !boot_cpu_has(X86_FEATURE_AVIC) || 1015 - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { 1013 + if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) { 1016 1014 avic = false; 1017 1015 } else { 1018 1016 pr_info("AVIC enabled\n");
+1 -1
arch/x86/kvm/svm/svm.h
··· 480 480 481 481 #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL 482 482 483 - extern int avic; 483 + extern bool avic; 484 484 485 485 static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) 486 486 {
+1 -2
arch/x86/kvm/vmx/capabilities.h
··· 90 90 91 91 static inline bool cpu_has_vmx_posted_intr(void) 92 92 { 93 - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && 94 - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; 93 + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; 95 94 } 96 95 97 96 static inline bool cpu_has_load_ia32_efer(void)
+14
arch/x86/kvm/vmx/posted_intr.c
··· 238 238 239 239 240 240 /* 241 + * Bail out of the block loop if the VM has an assigned 242 + * device, but the blocking vCPU didn't reconfigure the 243 + * PI.NV to the wakeup vector, i.e. the assigned device 244 + * came along after the initial check in pi_pre_block(). 245 + */ 246 + void vmx_pi_start_assignment(struct kvm *kvm) 247 + { 248 + if (!irq_remapping_cap(IRQ_POSTING_CAP)) 249 + return; 250 + 251 + kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); 252 + } 253 + 254 + /* 241 255 * pi_update_irte - set IRTE for Posted-Interrupts 242 256 * 243 257 * @kvm: kvm
+1
arch/x86/kvm/vmx/posted_intr.h
··· 95 95 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); 96 96 int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, 97 97 bool set); 98 + void vmx_pi_start_assignment(struct kvm *kvm); 98 99 99 100 #endif /* __KVM_X86_VMX_POSTED_INTR_H */
+3 -3
arch/x86/kvm/vmx/vmx.c
··· 4843 4843 struct vcpu_vmx *vmx = to_vmx(vcpu); 4844 4844 struct kvm_run *kvm_run = vcpu->run; 4845 4845 u32 intr_info, ex_no, error_code; 4846 - unsigned long cr2, rip, dr6; 4846 + unsigned long cr2, dr6; 4847 4847 u32 vect_info; 4848 4848 4849 4849 vect_info = vmx->idt_vectoring_info; ··· 4933 4933 vmx->vcpu.arch.event_exit_inst_len = 4934 4934 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4935 4935 kvm_run->exit_reason = KVM_EXIT_DEBUG; 4936 - rip = kvm_rip_read(vcpu); 4937 - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 4936 + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 4938 4937 kvm_run->debug.arch.exception = ex_no; 4939 4938 break; 4940 4939 case AC_VECTOR: ··· 7720 7721 .nested_ops = &vmx_nested_ops, 7721 7722 7722 7723 .update_pi_irte = pi_update_irte, 7724 + .start_assignment = vmx_pi_start_assignment, 7723 7725 7724 7726 #ifdef CONFIG_X86_64 7725 7727 .set_hv_timer = vmx_set_hv_timer,
+15 -12
arch/x86/kvm/x86.c
··· 3105 3105 st->preempted & KVM_VCPU_FLUSH_TLB); 3106 3106 if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) 3107 3107 kvm_vcpu_flush_tlb_guest(vcpu); 3108 + } else { 3109 + st->preempted = 0; 3108 3110 } 3109 3111 3110 3112 vcpu->arch.st.preempted = 0; ··· 7228 7226 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); 7229 7227 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); 7230 7228 7229 + ctxt->interruptibility = 0; 7230 + ctxt->have_exception = false; 7231 + ctxt->exception.vector = -1; 7232 + ctxt->perm_ok = false; 7233 + 7231 7234 init_decode_cache(ctxt); 7232 7235 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 7233 7236 } ··· 7568 7561 kvm_vcpu_check_breakpoint(vcpu, &r)) 7569 7562 return r; 7570 7563 7571 - ctxt->interruptibility = 0; 7572 - ctxt->have_exception = false; 7573 - ctxt->exception.vector = -1; 7574 - ctxt->perm_ok = false; 7575 - 7576 - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; 7577 - 7578 - r = x86_decode_insn(ctxt, insn, insn_len); 7564 + r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); 7579 7565 7580 7566 trace_kvm_emulate_insn_start(vcpu); 7581 7567 ++vcpu->stat.insn_emulation; ··· 8359 8359 struct kvm_apic_map *map; 8360 8360 8361 8361 vcpu->stat.directed_yield_attempted++; 8362 + 8363 + if (single_task_running()) 8364 + goto no_yield; 8362 8365 8363 8366 rcu_read_lock(); 8364 8367 map = rcu_dereference(vcpu->kvm->arch.apic_map); ··· 9499 9496 if (r <= 0) 9500 9497 break; 9501 9498 9502 - kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); 9499 + kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); 9503 9500 if (kvm_cpu_has_pending_timer(vcpu)) 9504 9501 kvm_inject_pending_timer_irqs(vcpu); 9505 9502 ··· 10118 10115 kvm_update_dr7(vcpu); 10119 10116 10120 10117 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 10121 - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 10122 - get_segment_base(vcpu, VCPU_SREG_CS); 10118 + vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu); 10123 10119 10124 10120 /* 10125 10121 * 
Trigger an rflags update that will inject or remove the trace ··· 11501 11499 11502 11500 void kvm_arch_start_assignment(struct kvm *kvm) 11503 11501 { 11504 - atomic_inc(&kvm->arch.assigned_device_count); 11502 + if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) 11503 + static_call_cond(kvm_x86_start_assignment)(kvm); 11505 11504 } 11506 11505 EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); 11507 11506
+7 -1
include/linux/kvm_host.h
··· 10 10 #include <linux/spinlock.h> 11 11 #include <linux/signal.h> 12 12 #include <linux/sched.h> 13 + #include <linux/sched/stat.h> 13 14 #include <linux/bug.h> 14 15 #include <linux/minmax.h> 15 16 #include <linux/mm.h> ··· 147 146 */ 148 147 #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 149 148 #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 150 - #define KVM_REQ_PENDING_TIMER 2 149 + #define KVM_REQ_UNBLOCK 2 151 150 #define KVM_REQ_UNHALT 3 152 151 #define KVM_REQUEST_ARCH_BASE 8 153 152 ··· 264 263 static inline bool kvm_vcpu_mapped(struct kvm_host_map *map) 265 264 { 266 265 return !!map->hva; 266 + } 267 + 268 + static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) 269 + { 270 + return single_task_running() && !need_resched() && ktime_before(cur, stop); 267 271 } 268 272 269 273 /*
+3 -2
include/uapi/linux/kvm.h
··· 8 8 * Note: you must update KVM_API_VERSION if you change this interface. 9 9 */ 10 10 11 + #include <linux/const.h> 11 12 #include <linux/types.h> 12 13 #include <linux/compiler.h> 13 14 #include <linux/ioctl.h> ··· 1880 1879 * conversion after harvesting an entry. Also, it must not skip any 1881 1880 * dirty bits, so that dirty bits are always harvested in sequence. 1882 1881 */ 1883 - #define KVM_DIRTY_GFN_F_DIRTY BIT(0) 1884 - #define KVM_DIRTY_GFN_F_RESET BIT(1) 1882 + #define KVM_DIRTY_GFN_F_DIRTY _BITUL(0) 1883 + #define KVM_DIRTY_GFN_F_RESET _BITUL(1) 1885 1884 #define KVM_DIRTY_GFN_F_MASK 0x3 1886 1885 1887 1886 /*
+3 -2
tools/include/uapi/linux/kvm.h
··· 8 8 * Note: you must update KVM_API_VERSION if you change this interface. 9 9 */ 10 10 11 + #include <linux/const.h> 11 12 #include <linux/types.h> 12 13 #include <linux/compiler.h> 13 14 #include <linux/ioctl.h> ··· 1880 1879 * conversion after harvesting an entry. Also, it must not skip any 1881 1880 * dirty bits, so that dirty bits are always harvested in sequence. 1882 1881 */ 1883 - #define KVM_DIRTY_GFN_F_DIRTY BIT(0) 1884 - #define KVM_DIRTY_GFN_F_RESET BIT(1) 1882 + #define KVM_DIRTY_GFN_F_DIRTY _BITUL(0) 1883 + #define KVM_DIRTY_GFN_F_RESET _BITUL(1) 1885 1884 #define KVM_DIRTY_GFN_F_MASK 0x3 1886 1885 1887 1886 /*
+1
tools/testing/selftests/kvm/.gitignore
··· 41 41 /kvm_create_max_vcpus 42 42 /kvm_page_table_test 43 43 /memslot_modification_stress_test 44 + /memslot_perf_test 44 45 /set_memory_region_test 45 46 /steal_time
+2 -1
tools/testing/selftests/kvm/Makefile
··· 33 33 UNAME_M := s390x 34 34 endif 35 35 36 - LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c 36 + LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c 37 37 LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S 38 38 LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c 39 39 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c ··· 74 74 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus 75 75 TEST_GEN_PROGS_x86_64 += kvm_page_table_test 76 76 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test 77 + TEST_GEN_PROGS_x86_64 += memslot_perf_test 77 78 TEST_GEN_PROGS_x86_64 += set_memory_region_test 78 79 TEST_GEN_PROGS_x86_64 += steal_time 79 80
+110 -66
tools/testing/selftests/kvm/demand_paging_test.c
··· 9 9 10 10 #define _GNU_SOURCE /* for pipe2 */ 11 11 12 + #include <inttypes.h> 12 13 #include <stdio.h> 13 14 #include <stdlib.h> 14 15 #include <time.h> ··· 39 38 40 39 static int nr_vcpus = 1; 41 40 static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; 41 + static size_t demand_paging_size; 42 42 static char *guest_data_prototype; 43 43 44 44 static void *vcpu_worker(void *data) ··· 73 71 return NULL; 74 72 } 75 73 76 - static int handle_uffd_page_request(int uffd, uint64_t addr) 74 + static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr) 77 75 { 78 - pid_t tid; 76 + pid_t tid = syscall(__NR_gettid); 79 77 struct timespec start; 80 78 struct timespec ts_diff; 81 - struct uffdio_copy copy; 82 79 int r; 83 - 84 - tid = syscall(__NR_gettid); 85 - 86 - copy.src = (uint64_t)guest_data_prototype; 87 - copy.dst = addr; 88 - copy.len = perf_test_args.host_page_size; 89 - copy.mode = 0; 90 80 91 81 clock_gettime(CLOCK_MONOTONIC, &start); 92 82 93 - r = ioctl(uffd, UFFDIO_COPY, &copy); 94 - if (r == -1) { 95 - pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n", 96 - addr, tid, errno); 97 - return r; 83 + if (uffd_mode == UFFDIO_REGISTER_MODE_MISSING) { 84 + struct uffdio_copy copy; 85 + 86 + copy.src = (uint64_t)guest_data_prototype; 87 + copy.dst = addr; 88 + copy.len = demand_paging_size; 89 + copy.mode = 0; 90 + 91 + r = ioctl(uffd, UFFDIO_COPY, &copy); 92 + if (r == -1) { 93 + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", 94 + addr, tid, errno); 95 + return r; 96 + } 97 + } else if (uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { 98 + struct uffdio_continue cont = {0}; 99 + 100 + cont.range.start = addr; 101 + cont.range.len = demand_paging_size; 102 + 103 + r = ioctl(uffd, UFFDIO_CONTINUE, &cont); 104 + if (r == -1) { 105 + pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", 106 + addr, tid, errno); 107 + return r; 108 + } 109 + } else { 110 + TEST_FAIL("Invalid uffd mode 
%d", uffd_mode); 98 111 } 99 112 100 113 ts_diff = timespec_elapsed(start); 101 114 102 - PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, 115 + PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid, 103 116 timespec_to_ns(ts_diff)); 104 117 PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", 105 - perf_test_args.host_page_size, addr, tid); 118 + demand_paging_size, addr, tid); 106 119 107 120 return 0; 108 121 } ··· 125 108 bool quit_uffd_thread; 126 109 127 110 struct uffd_handler_args { 111 + int uffd_mode; 128 112 int uffd; 129 113 int pipefd; 130 114 useconds_t delay; ··· 187 169 if (r == -1) { 188 170 if (errno == EAGAIN) 189 171 continue; 190 - pr_info("Read of uffd gor errno %d", errno); 172 + pr_info("Read of uffd got errno %d\n", errno); 191 173 return NULL; 192 174 } 193 175 ··· 202 184 if (delay) 203 185 usleep(delay); 204 186 addr = msg.arg.pagefault.address; 205 - r = handle_uffd_page_request(uffd, addr); 187 + r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr); 206 188 if (r < 0) 207 189 return NULL; 208 190 pages++; ··· 216 198 return NULL; 217 199 } 218 200 219 - static int setup_demand_paging(struct kvm_vm *vm, 220 - pthread_t *uffd_handler_thread, int pipefd, 221 - useconds_t uffd_delay, 222 - struct uffd_handler_args *uffd_args, 223 - void *hva, uint64_t len) 201 + static void setup_demand_paging(struct kvm_vm *vm, 202 + pthread_t *uffd_handler_thread, int pipefd, 203 + int uffd_mode, useconds_t uffd_delay, 204 + struct uffd_handler_args *uffd_args, 205 + void *hva, void *alias, uint64_t len) 224 206 { 207 + bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); 225 208 int uffd; 226 209 struct uffdio_api uffdio_api; 227 210 struct uffdio_register uffdio_register; 211 + uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; 212 + 213 + PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", 214 + is_minor ? "MINOR" : "MISSING", 215 + is_minor ? 
"UFFDIO_CONINUE" : "UFFDIO_COPY"); 216 + 217 + /* In order to get minor faults, prefault via the alias. */ 218 + if (is_minor) { 219 + size_t p; 220 + 221 + expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; 222 + 223 + TEST_ASSERT(alias != NULL, "Alias required for minor faults"); 224 + for (p = 0; p < (len / demand_paging_size); ++p) { 225 + memcpy(alias + (p * demand_paging_size), 226 + guest_data_prototype, demand_paging_size); 227 + } 228 + } 228 229 229 230 uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 230 - if (uffd == -1) { 231 - pr_info("uffd creation failed\n"); 232 - return -1; 233 - } 231 + TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); 234 232 235 233 uffdio_api.api = UFFD_API; 236 234 uffdio_api.features = 0; 237 - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { 238 - pr_info("ioctl uffdio_api failed\n"); 239 - return -1; 240 - } 235 + TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, 236 + "ioctl UFFDIO_API failed: %" PRIu64, 237 + (uint64_t)uffdio_api.api); 241 238 242 239 uffdio_register.range.start = (uint64_t)hva; 243 240 uffdio_register.range.len = len; 244 - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 245 - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { 246 - pr_info("ioctl uffdio_register failed\n"); 247 - return -1; 248 - } 241 + uffdio_register.mode = uffd_mode; 242 + TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, 243 + "ioctl UFFDIO_REGISTER failed"); 244 + TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == 245 + expected_ioctls, "missing userfaultfd ioctls"); 249 246 250 - if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != 251 - UFFD_API_RANGE_IOCTLS) { 252 - pr_info("unexpected userfaultfd ioctl set\n"); 253 - return -1; 254 - } 255 - 247 + uffd_args->uffd_mode = uffd_mode; 256 248 uffd_args->uffd = uffd; 257 249 uffd_args->pipefd = pipefd; 258 250 uffd_args->delay = uffd_delay; ··· 271 243 272 244 PER_VCPU_DEBUG("Created uffd thread for HVA 
range [%p, %p)\n", 273 245 hva, hva + len); 274 - 275 - return 0; 276 246 } 277 247 278 248 struct test_params { 279 - bool use_uffd; 249 + int uffd_mode; 280 250 useconds_t uffd_delay; 251 + enum vm_mem_backing_src_type src_type; 281 252 bool partition_vcpu_memory_access; 282 253 }; 283 254 ··· 294 267 int r; 295 268 296 269 vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 297 - VM_MEM_SRC_ANONYMOUS); 270 + p->src_type); 298 271 299 272 perf_test_args.wr_fract = 1; 300 273 301 - guest_data_prototype = malloc(perf_test_args.host_page_size); 274 + demand_paging_size = get_backing_src_pagesz(p->src_type); 275 + 276 + guest_data_prototype = malloc(demand_paging_size); 302 277 TEST_ASSERT(guest_data_prototype, 303 278 "Failed to allocate buffer for guest data pattern"); 304 - memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); 279 + memset(guest_data_prototype, 0xAB, demand_paging_size); 305 280 306 281 vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); 307 282 TEST_ASSERT(vcpu_threads, "Memory allocation failed"); ··· 311 282 perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, 312 283 p->partition_vcpu_memory_access); 313 284 314 - if (p->use_uffd) { 285 + if (p->uffd_mode) { 315 286 uffd_handler_threads = 316 287 malloc(nr_vcpus * sizeof(*uffd_handler_threads)); 317 288 TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); ··· 325 296 for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { 326 297 vm_paddr_t vcpu_gpa; 327 298 void *vcpu_hva; 299 + void *vcpu_alias; 328 300 uint64_t vcpu_mem_size; 329 301 330 302 ··· 340 310 PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", 341 311 vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size); 342 312 343 - /* Cache the HVA pointer of the region */ 313 + /* Cache the host addresses of the region */ 344 314 vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); 315 + vcpu_alias = addr_gpa2alias(vm, vcpu_gpa); 345 316 346 317 /* 347 318 * Set up user fault fd to handle demand paging ··· 352 
321 O_CLOEXEC | O_NONBLOCK); 353 322 TEST_ASSERT(!r, "Failed to set up pipefd"); 354 323 355 - r = setup_demand_paging(vm, 356 - &uffd_handler_threads[vcpu_id], 357 - pipefds[vcpu_id * 2], 358 - p->uffd_delay, &uffd_args[vcpu_id], 359 - vcpu_hva, vcpu_mem_size); 360 - if (r < 0) 361 - exit(-r); 324 + setup_demand_paging(vm, &uffd_handler_threads[vcpu_id], 325 + pipefds[vcpu_id * 2], p->uffd_mode, 326 + p->uffd_delay, &uffd_args[vcpu_id], 327 + vcpu_hva, vcpu_alias, 328 + vcpu_mem_size); 362 329 } 363 330 } 364 331 ··· 384 355 385 356 pr_info("All vCPU threads joined\n"); 386 357 387 - if (p->use_uffd) { 358 + if (p->uffd_mode) { 388 359 char c; 389 360 390 361 /* Tell the user fault fd handler threads to quit */ ··· 406 377 407 378 free(guest_data_prototype); 408 379 free(vcpu_threads); 409 - if (p->use_uffd) { 380 + if (p->uffd_mode) { 410 381 free(uffd_handler_threads); 411 382 free(uffd_args); 412 383 free(pipefds); ··· 416 387 static void help(char *name) 417 388 { 418 389 puts(""); 419 - printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" 420 - " [-b memory] [-v vcpus] [-o]\n", name); 390 + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" 391 + " [-b memory] [-t type] [-v vcpus] [-o]\n", name); 421 392 guest_modes_help(); 422 - printf(" -u: use User Fault FD to handle vCPU page\n" 423 - " faults.\n"); 393 + printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" 394 + " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); 424 395 printf(" -d: add a delay in usec to the User Fault\n" 425 396 " FD handler to simulate demand paging\n" 426 397 " overheads. Ignored without -u.\n"); 427 398 printf(" -b: specify the size of the memory region which should be\n" 428 399 " demand paged by each vCPU. e.g. 10M or 3G.\n" 429 400 " Default: 1G\n"); 401 + printf(" -t: The type of backing memory to use. 
Default: anonymous\n"); 402 + backing_src_help(); 430 403 printf(" -v: specify the number of vCPUs to run.\n"); 431 404 printf(" -o: Overlap guest memory accesses instead of partitioning\n" 432 405 " them into a separate region of memory for each vCPU.\n"); ··· 440 409 { 441 410 int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); 442 411 struct test_params p = { 412 + .src_type = VM_MEM_SRC_ANONYMOUS, 443 413 .partition_vcpu_memory_access = true, 444 414 }; 445 415 int opt; 446 416 447 417 guest_modes_append_default(); 448 418 449 - while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) { 419 + while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { 450 420 switch (opt) { 451 421 case 'm': 452 422 guest_modes_cmdline(optarg); 453 423 break; 454 424 case 'u': 455 - p.use_uffd = true; 425 + if (!strcmp("MISSING", optarg)) 426 + p.uffd_mode = UFFDIO_REGISTER_MODE_MISSING; 427 + else if (!strcmp("MINOR", optarg)) 428 + p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; 429 + TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); 456 430 break; 457 431 case 'd': 458 432 p.uffd_delay = strtoul(optarg, NULL, 0); ··· 465 429 break; 466 430 case 'b': 467 431 guest_percpu_mem_size = parse_size(optarg); 432 + break; 433 + case 't': 434 + p.src_type = parse_backing_src_type(optarg); 468 435 break; 469 436 case 'v': 470 437 nr_vcpus = atoi(optarg); ··· 482 443 help(argv[0]); 483 444 break; 484 445 } 446 + } 447 + 448 + if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && 449 + !backing_src_is_shared(p.src_type)) { 450 + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t"); 485 451 } 486 452 487 453 for_each_guest_mode(run_test, &p);
+31 -1
tools/testing/selftests/kvm/hardware_disable_test.c
··· 132 132 TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); 133 133 } 134 134 135 + void wait_for_child_setup(pid_t pid) 136 + { 137 + /* 138 + * Wait for the child to post to the semaphore, but wake up periodically 139 + * to check if the child exited prematurely. 140 + */ 141 + for (;;) { 142 + const struct timespec wait_period = { .tv_sec = 1 }; 143 + int status; 144 + 145 + if (!sem_timedwait(sem, &wait_period)) 146 + return; 147 + 148 + /* Child is still running, keep waiting. */ 149 + if (pid != waitpid(pid, &status, WNOHANG)) 150 + continue; 151 + 152 + /* 153 + * Child is no longer running, which is not expected. 154 + * 155 + * If it exited with a non-zero status, we explicitly forward 156 + * the child's status in case it exited with KSFT_SKIP. 157 + */ 158 + if (WIFEXITED(status)) 159 + exit(WEXITSTATUS(status)); 160 + else 161 + TEST_ASSERT(false, "Child exited unexpectedly"); 162 + } 163 + } 164 + 135 165 int main(int argc, char **argv) 136 166 { 137 167 uint32_t i; ··· 178 148 run_test(i); /* This function always exits */ 179 149 180 150 pr_debug("%s: [%d] waiting semaphore\n", __func__, i); 181 - sem_wait(sem); 151 + wait_for_child_setup(pid); 182 152 r = (rand() % DELAY_US_MAX) + 1; 183 153 pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); 184 154 usleep(r);
+3 -1
tools/testing/selftests/kvm/include/kvm_util.h
··· 77 77 }; 78 78 extern const struct vm_guest_mode_params vm_guest_mode_params[]; 79 79 80 + int open_kvm_dev_path_or_exit(void); 80 81 int kvm_check_cap(long cap); 81 82 int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); 82 83 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, ··· 147 146 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); 148 147 void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); 149 148 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); 149 + void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); 150 150 151 151 /* 152 152 * Address Guest Virtual to Guest Physical ··· 304 302 305 303 unsigned int vm_get_page_size(struct kvm_vm *vm); 306 304 unsigned int vm_get_page_shift(struct kvm_vm *vm); 307 - unsigned int vm_get_max_gfn(struct kvm_vm *vm); 305 + uint64_t vm_get_max_gfn(struct kvm_vm *vm); 308 306 int vm_get_fd(struct kvm_vm *vm); 309 307 310 308 unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
+12
tools/testing/selftests/kvm/include/test_util.h
··· 17 17 #include <errno.h> 18 18 #include <unistd.h> 19 19 #include <fcntl.h> 20 + #include <sys/mman.h> 20 21 #include "kselftest.h" 21 22 22 23 static inline int _no_printf(const char *format, ...) { return 0; } ··· 85 84 VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB, 86 85 VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB, 87 86 VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, 87 + VM_MEM_SRC_SHMEM, 88 + VM_MEM_SRC_SHARED_HUGETLB, 88 89 NUM_SRC_TYPES, 89 90 }; 90 91 ··· 102 99 size_t get_backing_src_pagesz(uint32_t i); 103 100 void backing_src_help(void); 104 101 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); 102 + 103 + /* 104 + * Whether or not the given source type is shared memory (as opposed to 105 + * anonymous). 106 + */ 107 + static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) 108 + { 109 + return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; 110 + } 105 111 106 112 #endif /* SELFTEST_KVM_TEST_UTIL_H */
+220 -58
tools/testing/selftests/kvm/lib/kvm_util.c
··· 32 32 } 33 33 34 34 /* 35 + * Open KVM_DEV_PATH if available, otherwise exit the entire program. 36 + * 37 + * Input Args: 38 + * flags - The flags to pass when opening KVM_DEV_PATH. 39 + * 40 + * Return: 41 + * The opened file descriptor of /dev/kvm. 42 + */ 43 + static int _open_kvm_dev_path_or_exit(int flags) 44 + { 45 + int fd; 46 + 47 + fd = open(KVM_DEV_PATH, flags); 48 + if (fd < 0) { 49 + print_skip("%s not available, is KVM loaded? (errno: %d)", 50 + KVM_DEV_PATH, errno); 51 + exit(KSFT_SKIP); 52 + } 53 + 54 + return fd; 55 + } 56 + 57 + int open_kvm_dev_path_or_exit(void) 58 + { 59 + return _open_kvm_dev_path_or_exit(O_RDONLY); 60 + } 61 + 62 + /* 35 63 * Capability 36 64 * 37 65 * Input Args: ··· 80 52 int ret; 81 53 int kvm_fd; 82 54 83 - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); 84 - if (kvm_fd < 0) 85 - exit(KSFT_SKIP); 86 - 55 + kvm_fd = open_kvm_dev_path_or_exit(); 87 56 ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); 88 57 TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n" 89 58 " rc: %i errno: %i", ret, errno); ··· 153 128 154 129 static void vm_open(struct kvm_vm *vm, int perm) 155 130 { 156 - vm->kvm_fd = open(KVM_DEV_PATH, perm); 157 - if (vm->kvm_fd < 0) 158 - exit(KSFT_SKIP); 131 + vm->kvm_fd = _open_kvm_dev_path_or_exit(perm); 159 132 160 133 if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { 161 134 print_skip("immediate_exit not available"); ··· 226 203 TEST_ASSERT(vm != NULL, "Insufficient Memory"); 227 204 228 205 INIT_LIST_HEAD(&vm->vcpus); 229 - INIT_LIST_HEAD(&vm->userspace_mem_regions); 206 + vm->regions.gpa_tree = RB_ROOT; 207 + vm->regions.hva_tree = RB_ROOT; 208 + hash_init(vm->regions.slot_hash); 230 209 231 210 vm->mode = mode; 232 211 vm->type = 0; ··· 320 295 */ 321 296 uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; 322 297 uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; 323 - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; 298 + uint64_t 
pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages; 324 299 struct kvm_vm *vm; 325 300 int i; 326 301 ··· 380 355 */ 381 356 void kvm_vm_restart(struct kvm_vm *vmp, int perm) 382 357 { 358 + int ctr; 383 359 struct userspace_mem_region *region; 384 360 385 361 vm_open(vmp, perm); 386 362 if (vmp->has_irqchip) 387 363 vm_create_irqchip(vmp); 388 364 389 - list_for_each_entry(region, &vmp->userspace_mem_regions, list) { 365 + hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { 390 366 int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region); 391 367 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" 392 368 " rc: %i errno: %i\n" ··· 450 424 static struct userspace_mem_region * 451 425 userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) 452 426 { 453 - struct userspace_mem_region *region; 427 + struct rb_node *node; 454 428 455 - list_for_each_entry(region, &vm->userspace_mem_regions, list) { 429 + for (node = vm->regions.gpa_tree.rb_node; node; ) { 430 + struct userspace_mem_region *region = 431 + container_of(node, struct userspace_mem_region, gpa_node); 456 432 uint64_t existing_start = region->region.guest_phys_addr; 457 433 uint64_t existing_end = region->region.guest_phys_addr 458 434 + region->region.memory_size - 1; 459 435 if (start <= existing_end && end >= existing_start) 460 436 return region; 437 + 438 + if (start < existing_start) 439 + node = node->rb_left; 440 + else 441 + node = node->rb_right; 461 442 } 462 443 463 444 return NULL; ··· 579 546 } 580 547 581 548 static void __vm_mem_region_delete(struct kvm_vm *vm, 582 - struct userspace_mem_region *region) 549 + struct userspace_mem_region *region, 550 + bool unlink) 583 551 { 584 552 int ret; 585 553 586 - list_del(&region->list); 554 + if (unlink) { 555 + rb_erase(&region->gpa_node, &vm->regions.gpa_tree); 556 + rb_erase(&region->hva_node, &vm->regions.hva_tree); 557 + hash_del(&region->slot_node); 558 + } 
587 559 588 560 region->region.memory_size = 0; 589 561 ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); ··· 607 569 */ 608 570 void kvm_vm_free(struct kvm_vm *vmp) 609 571 { 610 - struct userspace_mem_region *region, *tmp; 572 + int ctr; 573 + struct hlist_node *node; 574 + struct userspace_mem_region *region; 611 575 612 576 if (vmp == NULL) 613 577 return; 614 578 615 579 /* Free userspace_mem_regions. */ 616 - list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) 617 - __vm_mem_region_delete(vmp, region); 580 + hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) 581 + __vm_mem_region_delete(vmp, region, false); 618 582 619 583 /* Free sparsebit arrays. */ 620 584 sparsebit_free(&vmp->vpages_valid); ··· 698 658 return 0; 699 659 } 700 660 661 + static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, 662 + struct userspace_mem_region *region) 663 + { 664 + struct rb_node **cur, *parent; 665 + 666 + for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { 667 + struct userspace_mem_region *cregion; 668 + 669 + cregion = container_of(*cur, typeof(*cregion), gpa_node); 670 + parent = *cur; 671 + if (region->region.guest_phys_addr < 672 + cregion->region.guest_phys_addr) 673 + cur = &(*cur)->rb_left; 674 + else { 675 + TEST_ASSERT(region->region.guest_phys_addr != 676 + cregion->region.guest_phys_addr, 677 + "Duplicate GPA in region tree"); 678 + 679 + cur = &(*cur)->rb_right; 680 + } 681 + } 682 + 683 + rb_link_node(&region->gpa_node, parent, cur); 684 + rb_insert_color(&region->gpa_node, gpa_tree); 685 + } 686 + 687 + static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, 688 + struct userspace_mem_region *region) 689 + { 690 + struct rb_node **cur, *parent; 691 + 692 + for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { 693 + struct userspace_mem_region *cregion; 694 + 695 + cregion = container_of(*cur, typeof(*cregion), hva_node); 696 + parent = *cur; 697 + if 
(region->host_mem < cregion->host_mem) 698 + cur = &(*cur)->rb_left; 699 + else { 700 + TEST_ASSERT(region->host_mem != 701 + cregion->host_mem, 702 + "Duplicate HVA in region tree"); 703 + 704 + cur = &(*cur)->rb_right; 705 + } 706 + } 707 + 708 + rb_link_node(&region->hva_node, parent, cur); 709 + rb_insert_color(&region->hva_node, hva_tree); 710 + } 711 + 701 712 /* 702 713 * VM Userspace Memory Region Add 703 714 * 704 715 * Input Args: 705 716 * vm - Virtual Machine 706 - * backing_src - Storage source for this region. 707 - * NULL to use anonymous memory. 717 + * src_type - Storage source for this region. 718 + * NULL to use anonymous memory. 708 719 * guest_paddr - Starting guest physical address 709 720 * slot - KVM region slot 710 721 * npages - Number of physical pages ··· 813 722 (uint64_t) region->region.memory_size); 814 723 815 724 /* Confirm no region with the requested slot already exists. */ 816 - list_for_each_entry(region, &vm->userspace_mem_regions, list) { 725 + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, 726 + slot) { 817 727 if (region->region.slot != slot) 818 728 continue; 819 729 ··· 847 755 if (alignment > 1) 848 756 region->mmap_size += alignment; 849 757 758 + region->fd = -1; 759 + if (backing_src_is_shared(src_type)) { 760 + int memfd_flags = MFD_CLOEXEC; 761 + 762 + if (src_type == VM_MEM_SRC_SHARED_HUGETLB) 763 + memfd_flags |= MFD_HUGETLB; 764 + 765 + region->fd = memfd_create("kvm_selftest", memfd_flags); 766 + TEST_ASSERT(region->fd != -1, 767 + "memfd_create failed, errno: %i", errno); 768 + 769 + ret = ftruncate(region->fd, region->mmap_size); 770 + TEST_ASSERT(ret == 0, "ftruncate failed, errno: %i", errno); 771 + 772 + ret = fallocate(region->fd, 773 + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 774 + region->mmap_size); 775 + TEST_ASSERT(ret == 0, "fallocate failed, errno: %i", errno); 776 + } 777 + 850 778 region->mmap_start = mmap(NULL, region->mmap_size, 851 779 PROT_READ | PROT_WRITE, 852 - 
MAP_PRIVATE | MAP_ANONYMOUS 853 - | vm_mem_backing_src_alias(src_type)->flag, 854 - -1, 0); 780 + vm_mem_backing_src_alias(src_type)->flag, 781 + region->fd, 0); 855 782 TEST_ASSERT(region->mmap_start != MAP_FAILED, 856 783 "test_malloc failed, mmap_start: %p errno: %i", 857 784 region->mmap_start, errno); ··· 904 793 ret, errno, slot, flags, 905 794 guest_paddr, (uint64_t) region->region.memory_size); 906 795 907 - /* Add to linked-list of memory regions. */ 908 - list_add(&region->list, &vm->userspace_mem_regions); 796 + /* Add to quick lookup data structures */ 797 + vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); 798 + vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); 799 + hash_add(vm->regions.slot_hash, &region->slot_node, slot); 800 + 801 + /* If shared memory, create an alias. */ 802 + if (region->fd >= 0) { 803 + region->mmap_alias = mmap(NULL, region->mmap_size, 804 + PROT_READ | PROT_WRITE, 805 + vm_mem_backing_src_alias(src_type)->flag, 806 + region->fd, 0); 807 + TEST_ASSERT(region->mmap_alias != MAP_FAILED, 808 + "mmap of alias failed, errno: %i", errno); 809 + 810 + /* Align host alias address */ 811 + region->host_alias = align(region->mmap_alias, alignment); 812 + } 909 813 } 910 814 911 815 /* ··· 943 817 { 944 818 struct userspace_mem_region *region; 945 819 946 - list_for_each_entry(region, &vm->userspace_mem_regions, list) { 820 + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, 821 + memslot) 947 822 if (region->region.slot == memslot) 948 823 return region; 949 - } 950 824 951 825 fprintf(stderr, "No mem region with the requested slot found,\n" 952 826 " requested slot: %u\n", memslot); ··· 1031 905 */ 1032 906 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) 1033 907 { 1034 - __vm_mem_region_delete(vm, memslot2region(vm, slot)); 908 + __vm_mem_region_delete(vm, memslot2region(vm, slot), true); 1035 909 } 1036 910 1037 911 /* ··· 1051 925 { 1052 926 int dev_fd, ret; 1053 927 1054 
- dev_fd = open(KVM_DEV_PATH, O_RDONLY); 1055 - if (dev_fd < 0) 1056 - exit(KSFT_SKIP); 928 + dev_fd = open_kvm_dev_path_or_exit(); 1057 929 1058 930 ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); 1059 931 TEST_ASSERT(ret >= sizeof(struct kvm_run), ··· 1223 1099 uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); 1224 1100 1225 1101 virt_pgd_alloc(vm, pgd_memslot); 1102 + vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, 1103 + KVM_UTIL_MIN_PFN * vm->page_size, 1104 + data_memslot); 1226 1105 1227 1106 /* 1228 1107 * Find an unused range of virtual page addresses of at least ··· 1235 1108 1236 1109 /* Map the virtual pages. */ 1237 1110 for (vm_vaddr_t vaddr = vaddr_start; pages > 0; 1238 - pages--, vaddr += vm->page_size) { 1239 - vm_paddr_t paddr; 1240 - 1241 - paddr = vm_phy_page_alloc(vm, 1242 - KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); 1111 + pages--, vaddr += vm->page_size, paddr += vm->page_size) { 1243 1112 1244 1113 virt_pg_map(vm, vaddr, paddr, pgd_memslot); 1245 1114 ··· 1300 1177 { 1301 1178 struct userspace_mem_region *region; 1302 1179 1303 - list_for_each_entry(region, &vm->userspace_mem_regions, list) { 1304 - if ((gpa >= region->region.guest_phys_addr) 1305 - && (gpa <= (region->region.guest_phys_addr 1306 - + region->region.memory_size - 1))) 1307 - return (void *) ((uintptr_t) region->host_mem 1308 - + (gpa - region->region.guest_phys_addr)); 1180 + region = userspace_mem_region_find(vm, gpa, gpa); 1181 + if (!region) { 1182 + TEST_FAIL("No vm physical memory at 0x%lx", gpa); 1183 + return NULL; 1309 1184 } 1310 1185 1311 - TEST_FAIL("No vm physical memory at 0x%lx", gpa); 1312 - return NULL; 1186 + return (void *)((uintptr_t)region->host_mem 1187 + + (gpa - region->region.guest_phys_addr)); 1313 1188 } 1314 1189 1315 1190 /* ··· 1329 1208 */ 1330 1209 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) 1331 1210 { 1332 - struct userspace_mem_region *region; 1211 + struct rb_node *node; 1333 1212 1334 - 
list_for_each_entry(region, &vm->userspace_mem_regions, list) { 1335 - if ((hva >= region->host_mem) 1336 - && (hva <= (region->host_mem 1337 - + region->region.memory_size - 1))) 1338 - return (vm_paddr_t) ((uintptr_t) 1339 - region->region.guest_phys_addr 1340 - + (hva - (uintptr_t) region->host_mem)); 1213 + for (node = vm->regions.hva_tree.rb_node; node; ) { 1214 + struct userspace_mem_region *region = 1215 + container_of(node, struct userspace_mem_region, hva_node); 1216 + 1217 + if (hva >= region->host_mem) { 1218 + if (hva <= (region->host_mem 1219 + + region->region.memory_size - 1)) 1220 + return (vm_paddr_t)((uintptr_t) 1221 + region->region.guest_phys_addr 1222 + + (hva - (uintptr_t)region->host_mem)); 1223 + 1224 + node = node->rb_right; 1225 + } else 1226 + node = node->rb_left; 1341 1227 } 1342 1228 1343 1229 TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); 1344 1230 return -1; 1231 + } 1232 + 1233 + /* 1234 + * Address VM physical to Host Virtual *alias*. 1235 + * 1236 + * Input Args: 1237 + * vm - Virtual Machine 1238 + * gpa - VM physical address 1239 + * 1240 + * Output Args: None 1241 + * 1242 + * Return: 1243 + * Equivalent address within the host virtual *alias* area, or NULL 1244 + * (without failing the test) if the guest memory is not shared (so 1245 + * no alias exists). 1246 + * 1247 + * When vm_create() and related functions are called with a shared memory 1248 + * src_type, we also create a writable, shared alias mapping of the 1249 + * underlying guest memory. This allows the host to manipulate guest memory 1250 + * without mapping that memory in the guest's address space. And, for 1251 + * userfaultfd-based demand paging, we can do so without triggering userfaults. 
1252 + */ 1253 + void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) 1254 + { 1255 + struct userspace_mem_region *region; 1256 + uintptr_t offset; 1257 + 1258 + region = userspace_mem_region_find(vm, gpa, gpa); 1259 + if (!region) 1260 + return NULL; 1261 + 1262 + if (!region->host_alias) 1263 + return NULL; 1264 + 1265 + offset = gpa - region->region.guest_phys_addr; 1266 + return (void *) ((uintptr_t) region->host_alias + offset); 1345 1267 } 1346 1268 1347 1269 /* ··· 1986 1822 */ 1987 1823 void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) 1988 1824 { 1825 + int ctr; 1989 1826 struct userspace_mem_region *region; 1990 1827 struct vcpu *vcpu; 1991 1828 ··· 1994 1829 fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); 1995 1830 fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); 1996 1831 fprintf(stream, "%*sMem Regions:\n", indent, ""); 1997 - list_for_each_entry(region, &vm->userspace_mem_regions, list) { 1832 + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { 1998 1833 fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " 1999 1834 "host_virt: %p\n", indent + 2, "", 2000 1835 (uint64_t) region->region.guest_phys_addr, ··· 2180 2015 2181 2016 if (vm == NULL) { 2182 2017 /* Ensure that the KVM vendor-specific module is loaded. */ 2183 - f = fopen(KVM_DEV_PATH, "r"); 2184 - TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d", 2185 - errno); 2186 - fclose(f); 2018 + close(open_kvm_dev_path_or_exit()); 2187 2019 } 2188 2020 2189 2021 f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); ··· 2203 2041 return vm->page_shift; 2204 2042 } 2205 2043 2206 - unsigned int vm_get_max_gfn(struct kvm_vm *vm) 2044 + uint64_t vm_get_max_gfn(struct kvm_vm *vm) 2207 2045 { 2208 2046 return vm->max_gfn; 2209 2047 }
+15 -2
tools/testing/selftests/kvm/lib/kvm_util_internal.h
··· 8 8 #ifndef SELFTEST_KVM_UTIL_INTERNAL_H 9 9 #define SELFTEST_KVM_UTIL_INTERNAL_H 10 10 11 + #include "linux/hashtable.h" 12 + #include "linux/rbtree.h" 13 + 11 14 #include "sparsebit.h" 12 15 13 16 struct userspace_mem_region { ··· 19 16 int fd; 20 17 off_t offset; 21 18 void *host_mem; 19 + void *host_alias; 22 20 void *mmap_start; 21 + void *mmap_alias; 23 22 size_t mmap_size; 24 - struct list_head list; 23 + struct rb_node gpa_node; 24 + struct rb_node hva_node; 25 + struct hlist_node slot_node; 25 26 }; 26 27 27 28 struct vcpu { ··· 36 29 struct kvm_dirty_gfn *dirty_gfns; 37 30 uint32_t fetch_index; 38 31 uint32_t dirty_gfns_count; 32 + }; 33 + 34 + struct userspace_mem_regions { 35 + struct rb_root gpa_tree; 36 + struct rb_root hva_tree; 37 + DECLARE_HASHTABLE(slot_hash, 9); 39 38 }; 40 39 41 40 struct kvm_vm { ··· 56 43 unsigned int va_bits; 57 44 uint64_t max_gfn; 58 45 struct list_head vcpus; 59 - struct list_head userspace_mem_regions; 46 + struct userspace_mem_regions regions; 60 47 struct sparsebit *vpages_valid; 61 48 struct sparsebit *vpages_mapped; 62 49 bool has_irqchip;
+3 -1
tools/testing/selftests/kvm/lib/perf_test_util.c
··· 2 2 /* 3 3 * Copyright (C) 2020, Google LLC. 4 4 */ 5 + #include <inttypes.h> 5 6 6 7 #include "kvm_util.h" 7 8 #include "perf_test_util.h" ··· 81 80 */ 82 81 TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), 83 82 "Requested more guest memory than address space allows.\n" 84 - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", 83 + " guest pages: %" PRIx64 " max gfn: %" PRIx64 84 + " vcpus: %d wss: %" PRIx64 "]\n", 85 85 guest_num_pages, vm_get_max_gfn(vm), vcpus, 86 86 vcpu_memory_bytes); 87 87
+1
tools/testing/selftests/kvm/lib/rbtree.c
··· 1 + #include "../../../../lib/rbtree.c"
+35 -16
tools/testing/selftests/kvm/lib/test_util.c
··· 168 168 169 169 const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) 170 170 { 171 + static const int anon_flags = MAP_PRIVATE | MAP_ANONYMOUS; 172 + static const int anon_huge_flags = anon_flags | MAP_HUGETLB; 173 + 171 174 static const struct vm_mem_backing_src_alias aliases[] = { 172 175 [VM_MEM_SRC_ANONYMOUS] = { 173 176 .name = "anonymous", 174 - .flag = 0, 177 + .flag = anon_flags, 175 178 }, 176 179 [VM_MEM_SRC_ANONYMOUS_THP] = { 177 180 .name = "anonymous_thp", 178 - .flag = 0, 181 + .flag = anon_flags, 179 182 }, 180 183 [VM_MEM_SRC_ANONYMOUS_HUGETLB] = { 181 184 .name = "anonymous_hugetlb", 182 - .flag = MAP_HUGETLB, 185 + .flag = anon_huge_flags, 183 186 }, 184 187 [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = { 185 188 .name = "anonymous_hugetlb_16kb", 186 - .flag = MAP_HUGETLB | MAP_HUGE_16KB, 189 + .flag = anon_huge_flags | MAP_HUGE_16KB, 187 190 }, 188 191 [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { 189 192 .name = "anonymous_hugetlb_64kb", 190 - .flag = MAP_HUGETLB | MAP_HUGE_64KB, 193 + .flag = anon_huge_flags | MAP_HUGE_64KB, 191 194 }, 192 195 [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { 193 196 .name = "anonymous_hugetlb_512kb", 194 - .flag = MAP_HUGETLB | MAP_HUGE_512KB, 197 + .flag = anon_huge_flags | MAP_HUGE_512KB, 195 198 }, 196 199 [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { 197 200 .name = "anonymous_hugetlb_1mb", 198 - .flag = MAP_HUGETLB | MAP_HUGE_1MB, 201 + .flag = anon_huge_flags | MAP_HUGE_1MB, 199 202 }, 200 203 [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { 201 204 .name = "anonymous_hugetlb_2mb", 202 - .flag = MAP_HUGETLB | MAP_HUGE_2MB, 205 + .flag = anon_huge_flags | MAP_HUGE_2MB, 203 206 }, 204 207 [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { 205 208 .name = "anonymous_hugetlb_8mb", 206 - .flag = MAP_HUGETLB | MAP_HUGE_8MB, 209 + .flag = anon_huge_flags | MAP_HUGE_8MB, 207 210 }, 208 211 [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { 209 212 .name = "anonymous_hugetlb_16mb", 210 - .flag = MAP_HUGETLB | MAP_HUGE_16MB, 213 + .flag = 
anon_huge_flags | MAP_HUGE_16MB, 211 214 }, 212 215 [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { 213 216 .name = "anonymous_hugetlb_32mb", 214 - .flag = MAP_HUGETLB | MAP_HUGE_32MB, 217 + .flag = anon_huge_flags | MAP_HUGE_32MB, 215 218 }, 216 219 [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { 217 220 .name = "anonymous_hugetlb_256mb", 218 - .flag = MAP_HUGETLB | MAP_HUGE_256MB, 221 + .flag = anon_huge_flags | MAP_HUGE_256MB, 219 222 }, 220 223 [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { 221 224 .name = "anonymous_hugetlb_512mb", 222 - .flag = MAP_HUGETLB | MAP_HUGE_512MB, 225 + .flag = anon_huge_flags | MAP_HUGE_512MB, 223 226 }, 224 227 [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { 225 228 .name = "anonymous_hugetlb_1gb", 226 - .flag = MAP_HUGETLB | MAP_HUGE_1GB, 229 + .flag = anon_huge_flags | MAP_HUGE_1GB, 227 230 }, 228 231 [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { 229 232 .name = "anonymous_hugetlb_2gb", 230 - .flag = MAP_HUGETLB | MAP_HUGE_2GB, 233 + .flag = anon_huge_flags | MAP_HUGE_2GB, 231 234 }, 232 235 [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { 233 236 .name = "anonymous_hugetlb_16gb", 234 - .flag = MAP_HUGETLB | MAP_HUGE_16GB, 237 + .flag = anon_huge_flags | MAP_HUGE_16GB, 238 + }, 239 + [VM_MEM_SRC_SHMEM] = { 240 + .name = "shmem", 241 + .flag = MAP_SHARED, 242 + }, 243 + [VM_MEM_SRC_SHARED_HUGETLB] = { 244 + .name = "shared_hugetlb", 245 + /* 246 + * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since 247 + * we're using "file backed" memory, we need to specify 248 + * this when the FD is created, not when the area is 249 + * mapped. 
250 + */ 251 + .flag = MAP_SHARED, 235 252 }, 236 253 }; 237 254 _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, ··· 267 250 268 251 switch (i) { 269 252 case VM_MEM_SRC_ANONYMOUS: 253 + case VM_MEM_SRC_SHMEM: 270 254 return getpagesize(); 271 255 case VM_MEM_SRC_ANONYMOUS_THP: 272 256 return get_trans_hugepagesz(); 273 257 case VM_MEM_SRC_ANONYMOUS_HUGETLB: 258 + case VM_MEM_SRC_SHARED_HUGETLB: 274 259 return get_def_hugetlb_pagesz(); 275 260 default: 276 261 return MAP_HUGE_PAGE_SIZE(flag);
+4 -12
tools/testing/selftests/kvm/lib/x86_64/processor.c
··· 657 657 return cpuid; 658 658 659 659 cpuid = allocate_kvm_cpuid2(); 660 - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); 661 - if (kvm_fd < 0) 662 - exit(KSFT_SKIP); 660 + kvm_fd = open_kvm_dev_path_or_exit(); 663 661 664 662 ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); 665 663 TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", ··· 689 691 690 692 buffer.header.nmsrs = 1; 691 693 buffer.entry.index = msr_index; 692 - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); 693 - if (kvm_fd < 0) 694 - exit(KSFT_SKIP); 694 + kvm_fd = open_kvm_dev_path_or_exit(); 695 695 696 696 r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); 697 697 TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" ··· 982 986 struct kvm_msr_list *list; 983 987 int nmsrs, r, kvm_fd; 984 988 985 - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); 986 - if (kvm_fd < 0) 987 - exit(KSFT_SKIP); 989 + kvm_fd = open_kvm_dev_path_or_exit(); 988 990 989 991 nmsrs = kvm_get_num_msrs_fd(kvm_fd); 990 992 list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); ··· 1306 1312 return cpuid; 1307 1313 1308 1314 cpuid = allocate_kvm_cpuid2(); 1309 - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); 1310 - if (kvm_fd < 0) 1311 - exit(KSFT_SKIP); 1315 + kvm_fd = open_kvm_dev_path_or_exit(); 1312 1316 1313 1317 ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 1314 1318 TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n",
+11 -7
tools/testing/selftests/kvm/memslot_modification_stress_test.c
··· 71 71 }; 72 72 73 73 static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, 74 - uint64_t nr_modifications, uint64_t gpa) 74 + uint64_t nr_modifications) 75 75 { 76 + const uint64_t pages = 1; 77 + uint64_t gpa; 76 78 int i; 79 + 80 + /* 81 + * Add the dummy memslot just below the perf_test_util memslot, which is 82 + * at the top of the guest physical address space. 83 + */ 84 + gpa = guest_test_phys_mem - pages * vm_get_page_size(vm); 77 85 78 86 for (i = 0; i < nr_modifications; i++) { 79 87 usleep(delay); 80 88 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, 81 - DUMMY_MEMSLOT_INDEX, 1, 0); 89 + DUMMY_MEMSLOT_INDEX, pages, 0); 82 90 83 91 vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX); 84 92 } ··· 128 120 pr_info("Started all vCPUs\n"); 129 121 130 122 add_remove_memslot(vm, p->memslot_modification_delay, 131 - p->nr_memslot_modifications, 132 - guest_test_phys_mem + 133 - (guest_percpu_mem_size * nr_vcpus) + 134 - perf_test_args.host_page_size + 135 - perf_test_args.guest_page_size); 123 + p->nr_memslot_modifications); 136 124 137 125 run_vcpus = false; 138 126
+1037
tools/testing/selftests/kvm/memslot_perf_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * A memslot-related performance benchmark. 4 + * 5 + * Copyright (C) 2021 Oracle and/or its affiliates. 6 + * 7 + * Basic guest setup / host vCPU thread code lifted from set_memory_region_test. 8 + */ 9 + #include <pthread.h> 10 + #include <sched.h> 11 + #include <semaphore.h> 12 + #include <stdatomic.h> 13 + #include <stdbool.h> 14 + #include <stdint.h> 15 + #include <stdio.h> 16 + #include <stdlib.h> 17 + #include <string.h> 18 + #include <sys/mman.h> 19 + #include <time.h> 20 + #include <unistd.h> 21 + 22 + #include <linux/compiler.h> 23 + 24 + #include <test_util.h> 25 + #include <kvm_util.h> 26 + #include <processor.h> 27 + 28 + #define VCPU_ID 0 29 + 30 + #define MEM_SIZE ((512U << 20) + 4096) 31 + #define MEM_SIZE_PAGES (MEM_SIZE / 4096) 32 + #define MEM_GPA 0x10000000UL 33 + #define MEM_AUX_GPA MEM_GPA 34 + #define MEM_SYNC_GPA MEM_AUX_GPA 35 + #define MEM_TEST_GPA (MEM_AUX_GPA + 4096) 36 + #define MEM_TEST_SIZE (MEM_SIZE - 4096) 37 + static_assert(MEM_SIZE % 4096 == 0, "invalid mem size"); 38 + static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size"); 39 + 40 + /* 41 + * 32 MiB is max size that gets well over 100 iterations on 509 slots. 42 + * Considering that each slot needs to have at least one page up to 43 + * 8194 slots in use can then be tested (although with slightly 44 + * limited resolution). 
45 + */ 46 + #define MEM_SIZE_MAP ((32U << 20) + 4096) 47 + #define MEM_SIZE_MAP_PAGES (MEM_SIZE_MAP / 4096) 48 + #define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096) 49 + #define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096) 50 + static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size"); 51 + static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size"); 52 + static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size"); 53 + static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size"); 54 + 55 + /* 56 + * 128 MiB is min size that fills 32k slots with at least one page in each 57 + * while at the same time gets 100+ iterations in such test 58 + */ 59 + #define MEM_TEST_UNMAP_SIZE (128U << 20) 60 + #define MEM_TEST_UNMAP_SIZE_PAGES (MEM_TEST_UNMAP_SIZE / 4096) 61 + /* 2 MiB chunk size like a typical huge page */ 62 + #define MEM_TEST_UNMAP_CHUNK_PAGES (2U << (20 - 12)) 63 + static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE, 64 + "invalid unmap test region size"); 65 + static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0, 66 + "invalid unmap test region size"); 67 + static_assert(MEM_TEST_UNMAP_SIZE_PAGES % 68 + (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0, 69 + "invalid unmap test region size"); 70 + 71 + /* 72 + * For the move active test the middle of the test area is placed on 73 + * a memslot boundary: half lies in the memslot being moved, half in 74 + * other memslot(s). 75 + * 76 + * When running this test with 32k memslots (32764, really) each memslot 77 + * contains 4 pages. 78 + * The last one additionally contains the remaining 21 pages of memory, 79 + * for the total size of 25 pages. 80 + * Hence, the maximum size here is 50 pages. 
81 + */ 82 + #define MEM_TEST_MOVE_SIZE_PAGES (50) 83 + #define MEM_TEST_MOVE_SIZE (MEM_TEST_MOVE_SIZE_PAGES * 4096) 84 + #define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE) 85 + static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, 86 + "invalid move test region size"); 87 + 88 + #define MEM_TEST_VAL_1 0x1122334455667788 89 + #define MEM_TEST_VAL_2 0x99AABBCCDDEEFF00 90 + 91 + struct vm_data { 92 + struct kvm_vm *vm; 93 + pthread_t vcpu_thread; 94 + uint32_t nslots; 95 + uint64_t npages; 96 + uint64_t pages_per_slot; 97 + void **hva_slots; 98 + bool mmio_ok; 99 + uint64_t mmio_gpa_min; 100 + uint64_t mmio_gpa_max; 101 + }; 102 + 103 + struct sync_area { 104 + atomic_bool start_flag; 105 + atomic_bool exit_flag; 106 + atomic_bool sync_flag; 107 + void *move_area_ptr; 108 + }; 109 + 110 + /* 111 + * Technically, we need also for the atomic bool to be address-free, which 112 + * is recommended, but not strictly required, by C11 for lockless 113 + * implementations. 114 + * However, in practice both GCC and Clang fulfill this requirement on 115 + * all KVM-supported platforms. 116 + */ 117 + static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); 118 + 119 + static sem_t vcpu_ready; 120 + 121 + static bool map_unmap_verify; 122 + 123 + static bool verbose; 124 + #define pr_info_v(...) 
\ 125 + do { \ 126 + if (verbose) \ 127 + pr_info(__VA_ARGS__); \ 128 + } while (0) 129 + 130 + static void *vcpu_worker(void *data) 131 + { 132 + struct vm_data *vm = data; 133 + struct kvm_run *run; 134 + struct ucall uc; 135 + uint64_t cmd; 136 + 137 + run = vcpu_state(vm->vm, VCPU_ID); 138 + while (1) { 139 + vcpu_run(vm->vm, VCPU_ID); 140 + 141 + if (run->exit_reason == KVM_EXIT_IO) { 142 + cmd = get_ucall(vm->vm, VCPU_ID, &uc); 143 + if (cmd != UCALL_SYNC) 144 + break; 145 + 146 + sem_post(&vcpu_ready); 147 + continue; 148 + } 149 + 150 + if (run->exit_reason != KVM_EXIT_MMIO) 151 + break; 152 + 153 + TEST_ASSERT(vm->mmio_ok, "Unexpected mmio exit"); 154 + TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read"); 155 + TEST_ASSERT(run->mmio.len == 8, 156 + "Unexpected exit mmio size = %u", run->mmio.len); 157 + TEST_ASSERT(run->mmio.phys_addr >= vm->mmio_gpa_min && 158 + run->mmio.phys_addr <= vm->mmio_gpa_max, 159 + "Unexpected exit mmio address = 0x%llx", 160 + run->mmio.phys_addr); 161 + } 162 + 163 + if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) 164 + TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], 165 + __FILE__, uc.args[1], uc.args[2]); 166 + 167 + return NULL; 168 + } 169 + 170 + static void wait_for_vcpu(void) 171 + { 172 + struct timespec ts; 173 + 174 + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), 175 + "clock_gettime() failed: %d\n", errno); 176 + 177 + ts.tv_sec += 2; 178 + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), 179 + "sem_timedwait() failed: %d\n", errno); 180 + } 181 + 182 + static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) 183 + { 184 + uint64_t gpage, pgoffs; 185 + uint32_t slot, slotoffs; 186 + void *base; 187 + 188 + TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate"); 189 + TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096, 190 + "Too high gpa to translate"); 191 + gpa -= MEM_GPA; 192 + 193 + gpage = gpa / 4096; 194 + pgoffs = gpa % 4096; 195 + slot = min(gpage / 
data->pages_per_slot, (uint64_t)data->nslots - 1); 196 + slotoffs = gpage - (slot * data->pages_per_slot); 197 + 198 + if (rempages) { 199 + uint64_t slotpages; 200 + 201 + if (slot == data->nslots - 1) 202 + slotpages = data->npages - slot * data->pages_per_slot; 203 + else 204 + slotpages = data->pages_per_slot; 205 + 206 + TEST_ASSERT(!pgoffs, 207 + "Asking for remaining pages in slot but gpa not page aligned"); 208 + *rempages = slotpages - slotoffs; 209 + } 210 + 211 + base = data->hva_slots[slot]; 212 + return (uint8_t *)base + slotoffs * 4096 + pgoffs; 213 + } 214 + 215 + static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot) 216 + { 217 + TEST_ASSERT(slot < data->nslots, "Too high slot number"); 218 + 219 + return MEM_GPA + slot * data->pages_per_slot * 4096; 220 + } 221 + 222 + static struct vm_data *alloc_vm(void) 223 + { 224 + struct vm_data *data; 225 + 226 + data = malloc(sizeof(*data)); 227 + TEST_ASSERT(data, "malloc(vmdata) failed"); 228 + 229 + data->vm = NULL; 230 + data->hva_slots = NULL; 231 + 232 + return data; 233 + } 234 + 235 + static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, 236 + void *guest_code, uint64_t mempages, 237 + struct timespec *slot_runtime) 238 + { 239 + uint32_t max_mem_slots; 240 + uint64_t rempages; 241 + uint64_t guest_addr; 242 + uint32_t slot; 243 + struct timespec tstart; 244 + struct sync_area *sync; 245 + 246 + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); 247 + TEST_ASSERT(max_mem_slots > 1, 248 + "KVM_CAP_NR_MEMSLOTS should be greater than 1"); 249 + TEST_ASSERT(nslots > 1 || nslots == -1, 250 + "Slot count cap should be greater than 1"); 251 + if (nslots != -1) 252 + max_mem_slots = min(max_mem_slots, (uint32_t)nslots); 253 + pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots); 254 + 255 + TEST_ASSERT(mempages > 1, 256 + "Can't test without any memory"); 257 + 258 + data->npages = mempages; 259 + data->nslots = max_mem_slots - 1; 260 + 
data->pages_per_slot = mempages / data->nslots; 261 + if (!data->pages_per_slot) { 262 + *maxslots = mempages + 1; 263 + return false; 264 + } 265 + 266 + rempages = mempages % data->nslots; 267 + data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); 268 + TEST_ASSERT(data->hva_slots, "malloc() fail"); 269 + 270 + data->vm = vm_create_default(VCPU_ID, 1024, guest_code); 271 + 272 + pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", 273 + max_mem_slots - 1, data->pages_per_slot, rempages); 274 + 275 + clock_gettime(CLOCK_MONOTONIC, &tstart); 276 + for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) { 277 + uint64_t npages; 278 + 279 + npages = data->pages_per_slot; 280 + if (slot == max_mem_slots - 1) 281 + npages += rempages; 282 + 283 + vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS, 284 + guest_addr, slot, npages, 285 + 0); 286 + guest_addr += npages * 4096; 287 + } 288 + *slot_runtime = timespec_elapsed(tstart); 289 + 290 + for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) { 291 + uint64_t npages; 292 + uint64_t gpa; 293 + 294 + npages = data->pages_per_slot; 295 + if (slot == max_mem_slots - 2) 296 + npages += rempages; 297 + 298 + gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, 299 + slot + 1); 300 + TEST_ASSERT(gpa == guest_addr, 301 + "vm_phy_pages_alloc() failed\n"); 302 + 303 + data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr); 304 + memset(data->hva_slots[slot], 0, npages * 4096); 305 + 306 + guest_addr += npages * 4096; 307 + } 308 + 309 + virt_map(data->vm, MEM_GPA, MEM_GPA, mempages, 0); 310 + 311 + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); 312 + atomic_init(&sync->start_flag, false); 313 + atomic_init(&sync->exit_flag, false); 314 + atomic_init(&sync->sync_flag, false); 315 + 316 + data->mmio_ok = false; 317 + 318 + return true; 319 + } 320 + 321 + static void launch_vm(struct vm_data *data) 322 + { 323 + 
pr_info_v("Launching the test VM\n"); 324 + 325 + pthread_create(&data->vcpu_thread, NULL, vcpu_worker, data); 326 + 327 + /* Ensure the guest thread is spun up. */ 328 + wait_for_vcpu(); 329 + } 330 + 331 + static void free_vm(struct vm_data *data) 332 + { 333 + kvm_vm_free(data->vm); 334 + free(data->hva_slots); 335 + free(data); 336 + } 337 + 338 + static void wait_guest_exit(struct vm_data *data) 339 + { 340 + pthread_join(data->vcpu_thread, NULL); 341 + } 342 + 343 + static void let_guest_run(struct sync_area *sync) 344 + { 345 + atomic_store_explicit(&sync->start_flag, true, memory_order_release); 346 + } 347 + 348 + static void guest_spin_until_start(void) 349 + { 350 + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; 351 + 352 + while (!atomic_load_explicit(&sync->start_flag, memory_order_acquire)) 353 + ; 354 + } 355 + 356 + static void make_guest_exit(struct sync_area *sync) 357 + { 358 + atomic_store_explicit(&sync->exit_flag, true, memory_order_release); 359 + } 360 + 361 + static bool _guest_should_exit(void) 362 + { 363 + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; 364 + 365 + return atomic_load_explicit(&sync->exit_flag, memory_order_acquire); 366 + } 367 + 368 + #define guest_should_exit() unlikely(_guest_should_exit()) 369 + 370 + /* 371 + * noinline so we can easily see how much time the host spends waiting 372 + * for the guest. 373 + * For the same reason use alarm() instead of polling clock_gettime() 374 + * to implement a wait timeout. 
375 + */ 376 + static noinline void host_perform_sync(struct sync_area *sync) 377 + { 378 + alarm(2); 379 + 380 + atomic_store_explicit(&sync->sync_flag, true, memory_order_release); 381 + while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire)) 382 + ; 383 + 384 + alarm(0); 385 + } 386 + 387 + static bool guest_perform_sync(void) 388 + { 389 + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; 390 + bool expected; 391 + 392 + do { 393 + if (guest_should_exit()) 394 + return false; 395 + 396 + expected = true; 397 + } while (!atomic_compare_exchange_weak_explicit(&sync->sync_flag, 398 + &expected, false, 399 + memory_order_acq_rel, 400 + memory_order_relaxed)); 401 + 402 + return true; 403 + } 404 + 405 + static void guest_code_test_memslot_move(void) 406 + { 407 + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; 408 + uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr); 409 + 410 + GUEST_SYNC(0); 411 + 412 + guest_spin_until_start(); 413 + 414 + while (!guest_should_exit()) { 415 + uintptr_t ptr; 416 + 417 + for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE; 418 + ptr += 4096) 419 + *(uint64_t *)ptr = MEM_TEST_VAL_1; 420 + 421 + /* 422 + * No host sync here since the MMIO exits are so expensive 423 + * that the host would spend most of its time waiting for 424 + * the guest and so instead of measuring memslot move 425 + * performance we would measure the performance and 426 + * likelihood of MMIO exits 427 + */ 428 + } 429 + 430 + GUEST_DONE(); 431 + } 432 + 433 + static void guest_code_test_memslot_map(void) 434 + { 435 + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; 436 + 437 + GUEST_SYNC(0); 438 + 439 + guest_spin_until_start(); 440 + 441 + while (1) { 442 + uintptr_t ptr; 443 + 444 + for (ptr = MEM_TEST_GPA; 445 + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096) 446 + *(uint64_t *)ptr = MEM_TEST_VAL_1; 447 + 448 + if (!guest_perform_sync()) 449 + break; 450 + 451 + for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; 452 + 
ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096) 453 + *(uint64_t *)ptr = MEM_TEST_VAL_2; 454 + 455 + if (!guest_perform_sync()) 456 + break; 457 + } 458 + 459 + GUEST_DONE(); 460 + } 461 + 462 + static void guest_code_test_memslot_unmap(void) 463 + { 464 + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; 465 + 466 + GUEST_SYNC(0); 467 + 468 + guest_spin_until_start(); 469 + 470 + while (1) { 471 + uintptr_t ptr = MEM_TEST_GPA; 472 + 473 + /* 474 + * We can afford to access (map) just a small number of pages 475 + * per host sync as otherwise the host will spend 476 + * a significant amount of its time waiting for the guest 477 + * (instead of doing unmap operations), so this will 478 + * effectively turn this test into a map performance test. 479 + * 480 + * Just access a single page to be on the safe side. 481 + */ 482 + *(uint64_t *)ptr = MEM_TEST_VAL_1; 483 + 484 + if (!guest_perform_sync()) 485 + break; 486 + 487 + ptr += MEM_TEST_UNMAP_SIZE / 2; 488 + *(uint64_t *)ptr = MEM_TEST_VAL_2; 489 + 490 + if (!guest_perform_sync()) 491 + break; 492 + } 493 + 494 + GUEST_DONE(); 495 + } 496 + 497 + static void guest_code_test_memslot_rw(void) 498 + { 499 + GUEST_SYNC(0); 500 + 501 + guest_spin_until_start(); 502 + 503 + while (1) { 504 + uintptr_t ptr; 505 + 506 + for (ptr = MEM_TEST_GPA; 507 + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) 508 + *(uint64_t *)ptr = MEM_TEST_VAL_1; 509 + 510 + if (!guest_perform_sync()) 511 + break; 512 + 513 + for (ptr = MEM_TEST_GPA + 4096 / 2; 514 + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) { 515 + uint64_t val = *(uint64_t *)ptr; 516 + 517 + GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val); 518 + *(uint64_t *)ptr = 0; 519 + } 520 + 521 + if (!guest_perform_sync()) 522 + break; 523 + } 524 + 525 + GUEST_DONE(); 526 + } 527 + 528 + static bool test_memslot_move_prepare(struct vm_data *data, 529 + struct sync_area *sync, 530 + uint64_t *maxslots, bool isactive) 531 + { 532 + uint64_t movesrcgpa, movetestgpa; 533 + 534 + 
movesrcgpa = vm_slot2gpa(data, data->nslots - 1); 535 + 536 + if (isactive) { 537 + uint64_t lastpages; 538 + 539 + vm_gpa2hva(data, movesrcgpa, &lastpages); 540 + if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) { 541 + *maxslots = 0; 542 + return false; 543 + } 544 + } 545 + 546 + movetestgpa = movesrcgpa - (MEM_TEST_MOVE_SIZE / (isactive ? 2 : 1)); 547 + sync->move_area_ptr = (void *)movetestgpa; 548 + 549 + if (isactive) { 550 + data->mmio_ok = true; 551 + data->mmio_gpa_min = movesrcgpa; 552 + data->mmio_gpa_max = movesrcgpa + MEM_TEST_MOVE_SIZE / 2 - 1; 553 + } 554 + 555 + return true; 556 + } 557 + 558 + static bool test_memslot_move_prepare_active(struct vm_data *data, 559 + struct sync_area *sync, 560 + uint64_t *maxslots) 561 + { 562 + return test_memslot_move_prepare(data, sync, maxslots, true); 563 + } 564 + 565 + static bool test_memslot_move_prepare_inactive(struct vm_data *data, 566 + struct sync_area *sync, 567 + uint64_t *maxslots) 568 + { 569 + return test_memslot_move_prepare(data, sync, maxslots, false); 570 + } 571 + 572 + static void test_memslot_move_loop(struct vm_data *data, struct sync_area *sync) 573 + { 574 + uint64_t movesrcgpa; 575 + 576 + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); 577 + vm_mem_region_move(data->vm, data->nslots - 1 + 1, 578 + MEM_TEST_MOVE_GPA_DEST); 579 + vm_mem_region_move(data->vm, data->nslots - 1 + 1, movesrcgpa); 580 + } 581 + 582 + static void test_memslot_do_unmap(struct vm_data *data, 583 + uint64_t offsp, uint64_t count) 584 + { 585 + uint64_t gpa, ctr; 586 + 587 + for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) { 588 + uint64_t npages; 589 + void *hva; 590 + int ret; 591 + 592 + hva = vm_gpa2hva(data, gpa, &npages); 593 + TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa); 594 + npages = min(npages, count - ctr); 595 + ret = madvise(hva, npages * 4096, MADV_DONTNEED); 596 + TEST_ASSERT(!ret, 597 + "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64, 
598 + hva, gpa); 599 + ctr += npages; 600 + gpa += npages * 4096; 601 + } 602 + TEST_ASSERT(ctr == count, 603 + "madvise(MADV_DONTNEED) should exactly cover all of the requested area"); 604 + } 605 + 606 + static void test_memslot_map_unmap_check(struct vm_data *data, 607 + uint64_t offsp, uint64_t valexp) 608 + { 609 + uint64_t gpa; 610 + uint64_t *val; 611 + 612 + if (!map_unmap_verify) 613 + return; 614 + 615 + gpa = MEM_TEST_GPA + offsp * 4096; 616 + val = (typeof(val))vm_gpa2hva(data, gpa, NULL); 617 + TEST_ASSERT(*val == valexp, 618 + "Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")", 619 + *val, valexp, gpa); 620 + *val = 0; 621 + } 622 + 623 + static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) 624 + { 625 + /* 626 + * Unmap the second half of the test area while guest writes to (maps) 627 + * the first half. 628 + */ 629 + test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2, 630 + MEM_TEST_MAP_SIZE_PAGES / 2); 631 + 632 + /* 633 + * Wait for the guest to finish writing the first half of the test 634 + * area, verify the written value on the first and the last page of 635 + * this area and then unmap it. 636 + * Meanwhile, the guest is writing to (mapping) the second half of 637 + * the test area. 638 + */ 639 + host_perform_sync(sync); 640 + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); 641 + test_memslot_map_unmap_check(data, 642 + MEM_TEST_MAP_SIZE_PAGES / 2 - 1, 643 + MEM_TEST_VAL_1); 644 + test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2); 645 + 646 + 647 + /* 648 + * Wait for the guest to finish writing the second half of the test 649 + * area and verify the written value on the first and the last page 650 + * of this area. 651 + * The area will be unmapped at the beginning of the next loop 652 + * iteration. 653 + * Meanwhile, the guest is writing to (mapping) the first half of 654 + * the test area. 
655 + */ 656 + host_perform_sync(sync); 657 + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2, 658 + MEM_TEST_VAL_2); 659 + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1, 660 + MEM_TEST_VAL_2); 661 + } 662 + 663 + static void test_memslot_unmap_loop_common(struct vm_data *data, 664 + struct sync_area *sync, 665 + uint64_t chunk) 666 + { 667 + uint64_t ctr; 668 + 669 + /* 670 + * Wait for the guest to finish mapping page(s) in the first half 671 + * of the test area, verify the written value and then perform unmap 672 + * of this area. 673 + * Meanwhile, the guest is writing to (mapping) page(s) in the second 674 + * half of the test area. 675 + */ 676 + host_perform_sync(sync); 677 + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); 678 + for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk) 679 + test_memslot_do_unmap(data, ctr, chunk); 680 + 681 + /* Likewise, but for the opposite host / guest areas */ 682 + host_perform_sync(sync); 683 + test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2, 684 + MEM_TEST_VAL_2); 685 + for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2; 686 + ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk) 687 + test_memslot_do_unmap(data, ctr, chunk); 688 + } 689 + 690 + static void test_memslot_unmap_loop(struct vm_data *data, 691 + struct sync_area *sync) 692 + { 693 + test_memslot_unmap_loop_common(data, sync, 1); 694 + } 695 + 696 + static void test_memslot_unmap_loop_chunked(struct vm_data *data, 697 + struct sync_area *sync) 698 + { 699 + test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES); 700 + } 701 + 702 + static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync) 703 + { 704 + uint64_t gptr; 705 + 706 + for (gptr = MEM_TEST_GPA + 4096 / 2; 707 + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) 708 + *(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2; 709 + 710 + host_perform_sync(sync); 711 + 712 + for (gptr = MEM_TEST_GPA; 713 + gptr < 
MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) { 714 + uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL); 715 + uint64_t val = *vptr; 716 + 717 + TEST_ASSERT(val == MEM_TEST_VAL_1, 718 + "Guest written values should read back correctly (is %"PRIu64" @ %"PRIx64")", 719 + val, gptr); 720 + *vptr = 0; 721 + } 722 + 723 + host_perform_sync(sync); 724 + } 725 + 726 + struct test_data { 727 + const char *name; 728 + uint64_t mem_size; 729 + void (*guest_code)(void); 730 + bool (*prepare)(struct vm_data *data, struct sync_area *sync, 731 + uint64_t *maxslots); 732 + void (*loop)(struct vm_data *data, struct sync_area *sync); 733 + }; 734 + 735 + static bool test_execute(int nslots, uint64_t *maxslots, 736 + unsigned int maxtime, 737 + const struct test_data *tdata, 738 + uint64_t *nloops, 739 + struct timespec *slot_runtime, 740 + struct timespec *guest_runtime) 741 + { 742 + uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES; 743 + struct vm_data *data; 744 + struct sync_area *sync; 745 + struct timespec tstart; 746 + bool ret = true; 747 + 748 + data = alloc_vm(); 749 + if (!prepare_vm(data, nslots, maxslots, tdata->guest_code, 750 + mem_size, slot_runtime)) { 751 + ret = false; 752 + goto exit_free; 753 + } 754 + 755 + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); 756 + 757 + if (tdata->prepare && 758 + !tdata->prepare(data, sync, maxslots)) { 759 + ret = false; 760 + goto exit_free; 761 + } 762 + 763 + launch_vm(data); 764 + 765 + clock_gettime(CLOCK_MONOTONIC, &tstart); 766 + let_guest_run(sync); 767 + 768 + while (1) { 769 + *guest_runtime = timespec_elapsed(tstart); 770 + if (guest_runtime->tv_sec >= maxtime) 771 + break; 772 + 773 + tdata->loop(data, sync); 774 + 775 + (*nloops)++; 776 + } 777 + 778 + make_guest_exit(sync); 779 + wait_guest_exit(data); 780 + 781 + exit_free: 782 + free_vm(data); 783 + 784 + return ret; 785 + } 786 + 787 + static const struct test_data tests[] = { 788 + { 789 + .name = "map", 790 + .mem_size = 
MEM_SIZE_MAP_PAGES, 791 + .guest_code = guest_code_test_memslot_map, 792 + .loop = test_memslot_map_loop, 793 + }, 794 + { 795 + .name = "unmap", 796 + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, 797 + .guest_code = guest_code_test_memslot_unmap, 798 + .loop = test_memslot_unmap_loop, 799 + }, 800 + { 801 + .name = "unmap chunked", 802 + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, 803 + .guest_code = guest_code_test_memslot_unmap, 804 + .loop = test_memslot_unmap_loop_chunked, 805 + }, 806 + { 807 + .name = "move active area", 808 + .guest_code = guest_code_test_memslot_move, 809 + .prepare = test_memslot_move_prepare_active, 810 + .loop = test_memslot_move_loop, 811 + }, 812 + { 813 + .name = "move inactive area", 814 + .guest_code = guest_code_test_memslot_move, 815 + .prepare = test_memslot_move_prepare_inactive, 816 + .loop = test_memslot_move_loop, 817 + }, 818 + { 819 + .name = "RW", 820 + .guest_code = guest_code_test_memslot_rw, 821 + .loop = test_memslot_rw_loop 822 + }, 823 + }; 824 + 825 + #define NTESTS ARRAY_SIZE(tests) 826 + 827 + struct test_args { 828 + int tfirst; 829 + int tlast; 830 + int nslots; 831 + int seconds; 832 + int runs; 833 + }; 834 + 835 + static void help(char *name, struct test_args *targs) 836 + { 837 + int ctr; 838 + 839 + pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n", 840 + name); 841 + pr_info(" -h: print this help screen.\n"); 842 + pr_info(" -v: enable verbose mode (not for benchmarking).\n"); 843 + pr_info(" -d: enable extra debug checks.\n"); 844 + pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", 845 + targs->nslots); 846 + pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", 847 + targs->tfirst, NTESTS - 1); 848 + pr_info(" -e: specify the last test to run (currently: %i; max %zu)\n", 849 + targs->tlast, NTESTS - 1); 850 + pr_info(" -l: specify the test length in seconds (currently: %i)\n", 851 + targs->seconds); 
852 + pr_info(" -r: specify the number of runs per test (currently: %i)\n", 853 + targs->runs); 854 + 855 + pr_info("\nAvailable tests:\n"); 856 + for (ctr = 0; ctr < NTESTS; ctr++) 857 + pr_info("%d: %s\n", ctr, tests[ctr].name); 858 + } 859 + 860 + static bool parse_args(int argc, char *argv[], 861 + struct test_args *targs) 862 + { 863 + int opt; 864 + 865 + while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { 866 + switch (opt) { 867 + case 'h': 868 + default: 869 + help(argv[0], targs); 870 + return false; 871 + case 'v': 872 + verbose = true; 873 + break; 874 + case 'd': 875 + map_unmap_verify = true; 876 + break; 877 + case 's': 878 + targs->nslots = atoi(optarg); 879 + if (targs->nslots <= 0 && targs->nslots != -1) { 880 + pr_info("Slot count cap has to be positive or -1 for no cap\n"); 881 + return false; 882 + } 883 + break; 884 + case 'f': 885 + targs->tfirst = atoi(optarg); 886 + if (targs->tfirst < 0) { 887 + pr_info("First test to run has to be non-negative\n"); 888 + return false; 889 + } 890 + break; 891 + case 'e': 892 + targs->tlast = atoi(optarg); 893 + if (targs->tlast < 0 || targs->tlast >= NTESTS) { 894 + pr_info("Last test to run has to be non-negative and less than %zu\n", 895 + NTESTS); 896 + return false; 897 + } 898 + break; 899 + case 'l': 900 + targs->seconds = atoi(optarg); 901 + if (targs->seconds < 0) { 902 + pr_info("Test length in seconds has to be non-negative\n"); 903 + return false; 904 + } 905 + break; 906 + case 'r': 907 + targs->runs = atoi(optarg); 908 + if (targs->runs <= 0) { 909 + pr_info("Runs per test has to be positive\n"); 910 + return false; 911 + } 912 + break; 913 + } 914 + } 915 + 916 + if (optind < argc) { 917 + help(argv[0], targs); 918 + return false; 919 + } 920 + 921 + if (targs->tfirst > targs->tlast) { 922 + pr_info("First test to run cannot be greater than the last test to run\n"); 923 + return false; 924 + } 925 + 926 + return true; 927 + } 928 + 929 + struct test_result { 930 + struct timespec 
slot_runtime, guest_runtime, iter_runtime; 931 + int64_t slottimens, runtimens; 932 + uint64_t nloops; 933 + }; 934 + 935 + static bool test_loop(const struct test_data *data, 936 + const struct test_args *targs, 937 + struct test_result *rbestslottime, 938 + struct test_result *rbestruntime) 939 + { 940 + uint64_t maxslots; 941 + struct test_result result; 942 + 943 + result.nloops = 0; 944 + if (!test_execute(targs->nslots, &maxslots, targs->seconds, data, 945 + &result.nloops, 946 + &result.slot_runtime, &result.guest_runtime)) { 947 + if (maxslots) 948 + pr_info("Memslot count too high for this test, decrease the cap (max is %"PRIu64")\n", 949 + maxslots); 950 + else 951 + pr_info("Memslot count may be too high for this test, try adjusting the cap\n"); 952 + 953 + return false; 954 + } 955 + 956 + pr_info("Test took %ld.%.9lds for slot setup + %ld.%.9lds all iterations\n", 957 + result.slot_runtime.tv_sec, result.slot_runtime.tv_nsec, 958 + result.guest_runtime.tv_sec, result.guest_runtime.tv_nsec); 959 + if (!result.nloops) { 960 + pr_info("No full loops done - too short test time or system too loaded?\n"); 961 + return true; 962 + } 963 + 964 + result.iter_runtime = timespec_div(result.guest_runtime, 965 + result.nloops); 966 + pr_info("Done %"PRIu64" iterations, avg %ld.%.9lds each\n", 967 + result.nloops, 968 + result.iter_runtime.tv_sec, 969 + result.iter_runtime.tv_nsec); 970 + result.slottimens = timespec_to_ns(result.slot_runtime); 971 + result.runtimens = timespec_to_ns(result.iter_runtime); 972 + 973 + /* 974 + * Only rank the slot setup time for tests using the whole test memory 975 + * area so they are comparable 976 + */ 977 + if (!data->mem_size && 978 + (!rbestslottime->slottimens || 979 + result.slottimens < rbestslottime->slottimens)) 980 + *rbestslottime = result; 981 + if (!rbestruntime->runtimens || 982 + result.runtimens < rbestruntime->runtimens) 983 + *rbestruntime = result; 984 + 985 + return true; 986 + } 987 + 988 + int main(int argc, 
char *argv[]) 989 + { 990 + struct test_args targs = { 991 + .tfirst = 0, 992 + .tlast = NTESTS - 1, 993 + .nslots = -1, 994 + .seconds = 5, 995 + .runs = 1, 996 + }; 997 + struct test_result rbestslottime; 998 + int tctr; 999 + 1000 + /* Tell stdout not to buffer its content */ 1001 + setbuf(stdout, NULL); 1002 + 1003 + if (!parse_args(argc, argv, &targs)) 1004 + return -1; 1005 + 1006 + rbestslottime.slottimens = 0; 1007 + for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) { 1008 + const struct test_data *data = &tests[tctr]; 1009 + unsigned int runctr; 1010 + struct test_result rbestruntime; 1011 + 1012 + if (tctr > targs.tfirst) 1013 + pr_info("\n"); 1014 + 1015 + pr_info("Testing %s performance with %i runs, %d seconds each\n", 1016 + data->name, targs.runs, targs.seconds); 1017 + 1018 + rbestruntime.runtimens = 0; 1019 + for (runctr = 0; runctr < targs.runs; runctr++) 1020 + if (!test_loop(data, &targs, 1021 + &rbestslottime, &rbestruntime)) 1022 + break; 1023 + 1024 + if (rbestruntime.runtimens) 1025 + pr_info("Best runtime result was %ld.%.9lds per iteration (with %"PRIu64" iterations)\n", 1026 + rbestruntime.iter_runtime.tv_sec, 1027 + rbestruntime.iter_runtime.tv_nsec, 1028 + rbestruntime.nloops); 1029 + } 1030 + 1031 + if (rbestslottime.slottimens) 1032 + pr_info("Best slot setup time for the whole test area was %ld.%.9lds\n", 1033 + rbestslottime.slot_runtime.tv_sec, 1034 + rbestslottime.slot_runtime.tv_nsec); 1035 + 1036 + return 0; 1037 + }
+5
tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
··· 19 19 u32 function; 20 20 u32 index; 21 21 } mangled_cpuids[] = { 22 + /* 23 + * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR, 24 + * which are not controlled for by this test. 25 + */ 22 26 {.function = 0xd, .index = 0}, 27 + {.function = 0xd, .index = 1}, 23 28 }; 24 29 25 30 static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
+2 -6
tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
··· 37 37 int old_res, res, kvm_fd, r; 38 38 struct kvm_msr_list *list; 39 39 40 - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); 41 - if (kvm_fd < 0) 42 - exit(KSFT_SKIP); 40 + kvm_fd = open_kvm_dev_path_or_exit(); 43 41 44 42 old_res = kvm_num_index_msrs(kvm_fd, 0); 45 43 TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); ··· 99 101 int res, old_res, i, kvm_fd; 100 102 struct kvm_msr_list *feature_list; 101 103 102 - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); 103 - if (kvm_fd < 0) 104 - exit(KSFT_SKIP); 104 + kvm_fd = open_kvm_dev_path_or_exit(); 105 105 106 106 old_res = kvm_num_feature_msrs(kvm_fd, 0); 107 107 TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0");
+4 -2
virt/kvm/kvm_main.c
··· 307 307 { 308 308 return kvm_make_all_cpus_request_except(kvm, req, NULL); 309 309 } 310 + EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); 310 311 311 312 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 312 313 void kvm_flush_remote_tlbs(struct kvm *kvm) ··· 2930 2929 goto out; 2931 2930 if (signal_pending(current)) 2932 2931 goto out; 2932 + if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) 2933 + goto out; 2933 2934 2934 2935 ret = 0; 2935 2936 out: ··· 2976 2973 goto out; 2977 2974 } 2978 2975 poll_end = cur = ktime_get(); 2979 - } while (single_task_running() && !need_resched() && 2980 - ktime_before(cur, stop)); 2976 + } while (kvm_vcpu_can_poll(cur, stop)); 2981 2977 } 2982 2978 2983 2979 prepare_to_rcuwait(&vcpu->wait);
+6 -10
virt/lib/irqbypass.c
··· 40 40 if (prod->add_consumer) 41 41 ret = prod->add_consumer(prod, cons); 42 42 43 - if (ret) 44 - goto err_add_consumer; 45 - 46 - ret = cons->add_producer(cons, prod); 47 - if (ret) 48 - goto err_add_producer; 43 + if (!ret) { 44 + ret = cons->add_producer(cons, prod); 45 + if (ret && prod->del_consumer) 46 + prod->del_consumer(prod, cons); 47 + } 49 48 50 49 if (cons->start) 51 50 cons->start(cons); 52 51 if (prod->start) 53 52 prod->start(prod); 54 - err_add_producer: 55 - if (prod->del_consumer) 56 - prod->del_consumer(prod, cons); 57 - err_add_consumer: 53 + 58 54 return ret; 59 55 } 60 56