Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvmarm-fixes-6.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 fixes for 6.14, take #2

- Large set of fixes for vector handling, especially in the interactions
between host and guest state. This fixes a number of bugs affecting
actual deployments, and greatly simplifies the FP/SIMD/SVE handling.
Thanks to Mark Rutland for dealing with this thankless task.

- Fix an ugly race between vcpu and vgic creation/init, resulting in
unexpected behaviours.

- Fix use of kernel VAs at EL2 when emulating timers with nVHE.

- Small set of pKVM improvements and cleanups.

+286 -374
-42
arch/arm64/include/asm/kvm_emulate.h
··· 605 605 __cpacr_to_cptr_set(clr, set));\ 606 606 } while (0) 607 607 608 - static __always_inline void kvm_write_cptr_el2(u64 val) 609 - { 610 - if (has_vhe() || has_hvhe()) 611 - write_sysreg(val, cpacr_el1); 612 - else 613 - write_sysreg(val, cptr_el2); 614 - } 615 - 616 - /* Resets the value of cptr_el2 when returning to the host. */ 617 - static __always_inline void __kvm_reset_cptr_el2(struct kvm *kvm) 618 - { 619 - u64 val; 620 - 621 - if (has_vhe()) { 622 - val = (CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN); 623 - if (cpus_have_final_cap(ARM64_SME)) 624 - val |= CPACR_EL1_SMEN_EL1EN; 625 - } else if (has_hvhe()) { 626 - val = CPACR_EL1_FPEN; 627 - 628 - if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) 629 - val |= CPACR_EL1_ZEN; 630 - if (cpus_have_final_cap(ARM64_SME)) 631 - val |= CPACR_EL1_SMEN; 632 - } else { 633 - val = CPTR_NVHE_EL2_RES1; 634 - 635 - if (kvm_has_sve(kvm) && guest_owns_fp_regs()) 636 - val |= CPTR_EL2_TZ; 637 - if (!cpus_have_final_cap(ARM64_SME)) 638 - val |= CPTR_EL2_TSM; 639 - } 640 - 641 - kvm_write_cptr_el2(val); 642 - } 643 - 644 - #ifdef __KVM_NVHE_HYPERVISOR__ 645 - #define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2(kern_hyp_va((v)->kvm)) 646 - #else 647 - #define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2((v)->kvm) 648 - #endif 649 - 650 608 /* 651 609 * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE 652 610 * format if E2H isn't set.
+5 -17
arch/arm64/include/asm/kvm_host.h
··· 100 100 static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, 101 101 void *(*to_va)(phys_addr_t phys)) 102 102 { 103 - phys_addr_t *p = to_va(mc->head); 103 + phys_addr_t *p = to_va(mc->head & PAGE_MASK); 104 104 105 105 if (!mc->nr_pages) 106 106 return NULL; ··· 615 615 struct kvm_host_data { 616 616 #define KVM_HOST_DATA_FLAG_HAS_SPE 0 617 617 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 618 - #define KVM_HOST_DATA_FLAG_HOST_SVE_ENABLED 2 619 - #define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 620 618 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 621 619 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 622 620 unsigned long flags; ··· 622 624 struct kvm_cpu_context host_ctxt; 623 625 624 626 /* 625 - * All pointers in this union are hyp VA. 627 + * Hyp VA. 626 628 * sve_state is only used in pKVM and if system_supports_sve(). 627 629 */ 628 - union { 629 - struct user_fpsimd_state *fpsimd_state; 630 - struct cpu_sve_state *sve_state; 631 - }; 630 + struct cpu_sve_state *sve_state; 632 631 633 - union { 634 - /* HYP VA pointer to the host storage for FPMR */ 635 - u64 *fpmr_ptr; 636 - /* 637 - * Used by pKVM only, as it needs to provide storage 638 - * for the host 639 - */ 640 - u64 fpmr; 641 - }; 632 + /* Used by pKVM only. */ 633 + u64 fpmr; 642 634 643 635 /* Ownership of the FP regs */ 644 636 enum {
-25
arch/arm64/kernel/fpsimd.c
··· 1695 1695 } 1696 1696 1697 1697 /* 1698 - * Called by KVM when entering the guest. 1699 - */ 1700 - void fpsimd_kvm_prepare(void) 1701 - { 1702 - if (!system_supports_sve()) 1703 - return; 1704 - 1705 - /* 1706 - * KVM does not save host SVE state since we can only enter 1707 - * the guest from a syscall so the ABI means that only the 1708 - * non-saved SVE state needs to be saved. If we have left 1709 - * SVE enabled for performance reasons then update the task 1710 - * state to be FPSIMD only. 1711 - */ 1712 - get_cpu_fpsimd_context(); 1713 - 1714 - if (test_and_clear_thread_flag(TIF_SVE)) { 1715 - sve_to_fpsimd(current); 1716 - current->thread.fp_type = FP_STATE_FPSIMD; 1717 - } 1718 - 1719 - put_cpu_fpsimd_context(); 1720 - } 1721 - 1722 - /* 1723 1698 * Associate current's FPSIMD context with this cpu 1724 1699 * The caller must have ownership of the cpu FPSIMD context before calling 1725 1700 * this function.
+7 -9
arch/arm64/kvm/arch_timer.c
··· 447 447 static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, 448 448 struct arch_timer_context *timer_ctx) 449 449 { 450 - int ret; 451 - 452 450 kvm_timer_update_status(timer_ctx, new_level); 453 451 454 452 timer_ctx->irq.level = new_level; 455 453 trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), 456 454 timer_ctx->irq.level); 457 455 458 - if (!userspace_irqchip(vcpu->kvm)) { 459 - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, 460 - timer_irq(timer_ctx), 461 - timer_ctx->irq.level, 462 - timer_ctx); 463 - WARN_ON(ret); 464 - } 456 + if (userspace_irqchip(vcpu->kvm)) 457 + return; 458 + 459 + kvm_vgic_inject_irq(vcpu->kvm, vcpu, 460 + timer_irq(timer_ctx), 461 + timer_ctx->irq.level, 462 + timer_ctx); 465 463 } 466 464 467 465 /* Only called for a fully emulated timer */
-8
arch/arm64/kvm/arm.c
··· 2481 2481 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = 2482 2482 kern_hyp_va(sve_state); 2483 2483 } 2484 - } else { 2485 - for_each_possible_cpu(cpu) { 2486 - struct user_fpsimd_state *fpsimd_state; 2487 - 2488 - fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs; 2489 - per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state = 2490 - kern_hyp_va(fpsimd_state); 2491 - } 2492 2484 } 2493 2485 } 2494 2486
+9 -98
arch/arm64/kvm/fpsimd.c
··· 54 54 if (!system_supports_fpsimd()) 55 55 return; 56 56 57 - fpsimd_kvm_prepare(); 58 - 59 57 /* 60 - * We will check TIF_FOREIGN_FPSTATE just before entering the 61 - * guest in kvm_arch_vcpu_ctxflush_fp() and override this to 62 - * FP_STATE_FREE if the flag set. 58 + * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such 59 + * that the host kernel is responsible for restoring this state upon 60 + * return to userspace, and the hyp code doesn't need to save anything. 61 + * 62 + * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures 63 + * that PSTATE.{SM,ZA} == {0,0}. 63 64 */ 64 - *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; 65 - *host_data_ptr(fpsimd_state) = kern_hyp_va(&current->thread.uw.fpsimd_state); 66 - *host_data_ptr(fpmr_ptr) = kern_hyp_va(&current->thread.uw.fpmr); 65 + fpsimd_save_and_flush_cpu_state(); 66 + *host_data_ptr(fp_owner) = FP_STATE_FREE; 67 67 68 - host_data_clear_flag(HOST_SVE_ENABLED); 69 - if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) 70 - host_data_set_flag(HOST_SVE_ENABLED); 71 - 72 - if (system_supports_sme()) { 73 - host_data_clear_flag(HOST_SME_ENABLED); 74 - if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) 75 - host_data_set_flag(HOST_SME_ENABLED); 76 - 77 - /* 78 - * If PSTATE.SM is enabled then save any pending FP 79 - * state and disable PSTATE.SM. If we leave PSTATE.SM 80 - * enabled and the guest does not enable SME via 81 - * CPACR_EL1.SMEN then operations that should be valid 82 - * may generate SME traps from EL1 to EL1 which we 83 - * can't intercept and which would confuse the guest. 84 - * 85 - * Do the same for PSTATE.ZA in the case where there 86 - * is state in the registers which has not already 87 - * been saved, this is very unlikely to happen. 
88 - */ 89 - if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { 90 - *host_data_ptr(fp_owner) = FP_STATE_FREE; 91 - fpsimd_save_and_flush_cpu_state(); 92 - } 93 - } 94 - 95 - /* 96 - * If normal guests gain SME support, maintain this behavior for pKVM 97 - * guests, which don't support SME. 98 - */ 99 - WARN_ON(is_protected_kvm_enabled() && system_supports_sme() && 100 - read_sysreg_s(SYS_SVCR)); 68 + WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); 101 69 } 102 70 103 71 /* ··· 130 162 131 163 local_irq_save(flags); 132 164 133 - /* 134 - * If we have VHE then the Hyp code will reset CPACR_EL1 to 135 - * the default value and we need to reenable SME. 136 - */ 137 - if (has_vhe() && system_supports_sme()) { 138 - /* Also restore EL0 state seen on entry */ 139 - if (host_data_test_flag(HOST_SME_ENABLED)) 140 - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_SMEN); 141 - else 142 - sysreg_clear_set(CPACR_EL1, 143 - CPACR_EL1_SMEN_EL0EN, 144 - CPACR_EL1_SMEN_EL1EN); 145 - isb(); 146 - } 147 - 148 165 if (guest_owns_fp_regs()) { 149 - if (vcpu_has_sve(vcpu)) { 150 - u64 zcr = read_sysreg_el1(SYS_ZCR); 151 - 152 - /* 153 - * If the vCPU is in the hyp context then ZCR_EL1 is 154 - * loaded with its vEL2 counterpart. 155 - */ 156 - __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr; 157 - 158 - /* 159 - * Restore the VL that was saved when bound to the CPU, 160 - * which is the maximum VL for the guest. Because the 161 - * layout of the data when saving the sve state depends 162 - * on the VL, we need to use a consistent (i.e., the 163 - * maximum) VL. 164 - * Note that this means that at guest exit ZCR_EL1 is 165 - * not necessarily the same as on guest entry. 166 - * 167 - * ZCR_EL2 holds the guest hypervisor's VL when running 168 - * a nested guest, which could be smaller than the 169 - * max for the vCPU. Similar to above, we first need to 170 - * switch to a VL consistent with the layout of the 171 - * vCPU's SVE state. 
KVM support for NV implies VHE, so 172 - * using the ZCR_EL1 alias is safe. 173 - */ 174 - if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) 175 - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, 176 - SYS_ZCR_EL1); 177 - } 178 - 179 166 /* 180 167 * Flush (save and invalidate) the fpsimd/sve state so that if 181 168 * the host tries to use fpsimd/sve, it's not using stale data ··· 142 219 * when needed. 143 220 */ 144 221 fpsimd_save_and_flush_cpu_state(); 145 - } else if (has_vhe() && system_supports_sve()) { 146 - /* 147 - * The FPSIMD/SVE state in the CPU has not been touched, and we 148 - * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been 149 - * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE 150 - * for EL0. To avoid spurious traps, restore the trap state 151 - * seen by kvm_arch_vcpu_load_fp(): 152 - */ 153 - if (host_data_test_flag(HOST_SVE_ENABLED)) 154 - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); 155 - else 156 - sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); 157 222 } 158 223 159 224 local_irq_restore(flags);
+5
arch/arm64/kvm/hyp/entry.S
··· 44 44 alternative_else_nop_endif 45 45 mrs x1, isr_el1 46 46 cbz x1, 1f 47 + 48 + // Ensure that __guest_enter() always provides a context 49 + // synchronization event so that callers don't need ISBs for anything 50 + // that would usually be synchronized by the ERET. 51 + isb 47 52 mov x0, #ARM_EXCEPTION_IRQ 48 53 ret 49 54
+111 -37
arch/arm64/kvm/hyp/include/hyp/switch.h
··· 326 326 return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); 327 327 } 328 328 329 - static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) 329 + static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) 330 330 { 331 331 *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); 332 332 arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); ··· 375 375 true); 376 376 } 377 377 378 - static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); 378 + static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) 379 + { 380 + u64 zcr_el1, zcr_el2; 381 + 382 + if (!guest_owns_fp_regs()) 383 + return; 384 + 385 + if (vcpu_has_sve(vcpu)) { 386 + /* A guest hypervisor may restrict the effective max VL. */ 387 + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) 388 + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); 389 + else 390 + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; 391 + 392 + write_sysreg_el2(zcr_el2, SYS_ZCR); 393 + 394 + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); 395 + write_sysreg_el1(zcr_el1, SYS_ZCR); 396 + } 397 + } 398 + 399 + static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) 400 + { 401 + u64 zcr_el1, zcr_el2; 402 + 403 + if (!guest_owns_fp_regs()) 404 + return; 405 + 406 + /* 407 + * When the guest owns the FP regs, we know that guest+hyp traps for 408 + * any FPSIMD/SVE/SME features exposed to the guest have been disabled 409 + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() 410 + * prior to __guest_entry(). As __guest_entry() guarantees a context 411 + * synchronization event, we don't need an ISB here to avoid taking 412 + * traps for anything that was exposed to the guest. 413 + */ 414 + if (vcpu_has_sve(vcpu)) { 415 + zcr_el1 = read_sysreg_el1(SYS_ZCR); 416 + __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; 417 + 418 + /* 419 + * The guest's state is always saved using the guest's max VL. 
420 + * Ensure that the host has the guest's max VL active such that 421 + * the host can save the guest's state lazily, but don't 422 + * artificially restrict the host to the guest's max VL. 423 + */ 424 + if (has_vhe()) { 425 + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; 426 + write_sysreg_el2(zcr_el2, SYS_ZCR); 427 + } else { 428 + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; 429 + write_sysreg_el2(zcr_el2, SYS_ZCR); 430 + 431 + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; 432 + write_sysreg_el1(zcr_el1, SYS_ZCR); 433 + } 434 + } 435 + } 436 + 437 + static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) 438 + { 439 + /* 440 + * Non-protected kvm relies on the host restoring its sve state. 441 + * Protected kvm restores the host's sve state as not to reveal that 442 + * fpsimd was used by a guest nor leak upper sve bits. 443 + */ 444 + if (system_supports_sve()) { 445 + __hyp_sve_save_host(); 446 + 447 + /* Re-enable SVE traps if not supported for the guest vcpu. */ 448 + if (!vcpu_has_sve(vcpu)) 449 + cpacr_clear_set(CPACR_EL1_ZEN, 0); 450 + 451 + } else { 452 + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); 453 + } 454 + 455 + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) 456 + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); 457 + } 458 + 379 459 380 460 /* 381 461 * We trap the first access to the FP/SIMD to save the host context and ··· 463 383 * If FP/SIMD is not implemented, handle the trap and inject an undefined 464 384 * instruction exception to the guest. Similarly for trapped SVE accesses. 
465 385 */ 466 - static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) 386 + static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) 467 387 { 468 388 bool sve_guest; 469 389 u8 esr_ec; ··· 505 425 isb(); 506 426 507 427 /* Write out the host state if it's in the registers */ 508 - if (host_owns_fp_regs()) 428 + if (is_protected_kvm_enabled() && host_owns_fp_regs()) 509 429 kvm_hyp_save_fpsimd_host(vcpu); 510 430 511 431 /* Restore the guest state */ ··· 581 501 return true; 582 502 } 583 503 504 + /* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ 505 + static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) 506 + { 507 + u64 offset = 0; 508 + 509 + if (ctxt->offset.vm_offset) 510 + offset += *kern_hyp_va(ctxt->offset.vm_offset); 511 + if (ctxt->offset.vcpu_offset) 512 + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); 513 + 514 + return offset; 515 + } 516 + 584 517 static inline u64 compute_counter_value(struct arch_timer_context *ctxt) 585 518 { 586 - return arch_timer_read_cntpct_el0() - timer_get_offset(ctxt); 519 + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); 587 520 } 588 521 589 522 static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) ··· 680 587 return true; 681 588 } 682 589 683 - static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) 590 + static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) 684 591 { 685 592 if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && 686 593 handle_tx2_tvm(vcpu)) ··· 700 607 return false; 701 608 } 702 609 703 - static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) 610 + static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) 704 611 { 705 612 if (static_branch_unlikely(&vgic_v3_cpuif_trap) && 706 613 __vgic_v3_perform_cpuif_access(vcpu) == 1) ··· 709 616 return false; 710 617 } 711 618 712 - static bool 
kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code) 619 + static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, 620 + u64 *exit_code) 713 621 { 714 622 if (!__populate_fault_info(vcpu)) 715 623 return true; 716 624 717 625 return false; 718 626 } 719 - static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) 720 - __alias(kvm_hyp_handle_memory_fault); 721 - static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code) 722 - __alias(kvm_hyp_handle_memory_fault); 627 + #define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault 628 + #define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault 723 629 724 - static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) 630 + static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) 725 631 { 726 632 if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) 727 633 return true; ··· 750 658 751 659 typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); 752 660 753 - static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); 754 - 755 - static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); 756 - 757 661 /* 758 662 * Allow the hypervisor to handle the exit with an exit handler if it has one. 759 663 * 760 664 * Returns true if the hypervisor handled the exit, and control should go back 761 665 * to the guest, or false if it hasn't. 
762 666 */ 763 - static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) 667 + static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, 668 + const exit_handler_fn *handlers) 764 669 { 765 - const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); 766 - exit_handler_fn fn; 767 - 768 - fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; 769 - 670 + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; 770 671 if (fn) 771 672 return fn(vcpu, exit_code); 772 673 ··· 789 704 * the guest, false when we should restore the host state and return to the 790 705 * main run loop. 791 706 */ 792 - static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) 707 + static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, 708 + const exit_handler_fn *handlers) 793 709 { 794 - /* 795 - * Save PSTATE early so that we can evaluate the vcpu mode 796 - * early on. 797 - */ 798 - synchronize_vcpu_pstate(vcpu, exit_code); 799 - 800 - /* 801 - * Check whether we want to repaint the state one way or 802 - * another. 803 - */ 804 - early_exit_filter(vcpu, exit_code); 805 - 806 710 if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) 807 711 vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); 808 712 ··· 821 747 goto exit; 822 748 823 749 /* Check if there's an exit handler and allow it to handle the exit. */ 824 - if (kvm_hyp_handle_exit(vcpu, exit_code)) 750 + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) 825 751 goto guest; 826 752 exit: 827 753 /* Return to the host kernel and handle the exit */
+7 -8
arch/arm64/kvm/hyp/nvhe/hyp-main.c
··· 5 5 */ 6 6 7 7 #include <hyp/adjust_pc.h> 8 + #include <hyp/switch.h> 8 9 9 10 #include <asm/pgtable-types.h> 10 11 #include <asm/kvm_asm.h> ··· 84 83 if (system_supports_sve()) 85 84 __hyp_sve_restore_host(); 86 85 else 87 - __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); 86 + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); 88 87 89 88 if (has_fpmr) 90 89 write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); ··· 225 224 226 225 sync_hyp_vcpu(hyp_vcpu); 227 226 } else { 227 + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); 228 + 228 229 /* The host is fully trusted, run its vCPU directly. */ 229 - ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); 230 + fpsimd_lazy_switch_to_guest(vcpu); 231 + ret = __kvm_vcpu_run(vcpu); 232 + fpsimd_lazy_switch_to_host(vcpu); 230 233 } 231 234 out: 232 235 cpu_reg(host_ctxt, 1) = ret; ··· 679 674 break; 680 675 case ESR_ELx_EC_SMC64: 681 676 handle_host_smc(host_ctxt); 682 - break; 683 - case ESR_ELx_EC_SVE: 684 - cpacr_clear_set(0, CPACR_EL1_ZEN); 685 - isb(); 686 - sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, 687 - SYS_ZCR_EL2); 688 677 break; 689 678 case ESR_ELx_EC_IABT_LOW: 690 679 case ESR_ELx_EC_DABT_LOW:
+41 -35
arch/arm64/kvm/hyp/nvhe/mem_protect.c
··· 943 943 ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); 944 944 if (ret) 945 945 return ret; 946 - if (level != KVM_PGTABLE_LAST_LEVEL) 947 - return -E2BIG; 948 946 if (!kvm_pte_valid(pte)) 949 947 return -ENOENT; 948 + if (level != KVM_PGTABLE_LAST_LEVEL) 949 + return -E2BIG; 950 950 951 951 state = guest_get_page_state(pte, ipa); 952 952 if (state != PKVM_PAGE_SHARED_BORROWED) ··· 998 998 return ret; 999 999 } 1000 1000 1001 - int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) 1001 + static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) 1002 1002 { 1003 - struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); 1004 - u64 ipa = hyp_pfn_to_phys(gfn); 1005 1003 u64 phys; 1006 1004 int ret; 1007 1005 1008 - if (prot & ~KVM_PGTABLE_PROT_RWX) 1009 - return -EINVAL; 1006 + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) 1007 + return; 1010 1008 1011 1009 host_lock_component(); 1012 1010 guest_lock_component(vm); 1013 1011 1014 1012 ret = __check_host_shared_guest(vm, &phys, ipa); 1015 - if (!ret) 1016 - ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); 1017 1013 1018 1014 guest_unlock_component(vm); 1019 1015 host_unlock_component(); 1016 + 1017 + WARN_ON(ret && ret != -ENOENT); 1018 + } 1019 + 1020 + int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) 1021 + { 1022 + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); 1023 + u64 ipa = hyp_pfn_to_phys(gfn); 1024 + int ret; 1025 + 1026 + if (pkvm_hyp_vm_is_protected(vm)) 1027 + return -EPERM; 1028 + 1029 + if (prot & ~KVM_PGTABLE_PROT_RWX) 1030 + return -EINVAL; 1031 + 1032 + assert_host_shared_guest(vm, ipa); 1033 + guest_lock_component(vm); 1034 + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); 1035 + guest_unlock_component(vm); 1020 1036 1021 1037 return ret; 1022 1038 } ··· 1040 1024 int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) 1041 1025 { 1042 1026 u64 
ipa = hyp_pfn_to_phys(gfn); 1043 - u64 phys; 1044 1027 int ret; 1045 1028 1046 - host_lock_component(); 1029 + if (pkvm_hyp_vm_is_protected(vm)) 1030 + return -EPERM; 1031 + 1032 + assert_host_shared_guest(vm, ipa); 1047 1033 guest_lock_component(vm); 1048 - 1049 - ret = __check_host_shared_guest(vm, &phys, ipa); 1050 - if (!ret) 1051 - ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); 1052 - 1034 + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); 1053 1035 guest_unlock_component(vm); 1054 - host_unlock_component(); 1055 1036 1056 1037 return ret; 1057 1038 } ··· 1056 1043 int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) 1057 1044 { 1058 1045 u64 ipa = hyp_pfn_to_phys(gfn); 1059 - u64 phys; 1060 1046 int ret; 1061 1047 1062 - host_lock_component(); 1048 + if (pkvm_hyp_vm_is_protected(vm)) 1049 + return -EPERM; 1050 + 1051 + assert_host_shared_guest(vm, ipa); 1063 1052 guest_lock_component(vm); 1064 - 1065 - ret = __check_host_shared_guest(vm, &phys, ipa); 1066 - if (!ret) 1067 - ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); 1068 - 1053 + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); 1069 1054 guest_unlock_component(vm); 1070 - host_unlock_component(); 1071 1055 1072 1056 return ret; 1073 1057 } ··· 1073 1063 { 1074 1064 struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); 1075 1065 u64 ipa = hyp_pfn_to_phys(gfn); 1076 - u64 phys; 1077 - int ret; 1078 1066 1079 - host_lock_component(); 1067 + if (pkvm_hyp_vm_is_protected(vm)) 1068 + return -EPERM; 1069 + 1070 + assert_host_shared_guest(vm, ipa); 1080 1071 guest_lock_component(vm); 1081 - 1082 - ret = __check_host_shared_guest(vm, &phys, ipa); 1083 - if (!ret) 1084 - kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); 1085 - 1072 + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); 1086 1073 guest_unlock_component(vm); 1087 - host_unlock_component(); 1088 1074 1089 - return ret; 1075 + return 0; 1090 1076 }
+45 -44
arch/arm64/kvm/hyp/nvhe/switch.c
··· 39 39 { 40 40 u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ 41 41 42 + if (!guest_owns_fp_regs()) 43 + __activate_traps_fpsimd32(vcpu); 44 + 42 45 if (has_hvhe()) { 43 46 val |= CPACR_EL1_TTA; 44 47 ··· 50 47 if (vcpu_has_sve(vcpu)) 51 48 val |= CPACR_EL1_ZEN; 52 49 } 50 + 51 + write_sysreg(val, cpacr_el1); 53 52 } else { 54 53 val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; 55 54 ··· 66 61 67 62 if (!guest_owns_fp_regs()) 68 63 val |= CPTR_EL2_TFP; 64 + 65 + write_sysreg(val, cptr_el2); 69 66 } 67 + } 70 68 71 - if (!guest_owns_fp_regs()) 72 - __activate_traps_fpsimd32(vcpu); 69 + static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) 70 + { 71 + if (has_hvhe()) { 72 + u64 val = CPACR_EL1_FPEN; 73 73 74 - kvm_write_cptr_el2(val); 74 + if (cpus_have_final_cap(ARM64_SVE)) 75 + val |= CPACR_EL1_ZEN; 76 + if (cpus_have_final_cap(ARM64_SME)) 77 + val |= CPACR_EL1_SMEN; 78 + 79 + write_sysreg(val, cpacr_el1); 80 + } else { 81 + u64 val = CPTR_NVHE_EL2_RES1; 82 + 83 + if (!cpus_have_final_cap(ARM64_SVE)) 84 + val |= CPTR_EL2_TZ; 85 + if (!cpus_have_final_cap(ARM64_SME)) 86 + val |= CPTR_EL2_TSM; 87 + 88 + write_sysreg(val, cptr_el2); 89 + } 75 90 } 76 91 77 92 static void __activate_traps(struct kvm_vcpu *vcpu) ··· 144 119 145 120 write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); 146 121 147 - kvm_reset_cptr_el2(vcpu); 122 + __deactivate_cptr_traps(vcpu); 148 123 write_sysreg(__kvm_hyp_host_vector, vbar_el2); 149 124 } 150 125 ··· 217 192 kvm_handle_pvm_sysreg(vcpu, exit_code)); 218 193 } 219 194 220 - static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) 221 - { 222 - /* 223 - * Non-protected kvm relies on the host restoring its sve state. 224 - * Protected kvm restores the host's sve state as not to reveal that 225 - * fpsimd was used by a guest nor leak upper sve bits. 
226 - */ 227 - if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { 228 - __hyp_sve_save_host(); 229 - 230 - /* Re-enable SVE traps if not supported for the guest vcpu. */ 231 - if (!vcpu_has_sve(vcpu)) 232 - cpacr_clear_set(CPACR_EL1_ZEN, 0); 233 - 234 - } else { 235 - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); 236 - } 237 - 238 - if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { 239 - u64 val = read_sysreg_s(SYS_FPMR); 240 - 241 - if (unlikely(is_protected_kvm_enabled())) 242 - *host_data_ptr(fpmr) = val; 243 - else 244 - **host_data_ptr(fpmr_ptr) = val; 245 - } 246 - } 247 - 248 195 static const exit_handler_fn hyp_exit_handlers[] = { 249 196 [0 ... ESR_ELx_EC_MAX] = NULL, 250 197 [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, ··· 248 251 return hyp_exit_handlers; 249 252 } 250 253 251 - /* 252 - * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. 253 - * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a 254 - * guest from dropping to AArch32 EL0 if implemented by the CPU. If the 255 - * hypervisor spots a guest in such a state ensure it is handled, and don't 256 - * trust the host to spot or fix it. The check below is based on the one in 257 - * kvm_arch_vcpu_ioctl_run(). 258 - * 259 - * Returns false if the guest ran in AArch32 when it shouldn't have, and 260 - * thus should exit to the host, or true if a the guest run loop can continue. 261 - */ 262 - static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) 254 + static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) 263 255 { 256 + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); 257 + 258 + synchronize_vcpu_pstate(vcpu, exit_code); 259 + 260 + /* 261 + * Some guests (e.g., protected VMs) are not be allowed to run in 262 + * AArch32. The ARMv8 architecture does not give the hypervisor a 263 + * mechanism to prevent a guest from dropping to AArch32 EL0 if 264 + * implemented by the CPU. 
If the hypervisor spots a guest in such a 265 + * state ensure it is handled, and don't trust the host to spot or fix 266 + * it. The check below is based on the one in 267 + * kvm_arch_vcpu_ioctl_run(). 268 + */ 264 269 if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { 265 270 /* 266 271 * As we have caught the guest red-handed, decide that it isn't ··· 275 276 *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); 276 277 *exit_code |= ARM_EXCEPTION_IL; 277 278 } 279 + 280 + return __fixup_guest_exit(vcpu, exit_code, handlers); 278 281 } 279 282 280 283 /* Switch to the guest for legacy non-VHE systems */
+19 -14
arch/arm64/kvm/hyp/vhe/switch.c
··· 136 136 write_sysreg(val, cpacr_el1); 137 137 } 138 138 139 + static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) 140 + { 141 + u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; 142 + 143 + if (cpus_have_final_cap(ARM64_SME)) 144 + val |= CPACR_EL1_SMEN_EL1EN; 145 + 146 + write_sysreg(val, cpacr_el1); 147 + } 148 + 139 149 static void __activate_traps(struct kvm_vcpu *vcpu) 140 150 { 141 151 u64 val; ··· 217 207 */ 218 208 asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); 219 209 220 - kvm_reset_cptr_el2(vcpu); 210 + __deactivate_cptr_traps(vcpu); 221 211 222 212 if (!arm64_kernel_unmapped_at_el0()) 223 213 host_vectors = __this_cpu_read(this_cpu_vector); ··· 423 413 return true; 424 414 } 425 415 426 - static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) 427 - { 428 - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); 429 - 430 - if (kvm_has_fpmr(vcpu->kvm)) 431 - **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); 432 - } 433 - 434 416 static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) 435 417 { 436 418 int ret = -EINVAL; ··· 540 538 [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, 541 539 }; 542 540 543 - static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) 541 + static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) 544 542 { 545 - return hyp_exit_handlers; 546 - } 543 + synchronize_vcpu_pstate(vcpu, exit_code); 547 544 548 - static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) 549 - { 550 545 /* 551 546 * If we were in HYP context on entry, adjust the PSTATE view 552 547 * so that the usual helpers work correctly. 
··· 563 564 *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); 564 565 *vcpu_cpsr(vcpu) |= mode; 565 566 } 567 + 568 + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); 566 569 } 567 570 568 571 /* Switch to the guest for VHE systems running in EL2 */ ··· 578 577 guest_ctxt = &vcpu->arch.ctxt; 579 578 580 579 sysreg_save_host_state_vhe(host_ctxt); 580 + 581 + fpsimd_lazy_switch_to_guest(vcpu); 581 582 582 583 /* 583 584 * Note that ARM erratum 1165522 requires us to configure both stage 1 ··· 604 601 sysreg_save_guest_state_vhe(guest_ctxt); 605 602 606 603 __deactivate_traps(vcpu); 604 + 605 + fpsimd_lazy_switch_to_host(vcpu); 607 606 608 607 sysreg_restore_host_state_vhe(host_ctxt); 609 608
+37 -37
arch/arm64/kvm/vgic/vgic-init.c
··· 34 34 * 35 35 * CPU Interface: 36 36 * 37 - * - kvm_vgic_vcpu_init(): initialization of static data that 38 - * doesn't depend on any sizing information or emulation type. No 39 - * allocation is allowed there. 37 + * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend 38 + * on any sizing information. Private interrupts are allocated if not 39 + * already allocated at vgic-creation time. 40 40 */ 41 41 42 42 /* EARLY INIT */ ··· 57 57 } 58 58 59 59 /* CREATION */ 60 + 61 + static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); 60 62 61 63 /** 62 64 * kvm_vgic_create: triggered by the instantiation of the VGIC device by ··· 111 109 112 110 if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) { 113 111 ret = -E2BIG; 112 + goto out_unlock; 113 + } 114 + 115 + kvm_for_each_vcpu(i, vcpu, kvm) { 116 + ret = vgic_allocate_private_irqs_locked(vcpu, type); 117 + if (ret) 118 + break; 119 + } 120 + 121 + if (ret) { 122 + kvm_for_each_vcpu(i, vcpu, kvm) { 123 + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 124 + kfree(vgic_cpu->private_irqs); 125 + vgic_cpu->private_irqs = NULL; 126 + } 127 + 114 128 goto out_unlock; 115 129 } 116 130 ··· 198 180 return 0; 199 181 } 200 182 201 - static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) 183 + static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) 202 184 { 203 185 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 204 186 int i; ··· 236 218 /* PPIs */ 237 219 irq->config = VGIC_CONFIG_LEVEL; 238 220 } 221 + 222 + switch (type) { 223 + case KVM_DEV_TYPE_ARM_VGIC_V3: 224 + irq->group = 1; 225 + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); 226 + break; 227 + case KVM_DEV_TYPE_ARM_VGIC_V2: 228 + irq->group = 0; 229 + irq->targets = BIT(vcpu->vcpu_id); 230 + break; 231 + } 239 232 } 240 233 241 234 return 0; 242 235 } 243 236 244 - static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu) 237 + static int vgic_allocate_private_irqs(struct 
kvm_vcpu *vcpu, u32 type) 245 238 { 246 239 int ret; 247 240 248 241 mutex_lock(&vcpu->kvm->arch.config_lock); 249 - ret = vgic_allocate_private_irqs_locked(vcpu); 242 + ret = vgic_allocate_private_irqs_locked(vcpu, type); 250 243 mutex_unlock(&vcpu->kvm->arch.config_lock); 251 244 252 245 return ret; ··· 287 258 if (!irqchip_in_kernel(vcpu->kvm)) 288 259 return 0; 289 260 290 - ret = vgic_allocate_private_irqs(vcpu); 261 + ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); 291 262 if (ret) 292 263 return ret; 293 264 ··· 324 295 { 325 296 struct vgic_dist *dist = &kvm->arch.vgic; 326 297 struct kvm_vcpu *vcpu; 327 - int ret = 0, i; 298 + int ret = 0; 328 299 unsigned long idx; 329 300 330 301 lockdep_assert_held(&kvm->arch.config_lock); ··· 343 314 ret = kvm_vgic_dist_init(kvm, dist->nr_spis); 344 315 if (ret) 345 316 goto out; 346 - 347 - /* Initialize groups on CPUs created before the VGIC type was known */ 348 - kvm_for_each_vcpu(idx, vcpu, kvm) { 349 - ret = vgic_allocate_private_irqs_locked(vcpu); 350 - if (ret) 351 - goto out; 352 - 353 - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { 354 - struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); 355 - 356 - switch (dist->vgic_model) { 357 - case KVM_DEV_TYPE_ARM_VGIC_V3: 358 - irq->group = 1; 359 - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); 360 - break; 361 - case KVM_DEV_TYPE_ARM_VGIC_V2: 362 - irq->group = 0; 363 - irq->targets = 1U << idx; 364 - break; 365 - default: 366 - ret = -EINVAL; 367 - } 368 - 369 - vgic_put_irq(kvm, irq); 370 - 371 - if (ret) 372 - goto out; 373 - } 374 - } 375 317 376 318 /* 377 319 * If we have GICv4.1 enabled, unconditionally request enable the