Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"ARM:

- Fix TCR_EL2 configuration to not use the ASID in TTBR1_EL2 and not
mess up T1SZ/PS by using the HCR_EL2.E2H==0 layout.

- Bring back the VMID allocation to the vcpu_load phase, ensuring
that we only set up VTTBR_EL2 once on VHE. This cures an ugly race
that would lead to running with an unallocated VMID.

RISC-V:

- Fix hart status check in SBI HSM extension

- Fix hart suspend_type usage in SBI HSM extension

- Fix error returned by SBI IPI and TIME extensions for unsupported
function IDs

- Fix suspend_type usage in SBI SUSP extension

- Remove unnecessary vcpu kick after injecting interrupt via IMSIC
guest file

x86:

- Fix an nVMX bug where KVM fails to detect that, after nested
VM-Exit, L1 has a pending IRQ (or NMI).

- To avoid freeing the PIC while vCPUs are still around, which would
cause a NULL pointer access with the previous patch, destroy vCPUs
before any VM-level destruction.

- Handle failures to create vhost_tasks"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
kvm: retry nx_huge_page_recovery_thread creation
vhost: return task creation error instead of NULL
KVM: nVMX: Process events on nested VM-Exit if injectable IRQ or NMI is pending
KVM: x86: Free vCPUs before freeing VM state
riscv: KVM: Remove unnecessary vcpu kick
KVM: arm64: Ensure a VMID is allocated before programming VTTBR_EL2
KVM: arm64: Fix tcr_el2 initialisation in hVHE mode
riscv: KVM: Fix SBI sleep_type use
riscv: KVM: Fix SBI TIME error generation
riscv: KVM: Fix SBI IPI error generation
riscv: KVM: Fix hart suspend_type use
riscv: KVM: Fix hart suspend status check

+94 -62
+1 -1
arch/arm64/include/asm/kvm_arm.h
···
 #define TCR_EL2_IRGN0_MASK	TCR_IRGN0_MASK
 #define TCR_EL2_T0SZ_MASK	0x3f
 #define TCR_EL2_MASK	(TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \
-			 TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK)
+			 TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK)

 /* VTCR_EL2 Registers bits */
 #define VTCR_EL2_DS	TCR_EL2_DS
+1 -1
arch/arm64/include/asm/kvm_host.h
···
 extern unsigned int __ro_after_init kvm_arm_vmid_bits;
 int __init kvm_arm_vmid_alloc_init(void);
 void __init kvm_arm_vmid_alloc_free(void);
-bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
+void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
 void kvm_arm_vmid_clear_active(void);

 static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
+17 -20
arch/arm64/kvm/arm.c
···
 	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);

 	/*
+	 * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2,
+	 * which happens eagerly in VHE.
+	 *
+	 * Also, the VMID allocator only preserves VMIDs that are active at the
+	 * time of rollover, so KVM might need to grab a new VMID for the MMU if
+	 * this is called from kvm_sched_in().
+	 */
+	kvm_arm_vmid_update(&mmu->vmid);
+
+	/*
 	 * We guarantee that both TLBs and I-cache are private to each
 	 * vcpu. If detecting that a vcpu from the same VM has
 	 * previously run on the same physical CPU, call into the
···
 		 * non-preemptible context.
 		 */
 		preempt_disable();
-
-		/*
-		 * The VMID allocator only tracks active VMIDs per
-		 * physical CPU, and therefore the VMID allocated may not be
-		 * preserved on VMID roll-over if the task was preempted,
-		 * making a thread's VMID inactive. So we need to call
-		 * kvm_arm_vmid_update() in non-premptible context.
-		 */
-		if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) &&
-		    has_vhe())
-			__load_stage2(vcpu->arch.hw_mmu,
-				      vcpu->arch.hw_mmu->arch);

 		kvm_pmu_flush_hwstate(vcpu);

···
 static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
 {
 	struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
-	unsigned long tcr, ips;
+	unsigned long tcr;

 	/*
 	 * Calculate the raw per-cpu offset without a translation from the
···
 	params->mair_el2 = read_sysreg(mair_el1);

 	tcr = read_sysreg(tcr_el1);
-	ips = FIELD_GET(TCR_IPS_MASK, tcr);
 	if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
+		tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK);
 		tcr |= TCR_EPD1_MASK;
 	} else {
+		unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr);
+
 		tcr &= TCR_EL2_MASK;
-		tcr |= TCR_EL2_RES1;
+		tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips);
+		if (lpa2_is_enabled())
+			tcr |= TCR_EL2_DS;
 	}
-	tcr &= ~TCR_T0SZ_MASK;
 	tcr |= TCR_T0SZ(hyp_va_bits);
-	tcr &= ~TCR_EL2_PS_MASK;
-	tcr |= FIELD_PREP(TCR_EL2_PS_MASK, ips);
-	if (lpa2_is_enabled())
-		tcr |= TCR_EL2_DS;
 	params->tcr_el2 = tcr;

 	params->pgd_pa = kvm_mmu_get_httbr();
+3 -8
arch/arm64/kvm/vmid.c
···
 	atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID);
 }

-bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
+void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
 {
 	unsigned long flags;
 	u64 vmid, old_active_vmid;
-	bool updated = false;

 	vmid = atomic64_read(&kvm_vmid->id);

···
 	if (old_active_vmid != 0 && vmid_gen_match(vmid) &&
 	    0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids),
 					  old_active_vmid, vmid))
-		return false;
+		return;

 	raw_spin_lock_irqsave(&cpu_vmid_lock, flags);

 	/* Check that our VMID belongs to the current generation. */
 	vmid = atomic64_read(&kvm_vmid->id);
-	if (!vmid_gen_match(vmid)) {
+	if (!vmid_gen_match(vmid))
 		vmid = new_vmid(kvm_vmid);
-		updated = true;
-	}

 	atomic64_set(this_cpu_ptr(&active_vmids), vmid);
 	raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags);
-
-	return updated;
 }

 /*
-1
arch/riscv/kvm/aia_imsic.c
···

 	if (imsic->vsfile_cpu >= 0) {
 		writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE);
-		kvm_vcpu_kick(vcpu);
 	} else {
 		eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)];
 		set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip);
+6 -5
arch/riscv/kvm/vcpu_sbi_hsm.c
···
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <linux/wordpart.h>
 #include <asm/sbi.h>
 #include <asm/kvm_vcpu_sbi.h>

···
 	target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid);
 	if (!target_vcpu)
 		return SBI_ERR_INVALID_PARAM;
-	if (!kvm_riscv_vcpu_stopped(target_vcpu))
-		return SBI_HSM_STATE_STARTED;
-	else if (vcpu->stat.generic.blocking)
+	if (kvm_riscv_vcpu_stopped(target_vcpu))
+		return SBI_HSM_STATE_STOPPED;
+	else if (target_vcpu->stat.generic.blocking)
 		return SBI_HSM_STATE_SUSPENDED;
 	else
-		return SBI_HSM_STATE_STOPPED;
+		return SBI_HSM_STATE_STARTED;
 }

 static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
···
 		}
 		return 0;
 	case SBI_EXT_HSM_HART_SUSPEND:
-		switch (cp->a0) {
+		switch (lower_32_bits(cp->a0)) {
 		case SBI_HSM_SUSPEND_RET_DEFAULT:
 			kvm_riscv_vcpu_wfi(vcpu);
 			break;
+12 -3
arch/riscv/kvm/vcpu_sbi_replace.c
···
 	u64 next_cycle;

 	if (cp->a6 != SBI_EXT_TIME_SET_TIMER) {
-		retdata->err_val = SBI_ERR_INVALID_PARAM;
+		retdata->err_val = SBI_ERR_NOT_SUPPORTED;
 		return 0;
 	}

···
 	struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
 	unsigned long hmask = cp->a0;
 	unsigned long hbase = cp->a1;
+	unsigned long hart_bit = 0, sentmask = 0;

 	if (cp->a6 != SBI_EXT_IPI_SEND_IPI) {
-		retdata->err_val = SBI_ERR_INVALID_PARAM;
+		retdata->err_val = SBI_ERR_NOT_SUPPORTED;
 		return 0;
 	}

···
 		if (hbase != -1UL) {
 			if (tmp->vcpu_id < hbase)
 				continue;
-			if (!(hmask & (1UL << (tmp->vcpu_id - hbase))))
+			hart_bit = tmp->vcpu_id - hbase;
+			if (hart_bit >= __riscv_xlen)
+				goto done;
+			if (!(hmask & (1UL << hart_bit)))
 				continue;
 		}
 		ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT);
 		if (ret < 0)
 			break;
+		sentmask |= 1UL << hart_bit;
 		kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD);
 	}
+
+done:
+	if (hbase != -1UL && (hmask ^ sentmask))
+		retdata->err_val = SBI_ERR_INVALID_PARAM;

 	return ret;
 }
+2 -1
arch/riscv/kvm/vcpu_sbi_system.c
···
  */

 #include <linux/kvm_host.h>
+#include <linux/wordpart.h>

 #include <asm/kvm_vcpu_sbi.h>
 #include <asm/sbi.h>
···

 	switch (funcid) {
 	case SBI_EXT_SUSP_SYSTEM_SUSPEND:
-		if (cp->a0 != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) {
+		if (lower_32_bits(cp->a0) != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) {
 			retdata->err_val = SBI_ERR_INVALID_PARAM;
 			return 0;
 		}
+5 -7
arch/x86/kvm/mmu/mmu.c
···
 	return true;
 }

-static void kvm_mmu_start_lpage_recovery(struct once *once)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
 {
 	struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
 	struct kvm *kvm = container_of(ka, struct kvm, arch);
···
 				      kvm_nx_huge_page_recovery_worker_kill,
 				      kvm, "kvm-nx-lpage-recovery");

-	if (!nx_thread)
-		return;
+	if (IS_ERR(nx_thread))
+		return PTR_ERR(nx_thread);

 	vhost_task_start(nx_thread);

 	/* Make the task visible only once it is fully started. */
 	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+	return 0;
 }

 int kvm_mmu_post_init_vm(struct kvm *kvm)
···
 	if (nx_hugepage_mitigation_hard_disabled)
 		return 0;

-	call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
-	if (!kvm->arch.nx_huge_page_recovery_thread)
-		return -ENOMEM;
-	return 0;
+	return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
 }

 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+11
arch/x86/kvm/vmx/nested.c
···

 		load_vmcs12_host_state(vcpu, vmcs12);

+		/*
+		 * Process events if an injectable IRQ or NMI is pending, even
+		 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
+		 * If an event became pending while L2 was active, KVM needs to
+		 * either inject the event or request an IRQ/NMI window. SMIs
+		 * don't need to be processed as SMM is mutually exclusive with
+		 * non-root mode. INIT/SIPI don't need to be checked as INIT
+		 * is blocked post-VMXON, and SIPIs are ignored.
+		 */
+		if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
 		return;
 	}

+1 -1
arch/x86/kvm/x86.c
···
 		mutex_unlock(&kvm->slots_lock);
 	}
 	kvm_unload_vcpu_mmus(kvm);
+	kvm_destroy_vcpus(kvm);
 	kvm_x86_call(vm_destroy)(kvm);
 	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
 	kvm_pic_destroy(kvm);
 	kvm_ioapic_destroy(kvm);
-	kvm_destroy_vcpus(kvm);
 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
 	kvm_mmu_uninit_vm(kvm);
+1 -1
drivers/vhost/vhost.c
···

 	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
 				 worker, name);
-	if (!vtsk)
+	if (IS_ERR(vtsk))
 		goto free_worker;

 	mutex_init(&worker->mutex);
+32 -11
include/linux/call_once.h
···
 		__once_init((once), #once, &__key);			\
 } while (0)

-static inline void call_once(struct once *once, void (*cb)(struct once *))
+/*
+ * call_once - Ensure a function has been called exactly once
+ *
+ * @once: Tracking struct
+ * @cb: Function to be called
+ *
+ * If @once has never completed successfully before, call @cb and, if
+ * it returns a zero or positive value, mark @once as completed. Return
+ * the value returned by @cb
+ *
+ * If @once has completed succesfully before, return 0.
+ *
+ * The call to @cb is implicitly surrounded by a mutex, though for
+ * efficiency the * function avoids taking it after the first call.
+ */
+static inline int call_once(struct once *once, int (*cb)(struct once *))
 {
-	/* Pairs with atomic_set_release() below. */
-	if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
-		return;
+	int r, state;

-	guard(mutex)(&once->lock);
-	WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
-	if (atomic_read(&once->state) != ONCE_NOT_STARTED)
-		return;
+	/* Pairs with atomic_set_release() below. */
+	if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
+		return 0;

-	atomic_set(&once->state, ONCE_RUNNING);
-	cb(once);
-	atomic_set_release(&once->state, ONCE_COMPLETED);
+	guard(mutex)(&once->lock);
+	state = atomic_read(&once->state);
+	if (unlikely(state != ONCE_NOT_STARTED))
+		return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0;
+
+	atomic_set(&once->state, ONCE_RUNNING);
+	r = cb(once);
+	if (r < 0)
+		atomic_set(&once->state, ONCE_NOT_STARTED);
+	else
+		atomic_set_release(&once->state, ONCE_COMPLETED);
+	return r;
 }

 #endif /* _LINUX_CALL_ONCE_H */
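The practical effect of the reworked call_once() is what the mmu.c hunk above relies on: the callback may now fail, and a failure puts the once back into its initial state so the next caller retries instead of the VM being stuck half-initialized. A minimal sketch of a caller under that contract (my_dev, my_backend_init() and my_dev_open() are illustrative names, not part of this pull):

#include <linux/call_once.h>
#include <linux/slab.h>

/* Hypothetical device; name and layout are illustrative only. */
struct my_dev {
	struct once init_once;
	void *buf;
};

static int my_backend_init(struct once *once)
{
	struct my_dev *dev = container_of(once, struct my_dev, init_once);

	dev->buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!dev->buf)
		return -ENOMEM;	/* once reverts to ONCE_NOT_STARTED; a later call retries */

	return 0;		/* a non-negative return marks the once ONCE_COMPLETED */
}

static int my_dev_open(struct my_dev *dev)
{
	/* The first successful call runs my_backend_init(); later calls return 0 right away. */
	return call_once(&dev->init_once, my_backend_init);
}

This mirrors kvm_mmu_post_init_vm() above, which now simply returns whatever call_once() reports.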
+2 -2
kernel/vhost_task.c
··· 133 133 134 134 vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); 135 135 if (!vtsk) 136 - return NULL; 136 + return ERR_PTR(-ENOMEM); 137 137 init_completion(&vtsk->exited); 138 138 mutex_init(&vtsk->exit_mutex); 139 139 vtsk->data = arg; ··· 145 145 tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); 146 146 if (IS_ERR(tsk)) { 147 147 kfree(vtsk); 148 - return NULL; 148 + return ERR_PTR(PTR_ERR(tsk)); 149 149 } 150 150 151 151 vtsk->task = tsk;
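For reference, the callers touched in this pull (vhost.c and kvm/mmu/mmu.c above) now follow the standard <linux/err.h> convention that vhost_task_create() switches to: the errno is encoded into the returned pointer with ERR_PTR(), tested with IS_ERR() and recovered with PTR_ERR(). A minimal sketch of a caller written against that convention (the context struct and callbacks are illustrative, not from this pull):

#include <linux/err.h>
#include <linux/sched/vhost_task.h>

/* Hypothetical worker context and callbacks; names are illustrative only. */
struct my_ctx {
	struct vhost_task *vtsk;
};

static bool my_work_fn(void *data)
{
	return false;	/* no work to do in this sketch */
}

static void my_killed_fn(void *data)
{
}

static int start_my_worker(struct my_ctx *ctx)
{
	struct vhost_task *vtsk;

	vtsk = vhost_task_create(my_work_fn, my_killed_fn, ctx, "my-worker");
	if (IS_ERR(vtsk))
		return PTR_ERR(vtsk);	/* -ENOMEM, or whatever copy_process() returned */

	vhost_task_start(vtsk);
	ctx->vtsk = vtsk;
	return 0;
}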