Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"RISC-V:

- Fix VM hang in case of timer delta being zero

ARM:

- MMU fixes:

- Read the MMU notifier seq before dropping the mmap lock to guard
against reading a potentially stale VMA

- Disable interrupts when walking user page tables to protect
against the page table being freed

- Read the MTE permissions for the VMA within the mmap lock
critical section, avoiding the use of a potentially stale VMA
pointer

- vPMU fixes:

- Return the sum of the current perf event value and PMC snapshot
for reads from userspace

- Don't save the value of guest writes to PMCR_EL0.{C,P}, which
could otherwise lead to userspace erroneously resetting the vPMU
during VM save/restore"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
riscv/kvm: Fix VM hang in case of timer delta being zero.
KVM: arm64: Check for kvm_vma_mte_allowed in the critical section
KVM: arm64: Disable interrupts while walking userspace PTs
KVM: arm64: Retry fault if vma_lookup() results become invalid
KVM: arm64: PMU: Don't save PMCR_EL0.{C,P} for the vCPU
KVM: arm64: PMU: Fix GET_ONE_REG for vPMC regs to return the current value

+88 -43
+65 -36
arch/arm64/kvm/mmu.c
··· 666 666 CONFIG_PGTABLE_LEVELS), 667 667 .mm_ops = &kvm_user_mm_ops, 668 668 }; 669 + unsigned long flags; 669 670 kvm_pte_t pte = 0; /* Keep GCC quiet... */ 670 671 u32 level = ~0; 671 672 int ret; 672 673 674 + /* 675 + * Disable IRQs so that we hazard against a concurrent 676 + * teardown of the userspace page tables (which relies on 677 + * IPI-ing threads). 678 + */ 679 + local_irq_save(flags); 673 680 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); 674 - VM_BUG_ON(ret); 675 - VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); 676 - VM_BUG_ON(!(pte & PTE_VALID)); 681 + local_irq_restore(flags); 682 + 683 + if (ret) 684 + return ret; 685 + 686 + /* 687 + * Not seeing an error, but not updating level? Something went 688 + * deeply wrong... 689 + */ 690 + if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) 691 + return -EFAULT; 692 + 693 + /* Oops, the userspace PTs are gone... Replay the fault */ 694 + if (!kvm_pte_valid(pte)) 695 + return -EAGAIN; 677 696 678 697 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); 679 698 } ··· 1098 1079 * 1099 1080 * Returns the size of the mapping. 1100 1081 */ 1101 - static unsigned long 1082 + static long 1102 1083 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1103 1084 unsigned long hva, kvm_pfn_t *pfnp, 1104 1085 phys_addr_t *ipap) ··· 1110 1091 * sure that the HVA and IPA are sufficiently aligned and that the 1111 1092 * block map is contained within the memslot. 1112 1093 */ 1113 - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && 1114 - get_user_mapping_size(kvm, hva) >= PMD_SIZE) { 1094 + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { 1095 + int sz = get_user_mapping_size(kvm, hva); 1096 + 1097 + if (sz < 0) 1098 + return sz; 1099 + 1100 + if (sz < PMD_SIZE) 1101 + return PAGE_SIZE; 1102 + 1115 1103 /* 1116 1104 * The address we faulted on is backed by a transparent huge 1117 1105 * page. 
However, because we map the compound huge page and ··· 1218 1192 { 1219 1193 int ret = 0; 1220 1194 bool write_fault, writable, force_pte = false; 1221 - bool exec_fault; 1195 + bool exec_fault, mte_allowed; 1222 1196 bool device = false; 1223 1197 unsigned long mmu_seq; 1224 1198 struct kvm *kvm = vcpu->kvm; ··· 1229 1203 kvm_pfn_t pfn; 1230 1204 bool logging_active = memslot_is_logging(memslot); 1231 1205 unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); 1232 - unsigned long vma_pagesize, fault_granule; 1206 + long vma_pagesize, fault_granule; 1233 1207 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1234 1208 struct kvm_pgtable *pgt; 1235 1209 ··· 1241 1215 if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { 1242 1216 kvm_err("Unexpected L2 read permission error\n"); 1243 1217 return -EFAULT; 1218 + } 1219 + 1220 + /* 1221 + * Permission faults just need to update the existing leaf entry, 1222 + * and so normally don't require allocations from the memcache. The 1223 + * only exception to this is when dirty logging is enabled at runtime 1224 + * and a write fault needs to collapse a block entry into a table. 1225 + */ 1226 + if (fault_status != ESR_ELx_FSC_PERM || 1227 + (logging_active && write_fault)) { 1228 + ret = kvm_mmu_topup_memory_cache(memcache, 1229 + kvm_mmu_cache_min_pages(kvm)); 1230 + if (ret) 1231 + return ret; 1244 1232 } 1245 1233 1246 1234 /* ··· 1309 1269 fault_ipa &= ~(vma_pagesize - 1); 1310 1270 1311 1271 gfn = fault_ipa >> PAGE_SHIFT; 1312 - mmap_read_unlock(current->mm); 1272 + mte_allowed = kvm_vma_mte_allowed(vma); 1273 + 1274 + /* Don't use the VMA after the unlock -- it may have vanished */ 1275 + vma = NULL; 1313 1276 1314 1277 /* 1315 - * Permission faults just need to update the existing leaf entry, 1316 - * and so normally don't require allocations from the memcache. 
The 1317 - * only exception to this is when dirty logging is enabled at runtime 1318 - * and a write fault needs to collapse a block entry into a table. 1319 - */ 1320 - if (fault_status != ESR_ELx_FSC_PERM || 1321 - (logging_active && write_fault)) { 1322 - ret = kvm_mmu_topup_memory_cache(memcache, 1323 - kvm_mmu_cache_min_pages(kvm)); 1324 - if (ret) 1325 - return ret; 1326 - } 1327 - 1328 - mmu_seq = vcpu->kvm->mmu_invalidate_seq; 1329 - /* 1330 - * Ensure the read of mmu_invalidate_seq happens before we call 1331 - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk 1332 - * the page we just got a reference to gets unmapped before we have a 1333 - * chance to grab the mmu_lock, which ensure that if the page gets 1334 - * unmapped afterwards, the call to kvm_unmap_gfn will take it away 1335 - * from us again properly. This smp_rmb() interacts with the smp_wmb() 1336 - * in kvm_mmu_notifier_invalidate_<page|range_end>. 1278 + * Read mmu_invalidate_seq so that KVM can detect if the results of 1279 + * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to 1280 + * acquiring kvm->mmu_lock. 1337 1281 * 1338 - * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is 1339 - * used to avoid unnecessary overhead introduced to locate the memory 1340 - * slot because it's always fixed even @gfn is adjusted for huge pages. 1282 + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1283 + * with the smp_wmb() in kvm_mmu_invalidate_end(). 
1341 1284 */ 1342 - smp_rmb(); 1285 + mmu_seq = vcpu->kvm->mmu_invalidate_seq; 1286 + mmap_read_unlock(current->mm); 1343 1287 1344 1288 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, 1345 1289 write_fault, &writable, NULL); ··· 1374 1350 vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1375 1351 hva, &pfn, 1376 1352 &fault_ipa); 1353 + 1354 + if (vma_pagesize < 0) { 1355 + ret = vma_pagesize; 1356 + goto out_unlock; 1357 + } 1377 1358 } 1378 1359 1379 1360 if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { 1380 1361 /* Check the VMM hasn't introduced a new disallowed VMA */ 1381 - if (kvm_vma_mte_allowed(vma)) { 1362 + if (mte_allowed) { 1382 1363 sanitise_mte_tags(kvm, pfn, vma_pagesize); 1383 1364 } else { 1384 1365 ret = -EFAULT;
+2 -1
arch/arm64/kvm/pmu-emul.c
··· 538 538 if (!kvm_pmu_is_3p5(vcpu)) 539 539 val &= ~ARMV8_PMU_PMCR_LP; 540 540 541 - __vcpu_sys_reg(vcpu, PMCR_EL0) = val; 541 + /* The reset bits don't indicate any state, and shouldn't be saved. */ 542 + __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); 542 543 543 544 if (val & ARMV8_PMU_PMCR_E) { 544 545 kvm_pmu_enable_counter_mask(vcpu,
+19 -2
arch/arm64/kvm/sys_regs.c
··· 856 856 return true; 857 857 } 858 858 859 + static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, 860 + u64 *val) 861 + { 862 + u64 idx; 863 + 864 + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) 865 + /* PMCCNTR_EL0 */ 866 + idx = ARMV8_PMU_CYCLE_IDX; 867 + else 868 + /* PMEVCNTRn_EL0 */ 869 + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); 870 + 871 + *val = kvm_pmu_get_counter_value(vcpu, idx); 872 + return 0; 873 + } 874 + 859 875 static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, 860 876 struct sys_reg_params *p, 861 877 const struct sys_reg_desc *r) ··· 1088 1072 /* Macro to expand the PMEVCNTRn_EL0 register */ 1089 1073 #define PMU_PMEVCNTR_EL0(n) \ 1090 1074 { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ 1091 - .reset = reset_pmevcntr, \ 1075 + .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ 1092 1076 .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } 1093 1077 1094 1078 /* Macro to expand the PMEVTYPERn_EL0 register */ ··· 1998 1982 { PMU_SYS_REG(SYS_PMCEID1_EL0), 1999 1983 .access = access_pmceid, .reset = NULL }, 2000 1984 { PMU_SYS_REG(SYS_PMCCNTR_EL0), 2001 - .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, 1985 + .access = access_pmu_evcntr, .reset = reset_unknown, 1986 + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr}, 2002 1987 { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), 2003 1988 .access = access_pmu_evtyper, .reset = NULL }, 2004 1989 { PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
+2 -4
arch/riscv/kvm/vcpu_timer.c
··· 147 147 return; 148 148 149 149 delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t); 150 - if (delta_ns) { 151 - hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); 152 - t->next_set = true; 153 - } 150 + hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); 151 + t->next_set = true; 154 152 } 155 153 156 154 static void kvm_riscv_vcpu_timer_unblocking(struct kvm_vcpu *vcpu)