Merge tag 'kvmarm-fixes-for-5.1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into kvm-master

KVM/ARM fixes for 5.1

- Fix THP handling in the presence of pre-existing PTEs
- Honor request for PTE mappings even when THPs are available
- GICv4 performance improvement
- Take the srcu lock when writing to guest-controlled ITS data structures
- Reset the virtual PMU in preemptible context
- Various cleanups

Changed files: +133 -75
arch/arm/include/asm/kvm_mmu.h (+11)
···
         return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+                                       const void *data, unsigned long len)
+{
+        int srcu_idx = srcu_read_lock(&kvm->srcu);
+        int ret = kvm_write_guest(kvm, gpa, data, len);
+
+        srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+        return ret;
+}
+
 static inline void *kvm_get_hyp_vector(void)
 {
         switch(read_cpuid_part()) {
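For context: kvm_write_guest() resolves the guest address through the memslot array, which is protected by kvm->srcu, and the ITS save/restore paths touched in vgic-its.c below reach it from device ioctls without that lock held. A minimal sketch of the intended calling pattern follows; the caller name is hypothetical and not part of the patch:

/* Hypothetical caller: persist one guest-visible 8-byte table entry.
 * kvm_write_guest_lock() takes the kvm->srcu read lock around the
 * underlying memslot lookup, so the caller does not manage it itself.
 */
static int save_guest_table_entry(struct kvm *kvm, gpa_t gpa, u64 entry, int esz)
{
        u64 val = cpu_to_le64(entry);

        return kvm_write_guest_lock(kvm, gpa, &val, esz);
}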
arch/arm/include/asm/stage2_pgtable.h (+2)
···
 
 #define S2_PMD_MASK                             PMD_MASK
 #define S2_PMD_SIZE                             PMD_SIZE
+#define S2_PUD_MASK                             PUD_MASK
+#define S2_PUD_SIZE                             PUD_SIZE
 
 static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
 {
arch/arm64/include/asm/kvm_mmu.h (+11)
···
         return ret;
 }
 
+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+                                       const void *data, unsigned long len)
+{
+        int srcu_idx = srcu_read_lock(&kvm->srcu);
+        int ret = kvm_write_guest(kvm, gpa, data, len);
+
+        srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+        return ret;
+}
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
arch/arm64/kvm/reset.c (+3 -3)
···
         int ret = -EINVAL;
         bool loaded;
 
+        /* Reset PMU outside of the non-preemptible section */
+        kvm_pmu_vcpu_reset(vcpu);
+
         preempt_disable();
         loaded = (vcpu->cpu != -1);
         if (loaded)
···
 
                 vcpu->arch.reset_state.reset = false;
         }
-
-        /* Reset PMU */
-        kvm_pmu_vcpu_reset(vcpu);
 
         /* Default workaround setup is enabled (if supported) */
         if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
virt/kvm/arm/hyp/vgic-v3-sr.c (+2 -2)
···
                 }
         }
 
-        if (used_lrs) {
+        if (used_lrs || cpu_if->its_vpe.its_vm) {
                 int i;
                 u32 elrsr;
 
···
         u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
         int i;
 
-        if (used_lrs) {
+        if (used_lrs || cpu_if->its_vpe.its_vm) {
                 write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
 
                 for (i = 0; i < used_lrs; i++)
virt/kvm/arm/mmu.c (+73 -52)
···
  * @addr:       IPA
  * @pmd:        pmd pointer for IPA
  *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 {
···
  * @addr:       IPA
  * @pud:        pud pointer for IPA
  *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
 {
···
  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
  * @kvm:        The KVM struct pointer for the VM.
  *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
  *
  * Note we don't need locking here as this is only called when the VM is
  * created, which can only be done once.
···
 {
         pmd_t *pmd, old_pmd;
 
+retry:
         pmd = stage2_get_pmd(kvm, cache, addr);
         VM_BUG_ON(!pmd);
 
         old_pmd = *pmd;
+        /*
+         * Multiple vcpus faulting on the same PMD entry, can
+         * lead to them sequentially updating the PMD with the
+         * same value. Following the break-before-make
+         * (pmd_clear() followed by tlb_flush()) process can
+         * hinder forward progress due to refaults generated
+         * on missing translations.
+         *
+         * Skip updating the page table if the entry is
+         * unchanged.
+         */
+        if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+                return 0;
+
         if (pmd_present(old_pmd)) {
                 /*
-                 * Multiple vcpus faulting on the same PMD entry, can
-                 * lead to them sequentially updating the PMD with the
-                 * same value. Following the break-before-make
-                 * (pmd_clear() followed by tlb_flush()) process can
-                 * hinder forward progress due to refaults generated
-                 * on missing translations.
+                 * If we already have PTE level mapping for this block,
+                 * we must unmap it to avoid inconsistent TLB state and
+                 * leaking the table page. We could end up in this situation
+                 * if the memory slot was marked for dirty logging and was
+                 * reverted, leaving PTE level mappings for the pages accessed
+                 * during the period. So, unmap the PTE level mapping for this
+                 * block and retry, as we could have released the upper level
+                 * table in the process.
                  *
-                 * Skip updating the page table if the entry is
-                 * unchanged.
+                 * Normal THP split/merge follows mmu_notifier callbacks and do
+                 * get handled accordingly.
                  */
-                if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-                        return 0;
-
+                if (!pmd_thp_or_huge(old_pmd)) {
+                        unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+                        goto retry;
+                }
                 /*
                  * Mapping in huge pages should only happen through a
                  * fault. If a page is merged into a transparent huge
···
                  * should become splitting first, unmapped, merged,
                  * and mapped back in on-demand.
                  */
-                VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+                WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
                 pmd_clear(pmd);
                 kvm_tlb_flush_vmid_ipa(kvm, addr);
         } else {
···
 {
         pud_t *pudp, old_pud;
 
+retry:
         pudp = stage2_get_pud(kvm, cache, addr);
         VM_BUG_ON(!pudp);
 
···
 
         /*
          * A large number of vcpus faulting on the same stage 2 entry,
-         * can lead to a refault due to the
-         * stage2_pud_clear()/tlb_flush(). Skip updating the page
-         * tables if there is no change.
+         * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+         * Skip updating the page tables if there is no change.
          */
         if (pud_val(old_pud) == pud_val(*new_pudp))
                 return 0;
 
         if (stage2_pud_present(kvm, old_pud)) {
+                /*
+                 * If we already have table level mapping for this block, unmap
+                 * the range for this block and retry.
+                 */
+                if (!stage2_pud_huge(kvm, old_pud)) {
+                        unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+                        goto retry;
+                }
+
+                WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
                 stage2_pud_clear(kvm, pudp);
                 kvm_tlb_flush_vmid_ipa(kvm, addr);
         } else {
···
 }
 
 /**
-  * stage2_wp_puds - write protect PGD range
-  * @pgd:        pointer to pgd entry
-  * @addr:       range start address
-  * @end:        range end address
-  *
-  * Process PUD entries, for a huge PUD we cause a panic.
-  */
+ * stage2_wp_puds - write protect PGD range
+ * @pgd:        pointer to pgd entry
+ * @addr:       range start address
+ * @end:        range end address
+ */
 static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
                            phys_addr_t addr, phys_addr_t end)
 {
···
         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }
 
-static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
-                                               unsigned long hva)
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+                                               unsigned long hva,
+                                               unsigned long map_size)
 {
         gpa_t gpa_start;
         hva_t uaddr_start, uaddr_end;
···
 
         /*
          * Pages belonging to memslots that don't have the same alignment
-         * within a PMD for userspace and IPA cannot be mapped with stage-2
-         * PMD entries, because we'll end up mapping the wrong pages.
+         * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+         * PMD/PUD entries, because we'll end up mapping the wrong pages.
          *
          * Consider a layout like the following:
          *
          *    memslot->userspace_addr:
          *    +-----+--------------------+--------------------+---+
-         *    |abcde|fgh  Stage-1 PMD    |    Stage-1 PMD   tv|xyz|
+         *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
          *    +-----+--------------------+--------------------+---+
          *
          *    memslot->base_gfn << PAGE_SIZE:
          *      +---+--------------------+--------------------+-----+
-         *      |abc|def  Stage-2 PMD    |    Stage-2 PMD     |tvxyz|
+         *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
          *      +---+--------------------+--------------------+-----+
          *
-         * If we create those stage-2 PMDs, we'll end up with this incorrect
+         * If we create those stage-2 blocks, we'll end up with this incorrect
          * mapping:
          *   d -> f
          *   e -> g
          *   f -> h
          */
-        if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+        if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
                 return false;
 
         /*
          * Next, let's make sure we're not trying to map anything not covered
-         * by the memslot. This means we have to prohibit PMD size mappings
-         * for the beginning and end of a non-PMD aligned and non-PMD sized
+         * by the memslot. This means we have to prohibit block size mappings
+         * for the beginning and end of a non-block aligned and non-block sized
          * memory slot (illustrated by the head and tail parts of the
          * userspace view above containing pages 'abcde' and 'xyz',
          * respectively).
···
          * userspace_addr or the base_gfn, as both are equally aligned (per
          * the check above) and equally sized.
          */
-        return (hva & S2_PMD_MASK) >= uaddr_start &&
-               (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+        return (hva & ~(map_size - 1)) >= uaddr_start &&
+               (hva & ~(map_size - 1)) + map_size <= uaddr_end;
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
···
                 return -EFAULT;
         }
 
-        if (!fault_supports_stage2_pmd_mappings(memslot, hva))
-                force_pte = true;
-
-        if (logging_active)
-                force_pte = true;
-
         /* Let's check if we will get back a huge page backed by hugetlbfs */
         down_read(&current->mm->mmap_sem);
         vma = find_vma_intersection(current->mm, hva, hva + 1);
···
         }
 
         vma_pagesize = vma_kernel_pagesize(vma);
+        if (logging_active ||
+            !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+                force_pte = true;
+                vma_pagesize = PAGE_SIZE;
+        }
+
         /*
          * The stage2 has a minimum of 2 level table (For arm64 see
          * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
···
          * As for PUD huge maps, we must make sure that we have at least
          * 3 levels, i.e, PMD is not folded.
          */
-        if ((vma_pagesize == PMD_SIZE ||
-             (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
-            !force_pte) {
+        if (vma_pagesize == PMD_SIZE ||
+            (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
                 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-        }
         up_read(&current->mm->mmap_sem);
 
         /* We need minimum second+third level pages */
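As an aside on the generalized alignment check in fault_supports_stage2_huge_mapping() above: a memslot whose userspace address and IPA start at different offsets within a block can never be block mapped, only PTE mapped. The standalone sketch below illustrates this with made-up addresses and a helper name of our own; it is not part of the patch:

/* Illustration only: mirrors the
 * (gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))
 * test from the patch above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_2M   (2UL * 1024 * 1024)
#define SZ_4K   4096UL

static bool same_offset_in_block(uint64_t uaddr_start, uint64_t gpa_start,
                                 uint64_t map_size)
{
        return (gpa_start & (map_size - 1)) == (uaddr_start & (map_size - 1));
}

int main(void)
{
        uint64_t hva = 0x40001000;      /* memslot->userspace_addr, offset 0x1000 into a 2M block */
        uint64_t ipa = 0x80003000;      /* memslot->base_gfn << PAGE_SHIFT, offset 0x3000 into a 2M block */

        printf("2M block mapping possible: %s\n",
               same_offset_in_block(hva, ipa, SZ_2M) ? "yes" : "no");  /* no  */
        printf("4K page mapping possible:  %s\n",
               same_offset_in_block(hva, ipa, SZ_4K) ? "yes" : "no");  /* yes */
        return 0;
}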
virt/kvm/arm/vgic/vgic-its.c (+19 -12)
···
         u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
         phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
         int esz = GITS_BASER_ENTRY_SIZE(baser);
-        int index;
+        int index, idx;
         gfn_t gfn;
+        bool ret;
 
         switch (type) {
         case GITS_BASER_TYPE_DEVICE:
···
 
                 if (eaddr)
                         *eaddr = addr;
-                return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+                goto out;
         }
 
         /* calculate and check the index into the 1st level */
···
 
         if (eaddr)
                 *eaddr = indirect_ptr;
-        return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+out:
+        idx = srcu_read_lock(&its->dev->kvm->srcu);
+        ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+        srcu_read_unlock(&its->dev->kvm->srcu, idx);
+        return ret;
 }
 
 static int vgic_its_alloc_collection(struct vgic_its *its,
···
         kfree(its);
 }
 
-int vgic_its_has_attr_regs(struct kvm_device *dev,
-                           struct kvm_device_attr *attr)
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+                                  struct kvm_device_attr *attr)
 {
         const struct vgic_register_region *region;
         gpa_t offset = attr->attr;
···
         return 0;
 }
 
-int vgic_its_attr_regs_access(struct kvm_device *dev,
-                              struct kvm_device_attr *attr,
-                              u64 *reg, bool is_write)
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+                                     struct kvm_device_attr *attr,
+                                     u64 *reg, bool is_write)
 {
         const struct vgic_register_region *region;
         struct vgic_its *its;
···
                ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
                 ite->collection->collection_id;
         val = cpu_to_le64(val);
-        return kvm_write_guest(kvm, gpa, &val, ite_esz);
+        return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
 }
 
 /**
···
                (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
                 (dev->num_eventid_bits - 1));
         val = cpu_to_le64(val);
-        return kvm_write_guest(kvm, ptr, &val, dte_esz);
+        return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
 }
 
 /**
···
                ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
                collection->collection_id);
         val = cpu_to_le64(val);
-        return kvm_write_guest(its->dev->kvm, gpa, &val, esz);
+        return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
 }
 
 static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
···
          */
         val = 0;
         BUG_ON(cte_esz > sizeof(val));
-        ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz);
+        ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
         return ret;
 }
 
virt/kvm/arm/vgic/vgic-v3.c (+2 -2)
···
                 if (status) {
                         /* clear consumed data */
                         val &= ~(1 << bit_nr);
-                        ret = kvm_write_guest(kvm, ptr, &val, 1);
+                        ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
                         if (ret)
                                 return ret;
                 }
···
                 else
                         val &= ~(1 << bit_nr);
 
-                ret = kvm_write_guest(kvm, ptr, &val, 1);
+                ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
                 if (ret)
                         return ret;
         }
virt/kvm/arm/vgic/vgic.c (+10 -4)
···
          * either observe the new interrupt before or after doing this check,
          * and introducing additional synchronization mechanism doesn't change
          * this.
+         *
+         * Note that we still need to go through the whole thing if anything
+         * can be directly injected (GICv4).
          */
-        if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+        if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+            !vgic_supports_direct_msis(vcpu->kvm))
                 return;
 
         DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
 
-        raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
-        vgic_flush_lr_state(vcpu);
-        raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+        if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+                raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+                vgic_flush_lr_state(vcpu);
+                raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+        }
 
         if (can_access_vgic_from_kernel())
                 vgic_restore_state(vcpu);