arch/arm/include/asm/kvm_mmu.h (+11)
···
 	return ret;
 }

+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+				       const void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_write_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 static inline void *kvm_get_hyp_vector(void)
 {
 	switch(read_cpuid_part()) {
arch/arm/include/asm/stage2_pgtable.h (+2)
arch/arm64/include/asm/kvm_mmu.h (+11)
···
 	return ret;
 }

+static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
+				       const void *data, unsigned long len)
+{
+	int srcu_idx = srcu_read_lock(&kvm->srcu);
+	int ret = kvm_write_guest(kvm, gpa, data, len);
+
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	return ret;
+}
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
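
Note on the two kvm_mmu.h hunks above: kvm_write_guest() resolves the gpa through the memslot array, which on arm/arm64 is protected by kvm->srcu, so a caller that does not already hold the SRCU read lock needs a wrapper like kvm_write_guest_lock(). A minimal illustrative caller (the function name is hypothetical; the real call sites are the vgic-its and vgic-v3 hunks further down):

	/*
	 * Hypothetical caller: persist one 64-bit table entry to guest RAM.
	 * kvm_write_guest_lock() enters an SRCU read-side critical section,
	 * performs the write via kvm_write_guest(), then drops the lock.
	 */
	static int save_table_entry(struct kvm *kvm, gpa_t gpa, u64 entry)
	{
		u64 val = cpu_to_le64(entry);

		return kvm_write_guest_lock(kvm, gpa, &val, sizeof(val));
	}
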
arch/arm64/kvm/reset.c (+3 -3)
···
 	int ret = -EINVAL;
 	bool loaded;

+	/* Reset PMU outside of the non-preemptible section */
+	kvm_pmu_vcpu_reset(vcpu);
+
 	preempt_disable();
 	loaded = (vcpu->cpu != -1);
 	if (loaded)
···

 		vcpu->arch.reset_state.reset = false;
 	}
-
-	/* Reset PMU */
-	kvm_pmu_vcpu_reset(vcpu);

 	/* Default workaround setup is enabled (if supported) */
 	if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
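
The reset.c hunks only reorder work: kvm_pmu_vcpu_reset() now runs before preempt_disable(), presumably because the PMU reset path can end up releasing perf events and therefore may sleep, which is not allowed in a non-preemptible section. The intended shape, heavily abridged (error handling and the actual register reset omitted):

	int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
	{
		/* May sleep: must happen before the non-preemptible section. */
		kvm_pmu_vcpu_reset(vcpu);

		preempt_disable();
		/* ... vcpu state save, sysreg/timer reset, state restore ... */
		preempt_enable();

		return 0;
	}
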
virt/kvm/arm/hyp/vgic-v3-sr.c (+2 -2)
···
 		}
 	}

-	if (used_lrs) {
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
 		int i;
 		u32 elrsr;

···
 	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
 	int i;

-	if (used_lrs) {
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
 		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);

 		for (i = 0; i < used_lrs; i++)
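
Both vgic-v3-sr.c hunks widen the same predicate: the LR/ICH state save and restore paths must also run when the VM has a GICv4 vPE (cpu_if->its_vpe.its_vm is set), so that ICH_HCR_EL2 is still programmed for direct injection even when no list registers are in use. If the condition grows further it could be factored out; a hypothetical helper, not part of the patch:

	/*
	 * True when the hyp save/restore code has to touch the ICH_* state:
	 * either list registers are in use, or this VM has a GICv4 vPE mapped.
	 */
	static inline bool vgic_v3_needs_hw_access(struct vgic_v3_cpu_if *cpu_if,
						   u64 used_lrs)
	{
		return used_lrs || cpu_if->its_vpe.its_vm;
	}
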
virt/kvm/arm/mmu.c (+73 -52)
···
  * @addr:	IPA
  * @pmd:	pmd pointer for IPA
  *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 {
···
  * @addr:	IPA
  * @pud:	pud pointer for IPA
  *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
  */
 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
 {
···
  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
  * @kvm:	The KVM struct pointer for the VM.
  *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
  *
  * Note we don't need locking here as this is only called when the VM is
  * created, which can only be done once.
···
 {
 	pmd_t *pmd, old_pmd;

+retry:
 	pmd = stage2_get_pmd(kvm, cache, addr);
 	VM_BUG_ON(!pmd);

 	old_pmd = *pmd;
+	/*
+	 * Multiple vcpus faulting on the same PMD entry, can
+	 * lead to them sequentially updating the PMD with the
+	 * same value. Following the break-before-make
+	 * (pmd_clear() followed by tlb_flush()) process can
+	 * hinder forward progress due to refaults generated
+	 * on missing translations.
+	 *
+	 * Skip updating the page table if the entry is
+	 * unchanged.
+	 */
+	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+		return 0;
+
 	if (pmd_present(old_pmd)) {
 		/*
-		 * Multiple vcpus faulting on the same PMD entry, can
-		 * lead to them sequentially updating the PMD with the
-		 * same value. Following the break-before-make
-		 * (pmd_clear() followed by tlb_flush()) process can
-		 * hinder forward progress due to refaults generated
-		 * on missing translations.
+		 * If we already have PTE level mapping for this block,
+		 * we must unmap it to avoid inconsistent TLB state and
+		 * leaking the table page. We could end up in this situation
+		 * if the memory slot was marked for dirty logging and was
+		 * reverted, leaving PTE level mappings for the pages accessed
+		 * during the period. So, unmap the PTE level mapping for this
+		 * block and retry, as we could have released the upper level
+		 * table in the process.
 		 *
-		 * Skip updating the page table if the entry is
-		 * unchanged.
+		 * Normal THP split/merge follows mmu_notifier callbacks and do
+		 * get handled accordingly.
 		 */
-		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-			return 0;
-
+		if (!pmd_thp_or_huge(old_pmd)) {
+			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+			goto retry;
+		}
 		/*
 		 * Mapping in huge pages should only happen through a
 		 * fault.  If a page is merged into a transparent huge
···
 		 * should become splitting first, unmapped, merged,
 		 * and mapped back in on-demand.
 		 */
-		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
 		pmd_clear(pmd);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
···
 {
 	pud_t *pudp, old_pud;

+retry:
 	pudp = stage2_get_pud(kvm, cache, addr);
 	VM_BUG_ON(!pudp);

···

 	/*
 	 * A large number of vcpus faulting on the same stage 2 entry,
-	 * can lead to a refault due to the
-	 * stage2_pud_clear()/tlb_flush(). Skip updating the page
-	 * tables if there is no change.
+	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+	 * Skip updating the page tables if there is no change.
 	 */
 	if (pud_val(old_pud) == pud_val(*new_pudp))
 		return 0;

 	if (stage2_pud_present(kvm, old_pud)) {
+		/*
+		 * If we already have table level mapping for this block, unmap
+		 * the range for this block and retry.
+		 */
+		if (!stage2_pud_huge(kvm, old_pud)) {
+			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+			goto retry;
+		}
+
+		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
 		stage2_pud_clear(kvm, pudp);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
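
The stage2_set_pmd_huge() and stage2_set_pud_huge() hunks above apply the same fix: if the entry being replaced by a block mapping is present but is not itself a block (a table left behind when dirty logging was enabled and then reverted), the range is unmapped and the walk retried, because unmap_stage2_range() may free intermediate table pages. Condensed to its control flow (PUD naming, not the complete function):

	retry:
		pudp = stage2_get_pud(kvm, cache, addr);
		old_pud = *pudp;

		/* Nothing to do if another vcpu already installed the same entry. */
		if (pud_val(old_pud) == pud_val(*new_pudp))
			return 0;

		if (stage2_pud_present(kvm, old_pud)) {
			if (!stage2_pud_huge(kvm, old_pud)) {
				/* Stale table-level mapping: unmap it and restart the walk. */
				unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
				goto retry;
			}
			stage2_pud_clear(kvm, pudp);		/* break ... */
			kvm_tlb_flush_vmid_ipa(kvm, addr);
		}
		/* ... before make: the new block entry is installed after this point. */
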
···
 }

 /**
- * stage2_wp_puds - write protect PGD range
- * @pgd:	pointer to pgd entry
- * @addr:	range start address
- * @end:	range end address
- *
- * Process PUD entries, for a huge PUD we cause a panic.
- */
+ * stage2_wp_puds - write protect PGD range
+ * @pgd:	pointer to pgd entry
+ * @addr:	range start address
+ * @end:	range end address
+ */
 static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
 			   phys_addr_t addr, phys_addr_t end)
 {
···
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }

-static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
-					       unsigned long hva)
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+					       unsigned long hva,
+					       unsigned long map_size)
 {
 	gpa_t gpa_start;
 	hva_t uaddr_start, uaddr_end;
···

 	/*
 	 * Pages belonging to memslots that don't have the same alignment
-	 * within a PMD for userspace and IPA cannot be mapped with stage-2
-	 * PMD entries, because we'll end up mapping the wrong pages.
+	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
 	 *
 	 * Consider a layout like the following:
 	 *
 	 *    memslot->userspace_addr:
 	 *    +-----+--------------------+--------------------+---+
-	 *    |abcde|fgh  Stage-1 PMD    |    Stage-1 PMD   tv|xyz|
+	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
 	 *    +-----+--------------------+--------------------+---+
 	 *
 	 *    memslot->base_gfn << PAGE_SIZE:
 	 *      +---+--------------------+--------------------+-----+
-	 *      |abc|def  Stage-2 PMD    |    Stage-2 PMD     |tvxyz|
+	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
 	 *      +---+--------------------+--------------------+-----+
 	 *
-	 * If we create those stage-2 PMDs, we'll end up with this incorrect
+	 * If we create those stage-2 blocks, we'll end up with this incorrect
 	 * mapping:
 	 *   d -> f
 	 *   e -> g
 	 *   f -> h
 	 */
-	if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
 		return false;

 	/*
 	 * Next, let's make sure we're not trying to map anything not covered
-	 * by the memslot. This means we have to prohibit PMD size mappings
-	 * for the beginning and end of a non-PMD aligned and non-PMD sized
+	 * by the memslot. This means we have to prohibit block size mappings
+	 * for the beginning and end of a non-block aligned and non-block sized
 	 * memory slot (illustrated by the head and tail parts of the
 	 * userspace view above containing pages 'abcde' and 'xyz',
 	 * respectively).
···
 	 * userspace_addr or the base_gfn, as both are equally aligned (per
 	 * the check above) and equally sized.
 	 */
-	return (hva & S2_PMD_MASK) >= uaddr_start &&
-	       (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+	return (hva & ~(map_size - 1)) >= uaddr_start &&
+	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
 }

 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
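
The rewritten helper generalises the old PMD-only checks by parameterising on map_size: since S2_PMD_MASK is just ~(S2_PMD_SIZE - 1), "x & (map_size - 1)" is the offset of x within a block and "x & ~(map_size - 1)" is the block-aligned base, for any power-of-two map_size (PAGE_SIZE, PMD_SIZE, PUD_SIZE, or a hugetlbfs size). A standalone demonstration of that arithmetic, with made-up addresses:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t map_size    = 1ULL << 21;	   /* 2MiB block */
		uint64_t gpa_start   = 0x80000000ULL;	   /* IPA base of the memslot */
		uint64_t uaddr_start = 0x7f0000100000ULL;  /* hva base of the memslot */
		uint64_t hva	     = 0x7f0000234567ULL;  /* faulting address */

		/* First check: the offset within a block must match on both sides. */
		printf("gpa offset %#llx, hva offset %#llx\n",
		       (unsigned long long)(gpa_start & (map_size - 1)),
		       (unsigned long long)(uaddr_start & (map_size - 1)));

		/* Second check: the block around hva must lie inside the memslot. */
		printf("block [%#llx, %#llx)\n",
		       (unsigned long long)(hva & ~(map_size - 1)),
		       (unsigned long long)((hva & ~(map_size - 1)) + map_size));
		return 0;
	}
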
···
 		return -EFAULT;
 	}

-	if (!fault_supports_stage2_pmd_mappings(memslot, hva))
-		force_pte = true;
-
-	if (logging_active)
-		force_pte = true;
-
 	/* Let's check if we will get back a huge page backed by hugetlbfs */
 	down_read(&current->mm->mmap_sem);
 	vma = find_vma_intersection(current->mm, hva, hva + 1);
···
 	}

 	vma_pagesize = vma_kernel_pagesize(vma);
+	if (logging_active ||
+	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+		force_pte = true;
+		vma_pagesize = PAGE_SIZE;
+	}
+
 	/*
 	 * The stage2 has a minimum of 2 level table (For arm64 see
 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
···
 	 * As for PUD huge maps, we must make sure that we have at least
 	 * 3 levels, i.e, PMD is not folded.
 	 */
-	if ((vma_pagesize == PMD_SIZE ||
-	     (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
-	    !force_pte) {
+	if (vma_pagesize == PMD_SIZE ||
+	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-	}
 	up_read(&current->mm->mmap_sem);

 	/* We need minimum second+third level pages */
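
In user_mem_abort(), the two separate force_pte checks are folded into one that also clamps vma_pagesize to PAGE_SIZE, which is why the later PMD_SIZE/PUD_SIZE comparison no longer needs a !force_pte clause. Summarised as a hypothetical helper (equivalent in intent to the inline logic above, not code from the patch):

	/*
	 * The VMA's page size is only kept when dirty logging is off and the
	 * memslot can actually back a block of that size at this hva;
	 * otherwise fall back to PTE-level (PAGE_SIZE) mappings.
	 */
	static unsigned long stage2_map_size(struct kvm_memory_slot *memslot,
					     unsigned long hva,
					     struct vm_area_struct *vma,
					     bool logging_active)
	{
		unsigned long vma_pagesize = vma_kernel_pagesize(vma);

		if (logging_active ||
		    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize))
			return PAGE_SIZE;

		return vma_pagesize;
	}
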
virt/kvm/arm/vgic/vgic-its.c (+19 -12)
···
 	u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
 	phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
 	int esz = GITS_BASER_ENTRY_SIZE(baser);
-	int index;
+	int index, idx;
 	gfn_t gfn;
+	bool ret;

 	switch (type) {
 	case GITS_BASER_TYPE_DEVICE:
···

 		if (eaddr)
 			*eaddr = addr;
-		return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+		goto out;
 	}

 	/* calculate and check the index into the 1st level */
···

 	if (eaddr)
 		*eaddr = indirect_ptr;
-	return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+out:
+	idx = srcu_read_lock(&its->dev->kvm->srcu);
+	ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+	srcu_read_unlock(&its->dev->kvm->srcu, idx);
+	return ret;
 }

 static int vgic_its_alloc_collection(struct vgic_its *its,
···
 	kfree(its);
 }

-int vgic_its_has_attr_regs(struct kvm_device *dev,
-			   struct kvm_device_attr *attr)
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+				  struct kvm_device_attr *attr)
 {
 	const struct vgic_register_region *region;
 	gpa_t offset = attr->attr;
···
 	return 0;
 }

-int vgic_its_attr_regs_access(struct kvm_device *dev,
-			      struct kvm_device_attr *attr,
-			      u64 *reg, bool is_write)
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+				     struct kvm_device_attr *attr,
+				     u64 *reg, bool is_write)
 {
 	const struct vgic_register_region *region;
 	struct vgic_its *its;
···
 	       ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
 		ite->collection->collection_id;
 	val = cpu_to_le64(val);
-	return kvm_write_guest(kvm, gpa, &val, ite_esz);
+	return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
 }

 /**
···
 	       (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
 	       (dev->num_eventid_bits - 1));
 	val = cpu_to_le64(val);
-	return kvm_write_guest(kvm, ptr, &val, dte_esz);
+	return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
 }

 /**
···
 	       ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
 	       collection->collection_id);
 	val = cpu_to_le64(val);
-	return kvm_write_guest(its->dev->kvm, gpa, &val, esz);
+	return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
 }

 static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
···
 	 */
 	val = 0;
 	BUG_ON(cte_esz > sizeof(val));
-	ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz);
+	ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
 	return ret;
 }

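
The vgic-its.c changes are three independent fixes/cleanups: vgic_its_check_id() now funnels both exits through an "out:" label that takes the SRCU read lock around kvm_is_visible_gfn(); the ITS table save paths switch from kvm_write_guest() to the new kvm_write_guest_lock(); and two functions used only in this file become static. The locking idiom at the new label, shown in isolation (illustrative wrapper that mirrors vgic_its_check_id(), not code from the patch):

	static bool gfn_is_visible_locked(struct kvm *kvm, gfn_t gfn)
	{
		bool ret;
		int idx;

		/* Memslot lookups such as kvm_is_visible_gfn() must run inside
		 * an SRCU read-side critical section. */
		idx = srcu_read_lock(&kvm->srcu);
		ret = kvm_is_visible_gfn(kvm, gfn);
		srcu_read_unlock(&kvm->srcu, idx);

		return ret;
	}
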
virt/kvm/arm/vgic/vgic-v3.c (+2 -2)
···
 	if (status) {
 		/* clear consumed data */
 		val &= ~(1 << bit_nr);
-		ret = kvm_write_guest(kvm, ptr, &val, 1);
+		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
 		if (ret)
 			return ret;
 	}
···
 		else
 			val &= ~(1 << bit_nr);

-		ret = kvm_write_guest(kvm, ptr, &val, 1);
+		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
 		if (ret)
 			return ret;
 	}
virt/kvm/arm/vgic/vgic.c (+10 -4)
···
 	 * either observe the new interrupt before or after doing this check,
 	 * and introducing additional synchronization mechanism doesn't change
 	 * this.
+	 *
+	 * Note that we still need to go through the whole thing if anything
+	 * can be directly injected (GICv4).
 	 */
-	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+	    !vgic_supports_direct_msis(vcpu->kvm))
 		return;

 	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());

-	raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
-	vgic_flush_lr_state(vcpu);
-	raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+		raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+		vgic_flush_lr_state(vcpu);
+		raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	}

 	if (can_access_vgic_from_kernel())
 		vgic_restore_state(vcpu);