Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: arm64: Move VTCR_EL2 into struct s2_mmu

We currently have a global VTCR_EL2 value for each guest, even
if the guest uses NV. This implies that the guest's own S2 must
fit in the host's. This is odd, for multiple reasons:

- the PARange values and the number of IPA bits don't necessarily
match: you can have 33 bits of IPA space, and yet you can only
describe 32 or 36 bits of PARange

- When userspace sets the IPA space, it creates a contract with the
kernel saying "this is the IPA space I'm prepared to handle".
At no point does it constrain the guest's own IPA space as
long as the guest doesn't try to use a [I]PA outside of the
IPA space set by userspace

- We don't even try to hide the value of ID_AA64MMFR0_EL1.PARange.

And then there is the consequence of the above: if a guest tries
to create a S2 that has for input address something that is larger
than the IPA space defined by the host, we inject a fatal exception.

This is no good. For all intents and purposes, a guest should be
able to have the S2 it really wants, as long as the *output* address
of that S2 isn't outside of the IPA space.

For that, we need to have a per-s2_mmu VTCR_EL2 setting, which
allows us to represent the full PARange. Move the vtcr field into
the s2_mmu structure, which has no impact whatsoever, except for NV.

Note that once we are able to override ID_AA64MMFR0_EL1.PARange
from userspace, we'll also be able to restrict the size of the
shadow S2 that NV uses.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20231012205108.3937270-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>

authored by

Marc Zyngier and committed by
Oliver Upton
fe49fd94 934bf871

+33 -24
+10 -3
arch/arm64/include/asm/kvm_host.h
··· 158 158 phys_addr_t pgd_phys; 159 159 struct kvm_pgtable *pgt; 160 160 161 + /* 162 + * VTCR value used on the host. For a non-NV guest (or a NV 163 + * guest that runs in a context where its own S2 doesn't 164 + * apply), its T0SZ value reflects that of the IPA size. 165 + * 166 + * For a shadow S2 MMU, T0SZ reflects the PARange exposed to 167 + * the guest. 168 + */ 169 + u64 vtcr; 170 + 161 171 /* The last vcpu id that ran on each physical CPU */ 162 172 int __percpu *last_vcpu_ran; 163 173 ··· 214 204 215 205 struct kvm_arch { 216 206 struct kvm_s2_mmu mmu; 217 - 218 - /* VTCR_EL2 value for this VM */ 219 - u64 vtcr; 220 207 221 208 /* Interrupt controller */ 222 209 struct vgic_dist vgic;
+4 -4
arch/arm64/include/asm/kvm_mmu.h
··· 150 150 */ 151 151 #define KVM_PHYS_SHIFT (40) 152 152 153 - #define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr) 154 - #define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) 155 - #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) 153 + #define kvm_phys_shift(mmu) VTCR_EL2_IPA((mmu)->vtcr) 154 + #define kvm_phys_size(mmu) (_AC(1, ULL) << kvm_phys_shift(mmu)) 155 + #define kvm_phys_mask(mmu) (kvm_phys_size(mmu) - _AC(1, ULL)) 156 156 157 157 #include <asm/kvm_pgtable.h> 158 158 #include <asm/stage2_pgtable.h> ··· 299 299 static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, 300 300 struct kvm_arch *arch) 301 301 { 302 - write_sysreg(arch->vtcr, vtcr_el2); 302 + write_sysreg(mmu->vtcr, vtcr_el2); 303 303 write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); 304 304 305 305 /*
+2 -2
arch/arm64/include/asm/stage2_pgtable.h
··· 21 21 * (IPA_SHIFT - 4). 22 22 */ 23 23 #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) 24 - #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) 24 + #define kvm_stage2_levels(mmu) VTCR_EL2_LVLS((mmu)->vtcr) 25 25 26 26 /* 27 27 * kvm_mmmu_cache_min_pages() is the number of pages required to install 28 28 * a stage-2 translation. We pre-allocate the entry level page table at 29 29 * the VM creation. 30 30 */ 31 - #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) 31 + #define kvm_mmu_cache_min_pages(mmu) (kvm_stage2_levels(mmu) - 1) 32 32 33 33 #endif /* __ARM64_S2_PGTABLE_H_ */
+4 -4
arch/arm64/kvm/hyp/nvhe/mem_protect.c
··· 129 129 parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); 130 130 phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); 131 131 132 - host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, 133 - id_aa64mmfr1_el1_sys_val, phys_shift); 132 + host_mmu.arch.mmu.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, 133 + id_aa64mmfr1_el1_sys_val, phys_shift); 134 134 } 135 135 136 136 static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); ··· 235 235 unsigned long nr_pages; 236 236 int ret; 237 237 238 - nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; 238 + nr_pages = kvm_pgtable_stage2_pgd_size(mmu->vtcr) >> PAGE_SHIFT; 239 239 ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0); 240 240 if (ret) 241 241 return ret; ··· 295 295 return -EPERM; 296 296 297 297 params->vttbr = kvm_get_vttbr(mmu); 298 - params->vtcr = host_mmu.arch.vtcr; 298 + params->vtcr = mmu->vtcr; 299 299 params->hcr_el2 |= HCR_VM; 300 300 301 301 /*
+2 -2
arch/arm64/kvm/hyp/nvhe/pkvm.c
··· 303 303 { 304 304 hyp_vm->host_kvm = host_kvm; 305 305 hyp_vm->kvm.created_vcpus = nr_vcpus; 306 - hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; 306 + hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; 307 307 } 308 308 309 309 static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, ··· 483 483 } 484 484 485 485 vm_size = pkvm_get_hyp_vm_size(nr_vcpus); 486 - pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr); 486 + pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr); 487 487 488 488 ret = -ENOMEM; 489 489
+1 -1
arch/arm64/kvm/hyp/pgtable.c
··· 1511 1511 kvm_pgtable_force_pte_cb_t force_pte_cb) 1512 1512 { 1513 1513 size_t pgd_sz; 1514 - u64 vtcr = mmu->arch->vtcr; 1514 + u64 vtcr = mmu->vtcr; 1515 1515 u32 ia_bits = VTCR_EL2_IPA(vtcr); 1516 1516 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); 1517 1517 u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+7 -6
arch/arm64/kvm/mmu.c
··· 892 892 893 893 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 894 894 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 895 - kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 895 + mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 896 896 897 897 if (mmu->pgt != NULL) { 898 898 kvm_err("kvm_arch already initialized?\n"); ··· 1067 1067 phys_addr_t addr; 1068 1068 int ret = 0; 1069 1069 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; 1070 - struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; 1070 + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; 1071 + struct kvm_pgtable *pgt = mmu->pgt; 1071 1072 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | 1072 1073 KVM_PGTABLE_PROT_R | 1073 1074 (writable ? KVM_PGTABLE_PROT_W : 0); ··· 1081 1080 1082 1081 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { 1083 1082 ret = kvm_mmu_topup_memory_cache(&cache, 1084 - kvm_mmu_cache_min_pages(kvm)); 1083 + kvm_mmu_cache_min_pages(mmu)); 1085 1084 if (ret) 1086 1085 break; 1087 1086 ··· 1432 1431 if (fault_status != ESR_ELx_FSC_PERM || 1433 1432 (logging_active && write_fault)) { 1434 1433 ret = kvm_mmu_topup_memory_cache(memcache, 1435 - kvm_mmu_cache_min_pages(kvm)); 1434 + kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); 1436 1435 if (ret) 1437 1436 return ret; 1438 1437 } ··· 1748 1747 } 1749 1748 1750 1749 /* Userspace should not be able to register out-of-bounds IPAs */ 1751 - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); 1750 + VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); 1752 1751 1753 1752 if (fault_status == ESR_ELx_FSC_ACCESS) { 1754 1753 handle_access_fault(vcpu, fault_ipa); ··· 2022 2021 * Prevent userspace from creating a memory region outside of the IPA 2023 2022 * space addressable by the KVM guest IPA space. 
2024 2023 */ 2025 - if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT)) 2024 + if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT)) 2026 2025 return -EFAULT; 2027 2026 2028 2027 hva = new->userspace_addr;
+1 -1
arch/arm64/kvm/pkvm.c
··· 123 123 if (host_kvm->created_vcpus < 1) 124 124 return -EINVAL; 125 125 126 - pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); 126 + pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr); 127 127 128 128 /* 129 129 * The PGD pages will be reclaimed using a hyp_memcache which implies
+2 -1
arch/arm64/kvm/vgic/vgic-kvm-device.c
··· 27 27 if (addr + size < addr) 28 28 return -EINVAL; 29 29 30 - if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm)) 30 + if (addr & ~kvm_phys_mask(&kvm->arch.mmu) || 31 + (addr + size) > kvm_phys_size(&kvm->arch.mmu)) 31 32 return -E2BIG; 32 33 33 34 return 0;