Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

kvm: arm64: Get rid of fake page table levels

On arm64, the hardware supports concatenation of up to 16 tables at the
entry level for stage-2 translations, and we make use of that whenever
possible. This could lead to a reduced number of translation levels
compared to the normal stage-1 table. Also, since the IPA (40bit) is
smaller than some of the supported VA_BITS values (e.g., 48bit), there
could be a different number of levels in stage-1 vs stage-2 tables. To reuse the
kernel host page table walker for stage2 we have been using a fake
software page table level, not known to the hardware. But with 16K
translations, there could be up to 2 fake software levels (with 48bit VA
and 40bit IPA), which complicates the code. Hence, we want to get rid of
the hack.

Now that we have explicit accessors for hyp vs stage2 page tables,
define the stage2 walker helpers accordingly based on the actual
table used by the hardware.

Once we know the number of translation levels used by the hardware,
it is merely a job of defining the helpers based on whether a
particular level is folded or not, looking at the number of levels.

Some facts before we calculate the translation levels:

1) Smallest page size supported by arm64 is 4K.
2) The minimum number of bits resolved at any page table level
is (PAGE_SHIFT - 3) at intermediate levels.
Both of these imply that the minimum number of bits required for a level
change is 9.

Since we can concatenate up to 16 tables at the stage-2 entry, the total
number of page table levels used by the hardware for resolving N bits
is the same as that for (N - 4) bits (with concatenation), as there cannot
be a level in between (N, N-4) as per the above rules.

Hence, we have

STAGE2_PGTABLE_LEVELS = PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)

With the current IPA limit (40bit), for all supported translations
and VA_BITS, we have the following condition (even for 36bit VA with
16K page size):

CONFIG_PGTABLE_LEVELS >= STAGE2_PGTABLE_LEVELS.

So, for example, if the PUD is present in stage2, it is present in the hyp (host).
Hence, we fall back to the host definition if we find that a level is not
folded. Otherwise we redefine it accordingly. A build time check is added
to make sure the above condition holds. If this condition breaks in future,
we can rearrange the host level helpers and fix our code easily.

Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>

authored by

Suzuki K Poulose and committed by
Christoffer Dall
da04fa04 8684e701

+172 -95
+2 -62
arch/arm64/include/asm/kvm_mmu.h
··· 45 45 */ 46 46 #define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK) 47 47 48 - /* 49 - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation 50 - * levels in addition to the PGD and potentially the PUD which are 51 - * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2 52 - * tables use one level of tables less than the kernel. 53 - */ 54 - #ifdef CONFIG_ARM64_64K_PAGES 55 - #define KVM_MMU_CACHE_MIN_PAGES 1 56 - #else 57 - #define KVM_MMU_CACHE_MIN_PAGES 2 58 - #endif 59 - 60 48 #ifdef __ASSEMBLY__ 61 49 62 50 #include <asm/alternative.h> ··· 143 155 144 156 static inline void *kvm_get_hwpgd(struct kvm *kvm) 145 157 { 146 - pgd_t *pgd = kvm->arch.pgd; 147 - pud_t *pud; 148 - 149 - if (KVM_PREALLOC_LEVEL == 0) 150 - return pgd; 151 - 152 - pud = pud_offset(pgd, 0); 153 - if (KVM_PREALLOC_LEVEL == 1) 154 - return pud; 155 - 156 - BUG_ON(KVM_PREALLOC_LEVEL != 2); 157 - return pmd_offset(pud, 0); 158 + return kvm->arch.pgd; 158 159 } 159 160 160 161 static inline unsigned int kvm_get_hwpgd_size(void) 161 162 { 162 - if (KVM_PREALLOC_LEVEL > 0) 163 - return PTRS_PER_S2_PGD * PAGE_SIZE; 164 163 return PTRS_PER_S2_PGD * sizeof(pgd_t); 165 164 } 166 165 167 - /* 168 - * Allocate fake pgd for the host kernel page table macros to work. 169 - * This is not used by the hardware and we have no alignment 170 - * requirement for this allocation. 171 - */ 172 166 static inline pgd_t *kvm_setup_fake_pgd(pgd_t *hwpgd) 173 167 { 174 - int i; 175 - pgd_t *pgd; 176 - 177 - if (!KVM_PREALLOC_LEVEL) 178 - return hwpgd; 179 - 180 - /* 181 - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for 182 - * the PMD and the kernel will use folded pud. 183 - * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD 184 - * pages. 185 - */ 186 - 187 - pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t), 188 - GFP_KERNEL | __GFP_ZERO); 189 - if (!pgd) 190 - return ERR_PTR(-ENOMEM); 191 - 192 - /* Plug the HW PGD into the fake one. 
*/ 193 - for (i = 0; i < PTRS_PER_S2_PGD; i++) { 194 - if (KVM_PREALLOC_LEVEL == 1) 195 - pgd_populate(NULL, pgd + i, 196 - (pud_t *)hwpgd + i * PTRS_PER_PUD); 197 - else if (KVM_PREALLOC_LEVEL == 2) 198 - pud_populate(NULL, pud_offset(pgd, 0) + i, 199 - (pmd_t *)hwpgd + i * PTRS_PER_PMD); 200 - } 201 - 202 - return pgd; 168 + return hwpgd; 203 169 } 204 170 205 171 static inline void kvm_free_fake_pgd(pgd_t *pgd) 206 172 { 207 - if (KVM_PREALLOC_LEVEL > 0) 208 - kfree(pgd); 209 173 } 210 174 static inline bool kvm_page_empty(void *ptr) 211 175 {
+42
arch/arm64/include/asm/stage2_pgtable-nopmd.h
··· 1 + /* 2 + * Copyright (C) 2016 - ARM Ltd 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of the GNU General Public License version 2 as 6 + * published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 + * GNU General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public License 14 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 + */ 16 + 17 + #ifndef __ARM64_S2_PGTABLE_NOPMD_H_ 18 + #define __ARM64_S2_PGTABLE_NOPMD_H_ 19 + 20 + #include <asm/stage2_pgtable-nopud.h> 21 + 22 + #define __S2_PGTABLE_PMD_FOLDED 23 + 24 + #define S2_PMD_SHIFT S2_PUD_SHIFT 25 + #define S2_PTRS_PER_PMD 1 26 + #define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) 27 + #define S2_PMD_MASK (~(S2_PMD_SIZE-1)) 28 + 29 + #define stage2_pud_none(pud) (0) 30 + #define stage2_pud_present(pud) (1) 31 + #define stage2_pud_clear(pud) do { } while (0) 32 + #define stage2_pud_populate(pud, pmd) do { } while (0) 33 + #define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) 34 + 35 + #define stage2_pmd_free(pmd) do { } while (0) 36 + 37 + #define stage2_pmd_addr_end(addr, end) (end) 38 + 39 + #define stage2_pud_huge(pud) (0) 40 + #define stage2_pmd_table_empty(pmdp) (0) 41 + 42 + #endif
+39
arch/arm64/include/asm/stage2_pgtable-nopud.h
··· 1 + /* 2 + * Copyright (C) 2016 - ARM Ltd 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of the GNU General Public License version 2 as 6 + * published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, 9 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 + * GNU General Public License for more details. 12 + * 13 + * You should have received a copy of the GNU General Public License 14 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 + */ 16 + 17 + #ifndef __ARM64_S2_PGTABLE_NOPUD_H_ 18 + #define __ARM64_S2_PGTABLE_NOPUD_H_ 19 + 20 + #define __S2_PGTABLE_PUD_FOLDED 21 + 22 + #define S2_PUD_SHIFT S2_PGDIR_SHIFT 23 + #define S2_PTRS_PER_PUD 1 24 + #define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) 25 + #define S2_PUD_MASK (~(S2_PUD_SIZE-1)) 26 + 27 + #define stage2_pgd_none(pgd) (0) 28 + #define stage2_pgd_present(pgd) (1) 29 + #define stage2_pgd_clear(pgd) do { } while (0) 30 + #define stage2_pgd_populate(pgd, pud) do { } while (0) 31 + 32 + #define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) 33 + 34 + #define stage2_pud_free(x) do { } while (0) 35 + 36 + #define stage2_pud_addr_end(addr, end) (end) 37 + #define stage2_pud_table_empty(pmdp) (0) 38 + 39 + #endif
+89 -33
arch/arm64/include/asm/stage2_pgtable.h
··· 22 22 #include <asm/pgtable.h> 23 23 24 24 /* 25 - * In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address 26 - * the entire IPA input range with a single pgd entry, and we would only need 27 - * one pgd entry. Note that in this case, the pgd is actually not used by 28 - * the MMU for Stage-2 translations, but is merely a fake pgd used as a data 29 - * structure for the kernel pgtable macros to work. 25 + * The hardware supports concatenation of up to 16 tables at stage2 entry level 26 + * and we use the feature whenever possible. 27 + * 28 + * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). 29 + * On arm64, the smallest PAGE_SIZE supported is 4k, which means 30 + * (PAGE_SHIFT - 3) > 4 holds for all page sizes. 31 + * This implies, the total number of page table levels at stage2 expected 32 + * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) 33 + * in normal translations(e.g, stage1), since we cannot have another level in 34 + * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). 30 35 */ 31 - #if PGDIR_SHIFT > KVM_PHYS_SHIFT 32 - #define PTRS_PER_S2_PGD_SHIFT 0 33 - #else 34 - #define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT) 35 - #endif 36 - #define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) 36 + #define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) 37 37 38 38 /* 39 - * If we are concatenating first level stage-2 page tables, we would have less 40 - * than or equal to 16 pointers in the fake PGD, because that's what the 41 - * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS) 42 - * represents the first level for the host, and we add 1 to go to the next 43 - * level (which uses contatenation) for the stage-2 tables. 
39 + * With all the supported VA_BITs and 40bit guest IPA, the following condition 40 + * is always true: 41 + * 42 + * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS 43 + * 44 + * We base our stage-2 page table walker helpers on this assumption and 45 + * fall back to using the host version of the helper wherever possible. 46 + * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back 47 + * to using the host version, since it is guaranteed it is not folded at host. 48 + * 49 + * If the condition breaks in the future, we can rearrange the host level 50 + * definitions and reuse them for stage2. Till then... 44 51 */ 45 - #if PTRS_PER_S2_PGD <= 16 46 - #define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1) 47 - #else 48 - #define KVM_PREALLOC_LEVEL (0) 52 + #if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS 53 + #error "Unsupported combination of guest IPA and host VA_BITS." 49 54 #endif 55 + 56 + /* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ 57 + #define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) 58 + #define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) 59 + #define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) 60 + 61 + /* 62 + * The number of PTRS across all concatenated stage2 tables given by the 63 + * number of bits resolved at the initial level. 64 + */ 65 + #define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) 66 + 67 + /* 68 + * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation 69 + * levels in addition to the PGD. 
70 + */ 71 + #define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) 72 + 73 + 74 + #if STAGE2_PGTABLE_LEVELS > 3 75 + 76 + #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) 77 + #define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) 78 + #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) 50 79 51 80 #define stage2_pgd_none(pgd) pgd_none(pgd) 52 81 #define stage2_pgd_clear(pgd) pgd_clear(pgd) ··· 83 54 #define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) 84 55 #define stage2_pud_offset(pgd, address) pud_offset(pgd, address) 85 56 #define stage2_pud_free(pud) pud_free(NULL, pud) 57 + 58 + #define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) 59 + 60 + static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) 61 + { 62 + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; 63 + 64 + return (boundary - 1 < end - 1) ? boundary : end; 65 + } 66 + 67 + #endif /* STAGE2_PGTABLE_LEVELS > 3 */ 68 + 69 + 70 + #if STAGE2_PGTABLE_LEVELS > 2 71 + 72 + #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) 73 + #define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) 74 + #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) 86 75 87 76 #define stage2_pud_none(pud) pud_none(pud) 88 77 #define stage2_pud_clear(pud) pud_clear(pud) ··· 110 63 #define stage2_pmd_free(pmd) pmd_free(NULL, pmd) 111 64 112 65 #define stage2_pud_huge(pud) pud_huge(pud) 66 + #define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) 113 67 114 - #define stage2_pgd_addr_end(address, end) pgd_addr_end(address, end) 115 - #define stage2_pud_addr_end(address, end) pud_addr_end(address, end) 116 - #define stage2_pmd_addr_end(address, end) pmd_addr_end(address, end) 68 + static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) 69 + { 70 + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; 71 + 72 + return (boundary - 1 < end - 1) ? 
boundary : end; 73 + } 74 + 75 + #endif /* STAGE2_PGTABLE_LEVELS > 2 */ 117 76 118 77 #define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) 119 - #ifdef __PGTABLE_PMD_FOLDED 120 - #define stage2_pmd_table_empty(pmdp) (0) 121 - #else 122 - #define stage2_pmd_table_empty(pmdp) ((KVM_PREALLOC_LEVEL < 2) && kvm_page_empty(pmdp)) 78 + 79 + #if STAGE2_PGTABLE_LEVELS == 2 80 + #include <asm/stage2_pgtable-nopmd.h> 81 + #elif STAGE2_PGTABLE_LEVELS == 3 82 + #include <asm/stage2_pgtable-nopud.h> 123 83 #endif 124 84 125 - #ifdef __PGTABLE_PUD_FOLDED 126 - #define stage2_pud_table_empty(pudp) (0) 127 - #else 128 - #define stage2_pud_table_empty(pudp) ((KVM_PREALLOC_LEVEL < 1) && kvm_page_empty(pudp)) 129 - #endif 130 85 131 - #define stage2_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) 86 + #define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) 87 + 88 + static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) 89 + { 90 + phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; 91 + 92 + return (boundary - 1 < end - 1) ? boundary : end; 93 + } 132 94 133 95 #endif /* __ARM64_S2_PGTABLE_H_ */