Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S HV: Handle page fault for a nested guest

Consider a normal (L1) guest running under the main hypervisor (L0),
and then a nested guest (L2) running under the L1 guest which is acting
as a nested hypervisor. L0 has page tables to map the address space for
L1 providing the translation from L1 real address -> L0 real address;

L1
|
| (L1 -> L0)
|
----> L0

There are also page tables in L1 used to map the address space for L2
providing the translation from L2 real address -> L1 real address. Since
the hardware can only walk a single level of page table, we need to
maintain in L0 a "shadow_pgtable" for L2 which provides the translation
from L2 real address -> L0 real address, which looks like:

L2 L2
| |
| (L2 -> L1) |
| |
----> L1 | (L2 -> L0)
| |
| (L1 -> L0) |
| |
----> L0 --------> L0

When a page fault occurs while running a nested (L2) guest we need to
insert a pte into this "shadow_pgtable" for the L2 -> L0 mapping. To
do this we need to:

1. Walk the pgtable in L1 memory to find the L2 -> L1 mapping, and
provide a page fault to L1 if this mapping doesn't exist.
2. Use our L1 -> L0 pgtable to convert this L1 address to an L0 address,
or try to insert a pte for that mapping if it doesn't exist.
3. Now we have a L2 -> L0 mapping, insert this into our shadow_pgtable

Once this mapping exists we can take rc faults when hardware is unable
to automatically set the reference and change bits in the pte. On these
we need to:

1. Check the rc bits on the L2 -> L1 pte match, and otherwise reflect
the fault down to L1.
2. Set the rc bits in the L1 -> L0 pte which corresponds to the same
host page.
3. Set the rc bits in the L2 -> L0 pte.

As we reuse a large number of functions in book3s_64_mmu_radix.c for
this we also needed to refactor a number of these functions to take
an lpid parameter so that the correct lpid is used for tlb invalidations.
The functionality however has remained the same.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Suraj Jitindar Singh and committed by
Michael Ellerman
fd10be25 4bad7779

+473 -86
+1
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
··· 53 53 unsigned long addr, 54 54 unsigned long page_size); 55 55 extern void radix__flush_pwc_lpid(unsigned int lpid); 56 + extern void radix__flush_tlb_lpid(unsigned int lpid); 56 57 extern void radix__local_flush_tlb_lpid(unsigned int lpid); 57 58 extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); 58 59
+17
arch/powerpc/include/asm/kvm_book3s.h
··· 188 188 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, 189 189 struct kvm_vcpu *vcpu, 190 190 unsigned long ea, unsigned long dsisr); 191 + extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, 192 + struct kvmppc_pte *gpte, u64 root, 193 + u64 *pte_ret_p); 191 194 extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, 192 195 struct kvmppc_pte *gpte, u64 table, 193 196 int table_index, u64 *pte_ret_p); 194 197 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 195 198 struct kvmppc_pte *gpte, bool data, bool iswrite); 199 + extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, 200 + bool writing, unsigned long gpa, 201 + unsigned int lpid); 202 + extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, 203 + unsigned long gpa, 204 + struct kvm_memory_slot *memslot, 205 + bool writing, bool kvm_ro, 206 + pte_t *inserted_pte, unsigned int *levelp); 196 207 extern int kvmppc_init_vm_radix(struct kvm *kvm); 197 208 extern void kvmppc_free_radix(struct kvm *kvm); 209 + extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, 210 + unsigned int lpid); 198 211 extern int kvmppc_radix_init(void); 199 212 extern void kvmppc_radix_exit(void); 200 213 extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 201 214 unsigned long gfn); 215 + extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, 216 + unsigned long gpa, unsigned int shift, 217 + struct kvm_memory_slot *memslot, 218 + unsigned int lpid); 202 219 extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 203 220 unsigned long gfn); 204 221 extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+4
arch/powerpc/include/asm/kvm_book3s_64.h
··· 549 549 } 550 550 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ 551 551 552 + extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, 553 + unsigned long gpa, unsigned int level, 554 + unsigned long mmu_seq, unsigned int lpid); 555 + 552 556 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 553 557 554 558 #endif /* __ASM_KVM_BOOK3S_64_H__ */
+2
arch/powerpc/include/asm/kvm_host.h
··· 367 367 bool may_write : 1; 368 368 bool may_execute : 1; 369 369 unsigned long wimg; 370 + unsigned long rc; 370 371 u8 page_size; /* MMU_PAGE_xxx */ 372 + u8 page_shift; 371 373 }; 372 374 373 375 struct kvmppc_mmu {
+111 -83
arch/powerpc/kvm/book3s_64_mmu_radix.c
··· 29 29 */ 30 30 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; 31 31 32 - /* 33 - * Used to walk a partition or process table radix tree in guest memory 34 - * Note: We exploit the fact that a partition table and a process 35 - * table have the same layout, a partition-scoped page table and a 36 - * process-scoped page table have the same layout, and the 2nd 37 - * doubleword of a partition table entry has the same layout as 38 - * the PTCR register. 39 - */ 40 - int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, 41 - struct kvmppc_pte *gpte, u64 table, 42 - int table_index, u64 *pte_ret_p) 32 + int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, 33 + struct kvmppc_pte *gpte, u64 root, 34 + u64 *pte_ret_p) 43 35 { 44 36 struct kvm *kvm = vcpu->kvm; 45 37 int ret, level, ps; 46 - unsigned long ptbl, root; 47 - unsigned long rts, bits, offset; 48 - unsigned long size, index; 49 - struct prtb_entry entry; 38 + unsigned long rts, bits, offset, index; 50 39 u64 pte, base, gpa; 51 40 __be64 rpte; 52 41 53 - if ((table & PRTS_MASK) > 24) 54 - return -EINVAL; 55 - size = 1ul << ((table & PRTS_MASK) + 12); 56 - 57 - /* Is the table big enough to contain this entry? 
*/ 58 - if ((table_index * sizeof(entry)) >= size) 59 - return -EINVAL; 60 - 61 - /* Read the table to find the root of the radix tree */ 62 - ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry)); 63 - ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry)); 64 - if (ret) 65 - return ret; 66 - 67 - /* Root is stored in the first double word */ 68 - root = be64_to_cpu(entry.prtb0); 69 42 rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | 70 43 ((root & RTS2_MASK) >> RTS2_SHIFT); 71 44 bits = root & RPDS_MASK; ··· 52 79 53 80 /* Walk each level of the radix tree */ 54 81 for (level = 3; level >= 0; --level) { 82 + u64 addr; 55 83 /* Check a valid size */ 56 84 if (level && bits != p9_supported_radix_bits[level]) 57 85 return -EINVAL; ··· 64 90 if (base & ((1UL << (bits + 3)) - 1)) 65 91 return -EINVAL; 66 92 /* Read the entry from guest memory */ 67 - ret = kvm_read_guest(kvm, base + (index * sizeof(rpte)), 68 - &rpte, sizeof(rpte)); 69 - if (ret) 93 + addr = base + (index * sizeof(rpte)); 94 + ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte)); 95 + if (ret) { 96 + if (pte_ret_p) 97 + *pte_ret_p = addr; 70 98 return ret; 99 + } 71 100 pte = __be64_to_cpu(rpte); 72 101 if (!(pte & _PAGE_PRESENT)) 73 102 return -ENOENT; ··· 96 119 if (offset == mmu_psize_defs[ps].shift) 97 120 break; 98 121 gpte->page_size = ps; 122 + gpte->page_shift = offset; 99 123 100 124 gpte->eaddr = eaddr; 101 125 gpte->raddr = gpa; ··· 106 128 gpte->may_write = !!(pte & _PAGE_WRITE); 107 129 gpte->may_execute = !!(pte & _PAGE_EXEC); 108 130 131 + gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY); 132 + 109 133 if (pte_ret_p) 110 134 *pte_ret_p = pte; 111 135 112 136 return 0; 137 + } 138 + 139 + /* 140 + * Used to walk a partition or process table radix tree in guest memory 141 + * Note: We exploit the fact that a partition table and a process 142 + * table have the same layout, a partition-scoped page table and a 143 + * process-scoped page table have the same layout, and the 2nd 144 + * 
doubleword of a partition table entry has the same layout as 145 + * the PTCR register. 146 + */ 147 + int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, 148 + struct kvmppc_pte *gpte, u64 table, 149 + int table_index, u64 *pte_ret_p) 150 + { 151 + struct kvm *kvm = vcpu->kvm; 152 + int ret; 153 + unsigned long size, ptbl, root; 154 + struct prtb_entry entry; 155 + 156 + if ((table & PRTS_MASK) > 24) 157 + return -EINVAL; 158 + size = 1ul << ((table & PRTS_MASK) + 12); 159 + 160 + /* Is the table big enough to contain this entry? */ 161 + if ((table_index * sizeof(entry)) >= size) 162 + return -EINVAL; 163 + 164 + /* Read the table to find the root of the radix tree */ 165 + ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry)); 166 + ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry)); 167 + if (ret) 168 + return ret; 169 + 170 + /* Root is stored in the first double word */ 171 + root = be64_to_cpu(entry.prtb0); 172 + 173 + return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p); 113 174 } 114 175 115 176 int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, ··· 198 181 } 199 182 200 183 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, 201 - unsigned int pshift) 184 + unsigned int pshift, unsigned int lpid) 202 185 { 203 186 unsigned long psize = PAGE_SIZE; 204 187 ··· 206 189 psize = 1UL << pshift; 207 190 208 191 addr &= ~(psize - 1); 209 - radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize); 192 + radix__flush_tlb_lpid_page(lpid, addr, psize); 210 193 } 211 194 212 - static void kvmppc_radix_flush_pwc(struct kvm *kvm) 195 + static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid) 213 196 { 214 - radix__flush_pwc_lpid(kvm->arch.lpid); 197 + radix__flush_pwc_lpid(lpid); 215 198 } 216 199 217 200 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, ··· 256 239 kmem_cache_free(kvm_pmd_cache, pmdp); 257 240 } 258 241 259 - static void 
kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, 260 - unsigned long gpa, unsigned int shift, 261 - struct kvm_memory_slot *memslot) 242 + void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, 243 + unsigned long gpa, unsigned int shift, 244 + struct kvm_memory_slot *memslot, 245 + unsigned int lpid) 262 246 263 247 { 264 248 unsigned long old; 265 249 266 250 old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift); 267 - kvmppc_radix_tlbie_page(kvm, gpa, shift); 268 - if (old & _PAGE_DIRTY) { 251 + kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid); 252 + if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) { 269 253 unsigned long gfn = gpa >> PAGE_SHIFT; 270 254 unsigned long page_size = PAGE_SIZE; 271 255 ··· 289 271 * and emit a warning if encountered, but there may already be data 290 272 * corruption due to the unexpected mappings. 291 273 */ 292 - static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) 274 + static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full, 275 + unsigned int lpid) 293 276 { 294 277 if (full) { 295 278 memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); ··· 304 285 WARN_ON_ONCE(1); 305 286 kvmppc_unmap_pte(kvm, p, 306 287 pte_pfn(*p) << PAGE_SHIFT, 307 - PAGE_SHIFT, NULL); 288 + PAGE_SHIFT, NULL, lpid); 308 289 } 309 290 } 310 291 311 292 kvmppc_pte_free(pte); 312 293 } 313 294 314 - static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) 295 + static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full, 296 + unsigned int lpid) 315 297 { 316 298 unsigned long im; 317 299 pmd_t *p = pmd; ··· 327 307 WARN_ON_ONCE(1); 328 308 kvmppc_unmap_pte(kvm, (pte_t *)p, 329 309 pte_pfn(*(pte_t *)p) << PAGE_SHIFT, 330 - PMD_SHIFT, NULL); 310 + PMD_SHIFT, NULL, lpid); 331 311 } 332 312 } else { 333 313 pte_t *pte; 334 314 335 315 pte = pte_offset_map(p, 0); 336 - kvmppc_unmap_free_pte(kvm, pte, full); 316 + kvmppc_unmap_free_pte(kvm, pte, full, lpid); 337 317 pmd_clear(p); 338 318 } 339 319 } 
340 320 kvmppc_pmd_free(pmd); 341 321 } 342 322 343 - static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) 323 + static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud, 324 + unsigned int lpid) 344 325 { 345 326 unsigned long iu; 346 327 pud_t *p = pud; ··· 355 334 pmd_t *pmd; 356 335 357 336 pmd = pmd_offset(p, 0); 358 - kvmppc_unmap_free_pmd(kvm, pmd, true); 337 + kvmppc_unmap_free_pmd(kvm, pmd, true, lpid); 359 338 pud_clear(p); 360 339 } 361 340 } 362 341 pud_free(kvm->mm, pud); 363 342 } 364 343 365 - void kvmppc_free_radix(struct kvm *kvm) 344 + void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) 366 345 { 367 346 unsigned long ig; 368 - pgd_t *pgd; 369 347 370 - if (!kvm->arch.pgtable) 371 - return; 372 - pgd = kvm->arch.pgtable; 373 348 for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { 374 349 pud_t *pud; 375 350 376 351 if (!pgd_present(*pgd)) 377 352 continue; 378 353 pud = pud_offset(pgd, 0); 379 - kvmppc_unmap_free_pud(kvm, pud); 354 + kvmppc_unmap_free_pud(kvm, pud, lpid); 380 355 pgd_clear(pgd); 381 356 } 382 - pgd_free(kvm->mm, kvm->arch.pgtable); 383 - kvm->arch.pgtable = NULL; 357 + } 358 + 359 + void kvmppc_free_radix(struct kvm *kvm) 360 + { 361 + if (kvm->arch.pgtable) { 362 + kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable, 363 + kvm->arch.lpid); 364 + pgd_free(kvm->mm, kvm->arch.pgtable); 365 + kvm->arch.pgtable = NULL; 366 + } 384 367 } 385 368 386 369 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, 387 - unsigned long gpa) 370 + unsigned long gpa, unsigned int lpid) 388 371 { 389 372 pte_t *pte = pte_offset_kernel(pmd, 0); 390 373 ··· 398 373 * flushing the PWC again. 
399 374 */ 400 375 pmd_clear(pmd); 401 - kvmppc_radix_flush_pwc(kvm); 376 + kvmppc_radix_flush_pwc(kvm, lpid); 402 377 403 - kvmppc_unmap_free_pte(kvm, pte, false); 378 + kvmppc_unmap_free_pte(kvm, pte, false, lpid); 404 379 } 405 380 406 381 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, 407 - unsigned long gpa) 382 + unsigned long gpa, unsigned int lpid) 408 383 { 409 384 pmd_t *pmd = pmd_offset(pud, 0); 410 385 ··· 414 389 * so can be freed without flushing the PWC again. 415 390 */ 416 391 pud_clear(pud); 417 - kvmppc_radix_flush_pwc(kvm); 392 + kvmppc_radix_flush_pwc(kvm, lpid); 418 393 419 - kvmppc_unmap_free_pmd(kvm, pmd, false); 394 + kvmppc_unmap_free_pmd(kvm, pmd, false, lpid); 420 395 } 421 396 422 397 /* ··· 428 403 */ 429 404 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)) 430 405 431 - static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, 432 - unsigned long gpa, unsigned int level, 433 - unsigned long mmu_seq) 406 + int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, 407 + unsigned long gpa, unsigned int level, 408 + unsigned long mmu_seq, unsigned int lpid) 434 409 { 435 410 pgd_t *pgd; 436 411 pud_t *pud, *new_pud = NULL; ··· 496 471 goto out_unlock; 497 472 } 498 473 /* Valid 1GB page here already, remove it */ 499 - kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL); 474 + kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL, 475 + lpid); 500 476 } 501 477 if (level == 2) { 502 478 if (!pud_none(*pud)) { ··· 506 480 * install a large page, so remove and free the page 507 481 * table page. 
508 482 */ 509 - kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa); 483 + kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid); 510 484 } 511 485 kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); 512 486 ret = 0; ··· 532 506 WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) & 533 507 PTE_BITS_MUST_MATCH); 534 508 kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), 535 - 0, pte_val(pte), lgpa, PMD_SHIFT); 509 + 0, pte_val(pte), lgpa, PMD_SHIFT); 536 510 ret = 0; 537 511 goto out_unlock; 538 512 } ··· 546 520 goto out_unlock; 547 521 } 548 522 /* Valid 2MB page here already, remove it */ 549 - kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL); 523 + kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL, 524 + lpid); 550 525 } 551 526 if (level == 1) { 552 527 if (!pmd_none(*pmd)) { ··· 556 529 * install a large page, so remove and free the page 557 530 * table page. 558 531 */ 559 - kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa); 532 + kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid); 560 533 } 561 534 kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); 562 535 ret = 0; ··· 596 569 return ret; 597 570 } 598 571 599 - static bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, 600 - bool writing, unsigned long gpa) 572 + bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, 573 + unsigned long gpa, unsigned int lpid) 601 574 { 602 575 unsigned long pgflags; 603 576 unsigned int shift; ··· 624 597 return false; 625 598 } 626 599 627 - static int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, 628 - unsigned long gpa, 629 - struct kvm_memory_slot *memslot, 630 - bool writing, bool kvm_ro, 631 - pte_t *inserted_pte, unsigned int *levelp) 600 + int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, 601 + unsigned long gpa, 602 + struct kvm_memory_slot *memslot, 603 + bool writing, bool kvm_ro, 604 + pte_t *inserted_pte, unsigned int *levelp) 632 605 { 633 606 struct kvm *kvm = vcpu->kvm; 634 607 struct page 
*page = NULL; ··· 710 683 711 684 /* Allocate space in the tree and write the PTE */ 712 685 ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level, 713 - mmu_seq); 686 + mmu_seq, kvm->arch.lpid); 714 687 if (inserted_pte) 715 688 *inserted_pte = pte; 716 689 if (levelp) ··· 785 758 if (dsisr & DSISR_SET_RC) { 786 759 spin_lock(&kvm->mmu_lock); 787 760 if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, 788 - writing, gpa)) 761 + writing, gpa, kvm->arch.lpid)) 789 762 dsisr &= ~DSISR_SET_RC; 790 763 spin_unlock(&kvm->mmu_lock); 791 764 ··· 813 786 814 787 ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); 815 788 if (ptep && pte_present(*ptep)) 816 - kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot); 789 + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, 790 + kvm->arch.lpid); 817 791 return 0; 818 792 } 819 793 ··· 869 841 ret = 1 << (shift - PAGE_SHIFT); 870 842 kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, 871 843 gpa, shift); 872 - kvmppc_radix_tlbie_page(kvm, gpa, shift); 844 + kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); 873 845 } 874 846 return ret; 875 847 }
+329 -3
arch/powerpc/kvm/book3s_hv_nested.c
··· 12 12 #include <linux/kvm_host.h> 13 13 14 14 #include <asm/kvm_ppc.h> 15 + #include <asm/kvm_book3s.h> 15 16 #include <asm/mmu.h> 16 17 #include <asm/pgtable.h> 17 18 #include <asm/pgalloc.h> 19 + #include <asm/pte-walk.h> 20 + #include <asm/reg.h> 18 21 19 22 static struct patb_entry *pseries_partition_tb; 20 23 ··· 406 403 */ 407 404 static void kvmhv_release_nested(struct kvm_nested_guest *gp) 408 405 { 406 + struct kvm *kvm = gp->l1_host; 407 + 408 + if (gp->shadow_pgtable) { 409 + /* 410 + * No vcpu is using this struct and no call to 411 + * kvmhv_get_nested can find this struct, 412 + * so we don't need to hold kvm->mmu_lock. 413 + */ 414 + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, 415 + gp->shadow_lpid); 416 + pgd_free(kvm->mm, gp->shadow_pgtable); 417 + } 409 418 kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0); 410 419 kvmppc_free_lpid(gp->shadow_lpid); 411 - if (gp->shadow_pgtable) 412 - pgd_free(gp->l1_host->mm, gp->shadow_pgtable); 413 420 kfree(gp); 414 421 } 415 422 ··· 479 466 /* caller must hold gp->tlb_lock */ 480 467 void kvmhv_flush_nested(struct kvm_nested_guest *gp) 481 468 { 469 + struct kvm *kvm = gp->l1_host; 470 + 471 + spin_lock(&kvm->mmu_lock); 472 + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid); 473 + spin_unlock(&kvm->mmu_lock); 474 + radix__flush_tlb_lpid(gp->shadow_lpid); 482 475 kvmhv_update_ptbl_cache(gp); 483 476 if (gp->l1_gr_to_hr == 0) 484 477 kvmhv_remove_nested(gp); ··· 544 525 kvmhv_release_nested(gp); 545 526 } 546 527 547 - long kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) 528 + static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu, 529 + struct kvm_nested_guest *gp, 530 + long gpa, int *shift_ret) 548 531 { 532 + struct kvm *kvm = vcpu->kvm; 533 + bool ret = false; 534 + pte_t *ptep; 535 + int shift; 536 + 537 + spin_lock(&kvm->mmu_lock); 538 + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); 539 + if (!shift) 540 + shift = PAGE_SHIFT; 541 + if (ptep && 
pte_present(*ptep)) { 542 + kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); 543 + ret = true; 544 + } 545 + spin_unlock(&kvm->mmu_lock); 546 + 547 + if (shift_ret) 548 + *shift_ret = shift; 549 + return ret; 550 + } 551 + 552 + /* Used to convert a nested guest real address to a L1 guest real address */ 553 + static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu, 554 + struct kvm_nested_guest *gp, 555 + unsigned long n_gpa, unsigned long dsisr, 556 + struct kvmppc_pte *gpte_p) 557 + { 558 + u64 fault_addr, flags = dsisr & DSISR_ISSTORE; 559 + int ret; 560 + 561 + ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr, 562 + &fault_addr); 563 + 564 + if (ret) { 565 + /* We didn't find a pte */ 566 + if (ret == -EINVAL) { 567 + /* Unsupported mmu config */ 568 + flags |= DSISR_UNSUPP_MMU; 569 + } else if (ret == -ENOENT) { 570 + /* No translation found */ 571 + flags |= DSISR_NOHPTE; 572 + } else if (ret == -EFAULT) { 573 + /* Couldn't access L1 real address */ 574 + flags |= DSISR_PRTABLE_FAULT; 575 + vcpu->arch.fault_gpa = fault_addr; 576 + } else { 577 + /* Unknown error */ 578 + return ret; 579 + } 580 + goto forward_to_l1; 581 + } else { 582 + /* We found a pte -> check permissions */ 583 + if (dsisr & DSISR_ISSTORE) { 584 + /* Can we write? */ 585 + if (!gpte_p->may_write) { 586 + flags |= DSISR_PROTFAULT; 587 + goto forward_to_l1; 588 + } 589 + } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { 590 + /* Can we execute? */ 591 + if (!gpte_p->may_execute) { 592 + flags |= SRR1_ISI_N_OR_G; 593 + goto forward_to_l1; 594 + } 595 + } else { 596 + /* Can we read? 
*/ 597 + if (!gpte_p->may_read && !gpte_p->may_write) { 598 + flags |= DSISR_PROTFAULT; 599 + goto forward_to_l1; 600 + } 601 + } 602 + } 603 + 604 + return 0; 605 + 606 + forward_to_l1: 607 + vcpu->arch.fault_dsisr = flags; 608 + if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { 609 + vcpu->arch.shregs.msr &= ~0x783f0000ul; 610 + vcpu->arch.shregs.msr |= flags; 611 + } 549 612 return RESUME_HOST; 613 + } 614 + 615 + static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, 616 + struct kvm_nested_guest *gp, 617 + unsigned long n_gpa, 618 + struct kvmppc_pte gpte, 619 + unsigned long dsisr) 620 + { 621 + struct kvm *kvm = vcpu->kvm; 622 + bool writing = !!(dsisr & DSISR_ISSTORE); 623 + u64 pgflags; 624 + bool ret; 625 + 626 + /* Are the rc bits set in the L1 partition scoped pte? */ 627 + pgflags = _PAGE_ACCESSED; 628 + if (writing) 629 + pgflags |= _PAGE_DIRTY; 630 + if (pgflags & ~gpte.rc) 631 + return RESUME_HOST; 632 + 633 + spin_lock(&kvm->mmu_lock); 634 + /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ 635 + ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, 636 + gpte.raddr, kvm->arch.lpid); 637 + spin_unlock(&kvm->mmu_lock); 638 + if (!ret) 639 + return -EINVAL; 640 + 641 + /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ 642 + ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, 643 + gp->shadow_lpid); 644 + if (!ret) 645 + return -EINVAL; 646 + return 0; 647 + } 648 + 649 + static inline int kvmppc_radix_level_to_shift(int level) 650 + { 651 + switch (level) { 652 + case 2: 653 + return PUD_SHIFT; 654 + case 1: 655 + return PMD_SHIFT; 656 + default: 657 + return PAGE_SHIFT; 658 + } 659 + } 660 + 661 + static inline int kvmppc_radix_shift_to_level(int shift) 662 + { 663 + if (shift == PUD_SHIFT) 664 + return 2; 665 + if (shift == PMD_SHIFT) 666 + return 1; 667 + if (shift == PAGE_SHIFT) 668 + return 0; 669 + WARN_ON_ONCE(1); 670 + return 0; 671 + } 672 + 673 + /* called 
with gp->tlb_lock held */ 674 + static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, 675 + struct kvm_nested_guest *gp) 676 + { 677 + struct kvm *kvm = vcpu->kvm; 678 + struct kvm_memory_slot *memslot; 679 + struct kvmppc_pte gpte; 680 + pte_t pte, *pte_p; 681 + unsigned long mmu_seq; 682 + unsigned long dsisr = vcpu->arch.fault_dsisr; 683 + unsigned long ea = vcpu->arch.fault_dar; 684 + unsigned long n_gpa, gpa, gfn, perm = 0UL; 685 + unsigned int shift, l1_shift, level; 686 + bool writing = !!(dsisr & DSISR_ISSTORE); 687 + bool kvm_ro = false; 688 + long int ret; 689 + 690 + if (!gp->l1_gr_to_hr) { 691 + kvmhv_update_ptbl_cache(gp); 692 + if (!gp->l1_gr_to_hr) 693 + return RESUME_HOST; 694 + } 695 + 696 + /* Convert the nested guest real address into a L1 guest real address */ 697 + 698 + n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL; 699 + if (!(dsisr & DSISR_PRTABLE_FAULT)) 700 + n_gpa |= ea & 0xFFF; 701 + ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte); 702 + 703 + /* 704 + * If the hardware found a translation but we don't now have a usable 705 + * translation in the l1 partition-scoped tree, remove the shadow pte 706 + * and let the guest retry. 707 + */ 708 + if (ret == RESUME_HOST && 709 + (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G | 710 + DSISR_BAD_COPYPASTE))) 711 + goto inval; 712 + if (ret) 713 + return ret; 714 + 715 + /* Failed to set the reference/change bits */ 716 + if (dsisr & DSISR_SET_RC) { 717 + ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr); 718 + if (ret == RESUME_HOST) 719 + return ret; 720 + if (ret) 721 + goto inval; 722 + dsisr &= ~DSISR_SET_RC; 723 + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | 724 + DSISR_PROTFAULT))) 725 + return RESUME_GUEST; 726 + } 727 + 728 + /* 729 + * We took an HISI or HDSI while we were running a nested guest which 730 + * means we have no partition scoped translation for that. 
This means 731 + * we need to insert a pte for the mapping into our shadow_pgtable. 732 + */ 733 + 734 + l1_shift = gpte.page_shift; 735 + if (l1_shift < PAGE_SHIFT) { 736 + /* We don't support l1 using a page size smaller than our own */ 737 + pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n", 738 + l1_shift, PAGE_SHIFT); 739 + return -EINVAL; 740 + } 741 + gpa = gpte.raddr; 742 + gfn = gpa >> PAGE_SHIFT; 743 + 744 + /* 1. Get the corresponding host memslot */ 745 + 746 + memslot = gfn_to_memslot(kvm, gfn); 747 + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { 748 + if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) { 749 + /* unusual error -> reflect to the guest as a DSI */ 750 + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); 751 + return RESUME_GUEST; 752 + } 753 + /* passthrough of emulated MMIO case... */ 754 + pr_err("emulated MMIO passthrough?\n"); 755 + return -EINVAL; 756 + } 757 + if (memslot->flags & KVM_MEM_READONLY) { 758 + if (writing) { 759 + /* Give the guest a DSI */ 760 + kvmppc_core_queue_data_storage(vcpu, ea, 761 + DSISR_ISSTORE | DSISR_PROTFAULT); 762 + return RESUME_GUEST; 763 + } 764 + kvm_ro = true; 765 + } 766 + 767 + /* 2. 
Find the host pte for this L1 guest real address */ 768 + 769 + /* Used to check for invalidations in progress */ 770 + mmu_seq = kvm->mmu_notifier_seq; 771 + smp_rmb(); 772 + 773 + /* See if can find translation in our partition scoped tables for L1 */ 774 + pte = __pte(0); 775 + spin_lock(&kvm->mmu_lock); 776 + pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); 777 + if (!shift) 778 + shift = PAGE_SHIFT; 779 + if (pte_p) 780 + pte = *pte_p; 781 + spin_unlock(&kvm->mmu_lock); 782 + 783 + if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) { 784 + /* No suitable pte found -> try to insert a mapping */ 785 + ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, 786 + writing, kvm_ro, &pte, &level); 787 + if (ret == -EAGAIN) 788 + return RESUME_GUEST; 789 + else if (ret) 790 + return ret; 791 + shift = kvmppc_radix_level_to_shift(level); 792 + } 793 + 794 + /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ 795 + 796 + /* The permissions is the combination of the host and l1 guest ptes */ 797 + perm |= gpte.may_read ? 0UL : _PAGE_READ; 798 + perm |= gpte.may_write ? 0UL : _PAGE_WRITE; 799 + perm |= gpte.may_execute ? 0UL : _PAGE_EXEC; 800 + pte = __pte(pte_val(pte) & ~perm); 801 + 802 + /* What size pte can we insert? */ 803 + if (shift > l1_shift) { 804 + u64 mask; 805 + unsigned int actual_shift = PAGE_SHIFT; 806 + if (PMD_SHIFT < l1_shift) 807 + actual_shift = PMD_SHIFT; 808 + mask = (1UL << shift) - (1UL << actual_shift); 809 + pte = __pte(pte_val(pte) | (gpa & mask)); 810 + shift = actual_shift; 811 + } 812 + level = kvmppc_radix_shift_to_level(shift); 813 + n_gpa &= ~((1UL << shift) - 1); 814 + 815 + /* 4. 
Insert the pte into our shadow_pgtable */ 816 + 817 + ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level, 818 + mmu_seq, gp->shadow_lpid); 819 + if (ret == -EAGAIN) 820 + ret = RESUME_GUEST; /* Let the guest try again */ 821 + 822 + return ret; 823 + 824 + inval: 825 + kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL); 826 + return RESUME_GUEST; 827 + } 828 + 829 + long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) 830 + { 831 + struct kvm_nested_guest *gp = vcpu->arch.nested; 832 + long int ret; 833 + 834 + mutex_lock(&gp->tlb_lock); 835 + ret = __kvmhv_nested_page_fault(vcpu, gp); 836 + mutex_unlock(&gp->tlb_lock); 837 + return ret; 550 838 }
+9
arch/powerpc/mm/tlb-radix.c
··· 833 833 /* 834 834 * Flush partition scoped translations from LPID (=LPIDR) 835 835 */ 836 + void radix__flush_tlb_lpid(unsigned int lpid) 837 + { 838 + _tlbie_lpid(lpid, RIC_FLUSH_ALL); 839 + } 840 + EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); 841 + 842 + /* 843 + * Flush partition scoped translations from LPID (=LPIDR) 844 + */ 836 845 void radix__local_flush_tlb_lpid(unsigned int lpid) 837 846 { 838 847 _tlbiel_lpid(lpid, RIC_FLUSH_ALL);