Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v3.13 · 1003 lines · 26 kB
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>

#include "trace.h"

extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static void *init_bounce_page;
static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	/*
	 * This function also gets called when dealing with HYP page
	 * tables. As HYP doesn't have an associated struct kvm (and
	 * the HYP page tables are fairly static), we don't do
	 * anything there.
	 */
	if (kvm)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  int min, int max)
{
	void *page;

	BUG_ON(max > KVM_NR_MEM_OBJS);
	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < max) {
		page = (void *)__get_free_page(PGALLOC_GFP);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc || !mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static bool page_empty(void *ptr)
{
	struct page *ptr_page = virt_to_page(ptr);
	return page_count(ptr_page) == 1;
}

static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
	if (pud_huge(*pud)) {
		pud_clear(pud);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		pmd_t *pmd_table = pmd_offset(pud, 0);
		pud_clear(pud);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
		pmd_free(NULL, pmd_table);
	}
	put_page(virt_to_page(pud));
}

static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
	if (kvm_pmd_huge(*pmd)) {
		pmd_clear(pmd);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		pte_t *pte_table = pte_offset_kernel(pmd, 0);
		pmd_clear(pmd);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
		pte_free_kernel(NULL, pte_table);
	}
	put_page(virt_to_page(pmd));
}

static void clear_pte_entry(struct kvm *kvm, pte_t *pte, phys_addr_t addr)
{
	if (pte_present(*pte)) {
		kvm_set_pte(pte, __pte(0));
		put_page(virt_to_page(pte));
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	}
}

static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
			unsigned long long start, u64 size)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long long addr = start, end = start + size;
	u64 next;

	while (addr < end) {
		pgd = pgdp + pgd_index(addr);
		pud = pud_offset(pgd, addr);
		if (pud_none(*pud)) {
			addr = pud_addr_end(addr, end);
			continue;
		}

		if (pud_huge(*pud)) {
			/*
			 * If we are dealing with a huge pud, just clear it and
			 * move on.
			 */
			clear_pud_entry(kvm, pud, addr);
			addr = pud_addr_end(addr, end);
			continue;
		}

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr = pmd_addr_end(addr, end);
			continue;
		}

		if (!kvm_pmd_huge(*pmd)) {
			pte = pte_offset_kernel(pmd, addr);
			clear_pte_entry(kvm, pte, addr);
			next = addr + PAGE_SIZE;
		}

		/*
		 * If the pmd entry is to be cleared, walk back up the ladder
		 */
		if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
			clear_pmd_entry(kvm, pmd, addr);
			next = pmd_addr_end(addr, end);
			if (page_empty(pmd) && !page_empty(pud)) {
				clear_pud_entry(kvm, pud, addr);
				next = pud_addr_end(addr, end);
			}
		}

		addr = next;
	}
}

/**
 * free_boot_hyp_pgd - free HYP boot page tables
 *
 * Free the HYP boot page tables. The bounce page is also freed.
 */
void free_boot_hyp_pgd(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);

	if (boot_hyp_pgd) {
		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
		kfree(boot_hyp_pgd);
		boot_hyp_pgd = NULL;
	}

	if (hyp_pgd)
		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);

	kfree(init_bounce_page);
	init_bounce_page = NULL;

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 * therefore contains either mappings in the kernel memory area (above
 * PAGE_OFFSET), or device mappings in the vmalloc range (from
 * VMALLOC_START to VMALLOC_END).
 *
 * boot_hyp_pgd should only map two pages for the init code.
 */
void free_hyp_pgds(void)
{
	unsigned long addr;

	free_boot_hyp_pgd();

	mutex_lock(&kvm_hyp_pgd_mutex);

	if (hyp_pgd) {
		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);

		kfree(hyp_pgd);
		hyp_pgd = NULL;
	}

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end, unsigned long pfn,
				    pgprot_t prot)
{
	pte_t *pte;
	unsigned long addr;

	addr = start;
	do {
		pte = pte_offset_kernel(pmd, addr);
		kvm_set_pte(pte, pfn_pte(pfn, prot));
		get_page(virt_to_page(pte));
		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
}

static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	addr = start;
	do {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL, addr);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			pmd_populate_kernel(NULL, pmd, pte);
			get_page(virt_to_page(pmd));
			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
		}

		next = pmd_addr_end(addr, end);

		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int __create_hyp_mappings(pgd_t *pgdp,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + pgd_index(addr);
		pud = pud_offset(pgd, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				err = -ENOMEM;
				goto out;
			}
			pud_populate(NULL, pud, pmd);
			get_page(virt_to_page(pud));
			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, virt_addr,
					    virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    PAGE_HYP);
		if (err)
			return err;
	}

	return 0;
}

/**
 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
 * @from:	The kernel start VA of the range
 * @to:		The kernel end VA of the range (exclusive)
 * @phys_addr:	The physical start address which gets mapped
 *
 * The resulting HYP VA is the same as the kernel VA, modulo
 * HYP_PAGE_OFFSET.
 */
int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
{
	unsigned long start = KERN_TO_HYP((unsigned long)from);
	unsigned long end = KERN_TO_HYP((unsigned long)to);

	/* Check for a valid kernel IO mapping */
	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
		return -EINVAL;

	return __create_hyp_mappings(hyp_pgd, start, end,
				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
}

/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can
 * support either full 40-bit input addresses or limited to 32-bit input
 * addresses). Clears the allocated pages.
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	pgd_t *pgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
	if (!pgd)
		return -ENOMEM;

	memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
	kvm_clean_pgd(pgd);
	kvm->arch.pgd = pgd;

	return 0;
}

/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @kvm:   The VM pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
	unmap_range(kvm, kvm->arch.pgd, start, size);
}

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 * and setting the struct pointer to NULL.
 *
 * Note we don't need locking here as this is only called when the VM is
 * destroyed, which can only be done once.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	if (kvm->arch.pgd == NULL)
		return;

	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
	free_pages((unsigned long)kvm->arch.pgd, S2_PGD_ORDER);
	kvm->arch.pgd = NULL;
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = kvm->arch.pgd + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (pud_none(*pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		pud_populate(NULL, pud, pmd);
		get_page(virt_to_page(pud));
	}

	return pmd_offset(pud, addr);
}

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	/*
	 * Mapping in huge pages should only happen through a fault. If a
	 * page is merged into a transparent huge page, the individual
	 * subpages of that huge page should be unmapped through MMU
	 * notifiers before we get here.
	 *
	 * Merging of CompoundPages is not supported; they should be split
	 * first, unmapped, merged, and mapped back in on demand.
	 */
	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));

	old_pmd = *pmd;
	kvm_set_pmd(pmd, *new_pmd);
	if (pmd_present(old_pmd))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pmd));
	return 0;
}

static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte, bool iomap)
{
	pmd_t *pmd;
	pte_t *pte, old_pte;

	/* Create stage-2 page table mapping - Level 1 */
	pmd = stage2_get_pmd(kvm, cache, addr);
	if (!pmd) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = mmu_memory_cache_alloc(cache);
		kvm_clean_pte(pte);
		pmd_populate_kernel(NULL, pmd, pte);
		get_page(virt_to_page(pmd));
	}

	pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	kvm_set_pte(pte, *new_pte);
	if (pte_present(old_pte))
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	else
		get_page(virt_to_page(pte));

	return 0;
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);

		ret = mmu_topup_memory_cache(&cache, 2, 2);
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	mmu_free_memory_cache(&cache);
	return ret;
}

static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
{
	pfn_t pfn = *pfnp;
	gfn_t gfn = *ipap >> PAGE_SHIFT;

	if (PageTransCompound(pfn_to_page(pfn))) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page. However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page. We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*ipap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, hugetlb = false, force_pte = false;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	pfn_t pfn;

	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
	if (fault_status == FSC_PERM && !write_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (is_vm_hugetlb_page(vma)) {
		hugetlb = true;
		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
	} else {
		/*
		 * Pages belonging to VMAs not aligned to the PMD mapping
		 * granularity cannot be mapped using block descriptors even
		 * if the pages belong to a THP for the process, because the
		 * stage-2 block descriptor will cover more than a single THP
		 * and we lose atomicity for unmapping, updates, and splits
		 * of the THP or other pages in the stage-2 block range.
		 */
		if (vma->vm_start & ~PMD_MASK)
			force_pte = true;
	}
	up_read(&current->mm->mmap_sem);

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to getting unmapped before we have
	 * a chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (is_error_pfn(pfn))
		return -EFAULT;

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;
	if (!hugetlb && !force_pte)
		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);

	if (hugetlb) {
		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
		new_pmd = pmd_mkhuge(new_pmd);
		if (writable) {
			kvm_set_s2pmd_writable(&new_pmd);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
	} else {
		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
		if (writable) {
			kvm_set_s2pte_writable(&new_pte);
			kvm_set_pfn_dirty(pfn);
		}
		coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return ret;
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 * @run:	the kvm_run structure
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that the
 * guest simply needs more memory and we must allocate an appropriate page, or
 * that the guest tried to access I/O memory, which is emulated by user space.
 * The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	bool is_iabt;
	gfn_t gfn;
	int ret, idx;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	fault_status = kvm_vcpu_trap_get_fault(vcpu);
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
		kvm_err("Unsupported fault status: EC=%#x DFCS=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu), fault_status);
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
		if (is_iabt) {
			/* Prefetch Abort on I/O address */
			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

		if (fault_status != FSC_FAULT) {
			kvm_err("Unsupported fault status on io memory: %#lx\n",
				fault_status);
			ret = -EFAULT;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, run, fault_ipa);
		goto out_unlock;
	}

	memslot = gfn_to_memslot(vcpu->kvm, gfn);

	ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
	if (ret == 0)
		ret = 1;
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

static void handle_hva_to_gpa(struct kvm *kvm,
			      unsigned long start,
			      unsigned long end,
			      void (*handler)(struct kvm *kvm,
					      gpa_t gpa, void *data),
			      void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gpa_t gpa = gfn << PAGE_SHIFT;
			handler(kvm, gpa, data);
		}
	}
}

static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	unmap_stage2_range(kvm, gpa, PAGE_SIZE);
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	unsigned long end = hva + PAGE_SIZE;

	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva(hva);
	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pte_t *pte = (pte_t *)data;

	stage2_set_pte(kvm, NULL, gpa, pte, false);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return;

	trace_kvm_set_spte_hva(hva);
	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
	return virt_to_phys(hyp_pgd);
}

phys_addr_t kvm_mmu_get_boot_httbr(void)
{
	return virt_to_phys(boot_hyp_pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

int kvm_mmu_init(void)
{
	int err;

	hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start);
	hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end);
	hyp_idmap_vector = virt_to_phys(__kvm_hyp_init);

	if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
		/*
		 * Our init code is crossing a page boundary. Allocate
		 * a bounce page, copy the code over and use that.
		 */
		size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
		phys_addr_t phys_base;

		init_bounce_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!init_bounce_page) {
			kvm_err("Couldn't allocate HYP init bounce page\n");
			err = -ENOMEM;
			goto out;
		}

		memcpy(init_bounce_page, __hyp_idmap_text_start, len);
		/*
		 * Warning: the code we just copied to the bounce page
		 * must be flushed to the point of coherency.
		 * Otherwise, the data may be sitting in L2, and HYP
		 * mode won't be able to observe it as it runs with
		 * caches off at that point.
		 */
		kvm_flush_dcache_to_poc(init_bounce_page, len);

		phys_base = virt_to_phys(init_bounce_page);
		hyp_idmap_vector += phys_base - hyp_idmap_start;
		hyp_idmap_start = phys_base;
		hyp_idmap_end = phys_base + len;

		kvm_info("Using HYP init bounce page @%lx\n",
			 (unsigned long)phys_base);
	}

	hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
	boot_hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
	if (!hyp_pgd || !boot_hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	/* Create the idmap in the boot page tables */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    hyp_idmap_start, hyp_idmap_end,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);

	if (err) {
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);
		goto out;
	}

	/* Map the very same page at the trampoline VA */
	err = __create_hyp_mappings(boot_hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	/* Map the same page again into the runtime page tables */
	err = __create_hyp_mappings(hyp_pgd,
				    TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP);
	if (err) {
		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
			TRAMPOLINE_VA);
		goto out;
	}

	return 0;
out:
	free_hyp_pgds();
	return err;
}
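
Editor's note (not part of the file above): the page-table bookkeeping in this code relies on a refcount convention. Each table page starts with a page_count() of 1 from its allocation; installing an entry takes an extra reference with get_page(), clearing one drops it with put_page(), so page_empty() treats a count of 1 as "no live entries" and unmap_range() can then free the table and walk back up a level. The stand-alone sketch below models only that invariant with a plain counter; struct table, table_link() and table_unlink() are hypothetical names used for illustration and do not exist in the kernel.

#include <assert.h>
#include <stdio.h>

/* Hypothetical stand-in for a page-table page: refcnt plays the role of
 * page_count(), starting at 1 when the table page is allocated. */
struct table {
	int refcnt;
	int entries;
};

/* Mirrors get_page(virt_to_page(table)) after an entry is installed. */
static void table_link(struct table *t)
{
	t->refcnt++;
	t->entries++;
}

/* Mirrors put_page(virt_to_page(table)) after an entry is cleared. */
static void table_unlink(struct table *t)
{
	t->refcnt--;
	t->entries--;
}

/* Mirrors page_empty(): a count back at 1 means no entries remain. */
static int table_empty(const struct table *t)
{
	return t->refcnt == 1;
}

int main(void)
{
	struct table pte_table = { .refcnt = 1, .entries = 0 };

	table_link(&pte_table);   /* like stage2_set_pte() installing a PTE */
	table_link(&pte_table);
	table_unlink(&pte_table); /* like clear_pte_entry() removing one */
	table_unlink(&pte_table);

	/* unmap_range() may now free the table and clear the level above */
	assert(table_empty(&pte_table));
	printf("table empty: %d\n", table_empty(&pte_table));
	return 0;
}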