Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-arm-for-3.13-2' of git://git.linaro.org/people/cdall/linux-kvm-arm into kvm-queue

Updates for KVM/ARM, take 2, including:
- Transparent Huge Pages and hugetlbfs support for KVM/ARM
- Yield CPU when guest executes WFE to speed up CPU overcommit

+230 -52
+4 -1
arch/arm/include/asm/kvm_arm.h
··· 57 57 * TSC: Trap SMC 58 58 * TSW: Trap cache operations by set/way 59 59 * TWI: Trap WFI 60 + * TWE: Trap WFE 60 61 * TIDCP: Trap L2CTLR/L2ECTLR 61 62 * BSU_IS: Upgrade barriers to the inner shareable domain 62 63 * FB: Force broadcast of all maintainance operations ··· 68 67 */ 69 68 #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ 70 69 HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ 71 - HCR_SWIO | HCR_TIDCP) 70 + HCR_TWE | HCR_SWIO | HCR_TIDCP) 72 71 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) 73 72 74 73 /* System Control Register (SCTLR) bits */ ··· 208 207 #define HSR_EC_IABT_HYP (0x21) 209 208 #define HSR_EC_DABT (0x24) 210 209 #define HSR_EC_DABT_HYP (0x25) 210 + 211 + #define HSR_WFI_IS_WFE (1U << 0) 211 212 212 213 #define HSR_HVC_IMM_MASK ((1UL << 16) - 1) 213 214
+14 -3
arch/arm/include/asm/kvm_mmu.h
··· 62 62 int kvm_mmu_init(void); 63 63 void kvm_clear_hyp_idmap(void); 64 64 65 + static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) 66 + { 67 + *pmd = new_pmd; 68 + flush_pmd_entry(pmd); 69 + } 70 + 65 71 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) 66 72 { 67 73 *pte = new_pte; ··· 109 103 pte_val(*pte) |= L_PTE_S2_RDWR; 110 104 } 111 105 106 + static inline void kvm_set_s2pmd_writable(pmd_t *pmd) 107 + { 108 + pmd_val(*pmd) |= L_PMD_S2_RDWR; 109 + } 110 + 112 111 struct kvm; 113 112 114 - static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) 113 + static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, 114 + unsigned long size) 115 115 { 116 116 /* 117 117 * If we are going to insert an instruction page and the icache is ··· 132 120 * need any kind of flushing (DDI 0406C.b - Page B3-1392). 133 121 */ 134 122 if (icache_is_pipt()) { 135 - unsigned long hva = gfn_to_hva(kvm, gfn); 136 - __cpuc_coherent_user_range(hva, hva + PAGE_SIZE); 123 + __cpuc_coherent_user_range(hva, hva + size); 137 124 } else if (!icache_is_vivt_asid_tagged()) { 138 125 /* any kind of VIPT cache */ 139 126 __flush_icache_all();
+2
arch/arm/include/asm/pgtable-3level.h
··· 126 126 #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ 127 127 #define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ 128 128 129 + #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ 130 + 129 131 /* 130 132 * Hyp-mode PL2 PTE definitions for LPAE. 131 133 */
+1
arch/arm/kvm/Kconfig
··· 20 20 bool "Kernel-based Virtual Machine (KVM) support" 21 21 select PREEMPT_NOTIFIERS 22 22 select ANON_INODES 23 + select HAVE_KVM_CPU_RELAX_INTERCEPT 23 24 select KVM_MMIO 24 25 select KVM_ARM_HOST 25 26 depends on ARM_VIRT_EXT && ARM_LPAE
+13 -7
arch/arm/kvm/handle_exit.c
··· 73 73 } 74 74 75 75 /** 76 - * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest 76 + * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests 77 77 * @vcpu: the vcpu pointer 78 78 * @run: the kvm_run structure pointer 79 79 * 80 - * Simply sets the wait_for_interrupts flag on the vcpu structure, which will 81 - * halt execution of world-switches and schedule other host processes until 82 - * there is an incoming IRQ or FIQ to the VM. 80 + * WFE: Yield the CPU and come back to this vcpu when the scheduler 81 + * decides to. 82 + * WFI: Simply call kvm_vcpu_block(), which will halt execution of 83 + * world-switches and schedule other host processes until there is an 84 + * incoming IRQ or FIQ to the VM. 83 85 */ 84 - static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) 86 + static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) 85 87 { 86 88 trace_kvm_wfi(*vcpu_pc(vcpu)); 87 - kvm_vcpu_block(vcpu); 89 + if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) 90 + kvm_vcpu_on_spin(vcpu); 91 + else 92 + kvm_vcpu_block(vcpu); 93 + 88 94 return 1; 89 95 } 90 96 91 97 static exit_handle_fn arm_exit_handlers[] = { 92 - [HSR_EC_WFI] = kvm_handle_wfi, 98 + [HSR_EC_WFI] = kvm_handle_wfx, 93 99 [HSR_EC_CP15_32] = kvm_handle_cp15_32, 94 100 [HSR_EC_CP15_64] = kvm_handle_cp15_64, 95 101 [HSR_EC_CP14_MR] = kvm_handle_cp14_access,
+185 -38
arch/arm/kvm/mmu.c
··· 19 19 #include <linux/mman.h> 20 20 #include <linux/kvm_host.h> 21 21 #include <linux/io.h> 22 + #include <linux/hugetlb.h> 22 23 #include <trace/events/kvm.h> 23 24 #include <asm/pgalloc.h> 24 25 #include <asm/cacheflush.h> ··· 41 40 static unsigned long hyp_idmap_start; 42 41 static unsigned long hyp_idmap_end; 43 42 static phys_addr_t hyp_idmap_vector; 43 + 44 + #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) 44 45 45 46 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 46 47 { ··· 96 93 97 94 static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) 98 95 { 99 - pmd_t *pmd_table = pmd_offset(pud, 0); 100 - pud_clear(pud); 101 - kvm_tlb_flush_vmid_ipa(kvm, addr); 102 - pmd_free(NULL, pmd_table); 96 + if (pud_huge(*pud)) { 97 + pud_clear(pud); 98 + kvm_tlb_flush_vmid_ipa(kvm, addr); 99 + } else { 100 + pmd_t *pmd_table = pmd_offset(pud, 0); 101 + pud_clear(pud); 102 + kvm_tlb_flush_vmid_ipa(kvm, addr); 103 + pmd_free(NULL, pmd_table); 104 + } 103 105 put_page(virt_to_page(pud)); 104 106 } 105 107 106 108 static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) 107 109 { 108 - pte_t *pte_table = pte_offset_kernel(pmd, 0); 109 - pmd_clear(pmd); 110 - kvm_tlb_flush_vmid_ipa(kvm, addr); 111 - pte_free_kernel(NULL, pte_table); 110 + if (kvm_pmd_huge(*pmd)) { 111 + pmd_clear(pmd); 112 + kvm_tlb_flush_vmid_ipa(kvm, addr); 113 + } else { 114 + pte_t *pte_table = pte_offset_kernel(pmd, 0); 115 + pmd_clear(pmd); 116 + kvm_tlb_flush_vmid_ipa(kvm, addr); 117 + pte_free_kernel(NULL, pte_table); 118 + } 112 119 put_page(virt_to_page(pmd)); 113 120 } 114 121 ··· 149 136 continue; 150 137 } 151 138 139 + if (pud_huge(*pud)) { 140 + /* 141 + * If we are dealing with a huge pud, just clear it and 142 + * move on. 
143 + */ 144 + clear_pud_entry(kvm, pud, addr); 145 + addr = pud_addr_end(addr, end); 146 + continue; 147 + } 148 + 152 149 pmd = pmd_offset(pud, addr); 153 150 if (pmd_none(*pmd)) { 154 151 addr = pmd_addr_end(addr, end); 155 152 continue; 156 153 } 157 154 158 - pte = pte_offset_kernel(pmd, addr); 159 - clear_pte_entry(kvm, pte, addr); 160 - next = addr + PAGE_SIZE; 155 + if (!kvm_pmd_huge(*pmd)) { 156 + pte = pte_offset_kernel(pmd, addr); 157 + clear_pte_entry(kvm, pte, addr); 158 + next = addr + PAGE_SIZE; 159 + } 161 160 162 - /* If we emptied the pte, walk back up the ladder */ 163 - if (page_empty(pte)) { 161 + /* 162 + * If the pmd entry is to be cleared, walk back up the ladder 163 + */ 164 + if (kvm_pmd_huge(*pmd) || page_empty(pte)) { 164 165 clear_pmd_entry(kvm, pmd, addr); 165 166 next = pmd_addr_end(addr, end); 166 167 if (page_empty(pmd) && !page_empty(pud)) { ··· 447 420 kvm->arch.pgd = NULL; 448 421 } 449 422 450 - 451 - static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 452 - phys_addr_t addr, const pte_t *new_pte, bool iomap) 423 + static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 424 + phys_addr_t addr) 453 425 { 454 426 pgd_t *pgd; 455 427 pud_t *pud; 456 428 pmd_t *pmd; 457 - pte_t *pte, old_pte; 458 429 459 - /* Create 2nd stage page table mapping - Level 1 */ 460 430 pgd = kvm->arch.pgd + pgd_index(addr); 461 431 pud = pud_offset(pgd, addr); 462 432 if (pud_none(*pud)) { 463 433 if (!cache) 464 - return 0; /* ignore calls from kvm_set_spte_hva */ 434 + return NULL; 465 435 pmd = mmu_memory_cache_alloc(cache); 466 436 pud_populate(NULL, pud, pmd); 467 437 get_page(virt_to_page(pud)); 468 438 } 469 439 470 - pmd = pmd_offset(pud, addr); 440 + return pmd_offset(pud, addr); 441 + } 471 442 472 - /* Create 2nd stage page table mapping - Level 2 */ 443 + static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache 444 + *cache, phys_addr_t addr, const pmd_t *new_pmd) 445 + 
{ 446 + pmd_t *pmd, old_pmd; 447 + 448 + pmd = stage2_get_pmd(kvm, cache, addr); 449 + VM_BUG_ON(!pmd); 450 + 451 + /* 452 + * Mapping in huge pages should only happen through a fault. If a 453 + * page is merged into a transparent huge page, the individual 454 + * subpages of that huge page should be unmapped through MMU 455 + * notifiers before we get here. 456 + * 457 + * Merging of CompoundPages is not supported; they should become 458 + * splitting first, unmapped, merged, and mapped back in on-demand. 459 + */ 460 + VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); 461 + 462 + old_pmd = *pmd; 463 + kvm_set_pmd(pmd, *new_pmd); 464 + if (pmd_present(old_pmd)) 465 + kvm_tlb_flush_vmid_ipa(kvm, addr); 466 + else 467 + get_page(virt_to_page(pmd)); 468 + return 0; 469 + } 470 + 471 + static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, 472 + phys_addr_t addr, const pte_t *new_pte, bool iomap) 473 + { 474 + pmd_t *pmd; 475 + pte_t *pte, old_pte; 476 + 477 + /* Create stage-2 page table mapping - Level 1 */ 478 + pmd = stage2_get_pmd(kvm, cache, addr); 479 + if (!pmd) { 480 + /* 481 + * Ignore calls from kvm_set_spte_hva for unallocated 482 + * address ranges. 483 + */ 484 + return 0; 485 + } 486 + 487 + /* Create stage-2 page mappings - Level 2 */ 473 488 if (pmd_none(*pmd)) { 474 489 if (!cache) 475 490 return 0; /* ignore calls from kvm_set_spte_hva */ ··· 576 507 return ret; 577 508 } 578 509 510 + static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) 511 + { 512 + pfn_t pfn = *pfnp; 513 + gfn_t gfn = *ipap >> PAGE_SHIFT; 514 + 515 + if (PageTransCompound(pfn_to_page(pfn))) { 516 + unsigned long mask; 517 + /* 518 + * The address we faulted on is backed by a transparent huge 519 + * page. However, because we map the compound huge page and 520 + * not the individual tail page, we need to transfer the 521 + * refcount to the head page. 
We have to be careful that the 522 + * THP doesn't start to split while we are adjusting the 523 + * refcounts. 524 + * 525 + * We are sure this doesn't happen, because mmu_notifier_retry 526 + * was successful and we are holding the mmu_lock, so if this 527 + * THP is trying to split, it will be blocked in the mmu 528 + * notifier before touching any of the pages, specifically 529 + * before being able to call __split_huge_page_refcount(). 530 + * 531 + * We can therefore safely transfer the refcount from PG_tail 532 + * to PG_head and switch the pfn from a tail page to the head 533 + * page accordingly. 534 + */ 535 + mask = PTRS_PER_PMD - 1; 536 + VM_BUG_ON((gfn & mask) != (pfn & mask)); 537 + if (pfn & mask) { 538 + *ipap &= PMD_MASK; 539 + kvm_release_pfn_clean(pfn); 540 + pfn &= ~mask; 541 + kvm_get_pfn(pfn); 542 + *pfnp = pfn; 543 + } 544 + 545 + return true; 546 + } 547 + 548 + return false; 549 + } 550 + 579 551 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 580 - gfn_t gfn, struct kvm_memory_slot *memslot, 552 + struct kvm_memory_slot *memslot, 581 553 unsigned long fault_status) 582 554 { 583 - pte_t new_pte; 584 - pfn_t pfn; 585 555 int ret; 586 - bool write_fault, writable; 556 + bool write_fault, writable, hugetlb = false, force_pte = false; 587 557 unsigned long mmu_seq; 558 + gfn_t gfn = fault_ipa >> PAGE_SHIFT; 559 + unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); 560 + struct kvm *kvm = vcpu->kvm; 588 561 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 562 + struct vm_area_struct *vma; 563 + pfn_t pfn; 589 564 590 565 write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); 591 566 if (fault_status == FSC_PERM && !write_fault) { 592 567 kvm_err("Unexpected L2 read permission error\n"); 593 568 return -EFAULT; 594 569 } 570 + 571 + /* Let's check if we will get back a huge page backed by hugetlbfs */ 572 + down_read(&current->mm->mmap_sem); 573 + vma = find_vma_intersection(current->mm, hva, hva + 1); 
574 + if (is_vm_hugetlb_page(vma)) { 575 + hugetlb = true; 576 + gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; 577 + } else { 578 + /* 579 + * Pages belonging to VMAs not aligned to the PMD mapping 580 + * granularity cannot be mapped using block descriptors even 581 + * if the pages belong to a THP for the process, because the 582 + * stage-2 block descriptor will cover more than a single THP 583 + * and we loose atomicity for unmapping, updates, and splits 584 + * of the THP or other pages in the stage-2 block range. 585 + */ 586 + if (vma->vm_start & ~PMD_MASK) 587 + force_pte = true; 588 + } 589 + up_read(&current->mm->mmap_sem); 595 590 596 591 /* We need minimum second+third level pages */ 597 592 ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); ··· 674 541 */ 675 542 smp_rmb(); 676 543 677 - pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); 544 + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); 678 545 if (is_error_pfn(pfn)) 679 546 return -EFAULT; 680 547 681 - new_pte = pfn_pte(pfn, PAGE_S2); 682 - coherent_icache_guest_page(vcpu->kvm, gfn); 683 - 684 - spin_lock(&vcpu->kvm->mmu_lock); 685 - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 548 + spin_lock(&kvm->mmu_lock); 549 + if (mmu_notifier_retry(kvm, mmu_seq)) 686 550 goto out_unlock; 687 - if (writable) { 688 - kvm_set_s2pte_writable(&new_pte); 689 - kvm_set_pfn_dirty(pfn); 551 + if (!hugetlb && !force_pte) 552 + hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); 553 + 554 + if (hugetlb) { 555 + pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); 556 + new_pmd = pmd_mkhuge(new_pmd); 557 + if (writable) { 558 + kvm_set_s2pmd_writable(&new_pmd); 559 + kvm_set_pfn_dirty(pfn); 560 + } 561 + coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE); 562 + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 563 + } else { 564 + pte_t new_pte = pfn_pte(pfn, PAGE_S2); 565 + if (writable) { 566 + kvm_set_s2pte_writable(&new_pte); 567 + kvm_set_pfn_dirty(pfn); 568 + } 569 + 
coherent_icache_guest_page(kvm, hva, PAGE_SIZE); 570 + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); 690 571 } 691 - stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false); 572 + 692 573 693 574 out_unlock: 694 - spin_unlock(&vcpu->kvm->mmu_lock); 575 + spin_unlock(&kvm->mmu_lock); 695 576 kvm_release_pfn_clean(pfn); 696 - return 0; 577 + return ret; 697 578 } 698 579 699 580 /** ··· 776 629 777 630 memslot = gfn_to_memslot(vcpu->kvm, gfn); 778 631 779 - ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status); 632 + ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); 780 633 if (ret == 0) 781 634 ret = 1; 782 635 out_unlock:
+9 -3
arch/arm64/include/asm/kvm_mmu.h
··· 91 91 void kvm_clear_hyp_idmap(void); 92 92 93 93 #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) 94 + #define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) 94 95 95 96 static inline bool kvm_is_write_fault(unsigned long esr) 96 97 { ··· 117 116 pte_val(*pte) |= PTE_S2_RDWR; 118 117 } 119 118 119 + static inline void kvm_set_s2pmd_writable(pmd_t *pmd) 120 + { 121 + pmd_val(*pmd) |= PMD_S2_RDWR; 122 + } 123 + 120 124 struct kvm; 121 125 122 - static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) 126 + static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, 127 + unsigned long size) 123 128 { 124 129 if (!icache_is_aliasing()) { /* PIPT */ 125 - unsigned long hva = gfn_to_hva(kvm, gfn); 126 - flush_icache_range(hva, hva + PAGE_SIZE); 130 + flush_icache_range(hva, hva + size); 127 131 } else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */ 128 132 /* any kind of VIPT cache */ 129 133 __flush_icache_all();
+2
arch/arm64/include/asm/pgtable-hwdef.h
··· 85 85 #define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ 86 86 #define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ 87 87 88 + #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ 89 + 88 90 /* 89 91 * Memory Attribute override for Stage-2 (MemAttr[3:0]) 90 92 */