Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S HV: Make the guest hash table size configurable

This adds a new ioctl to enable userspace to control the size of the guest
hashed page table (HPT) and to clear it out when resetting the guest.
The KVM_PPC_ALLOCATE_HTAB ioctl is a VM ioctl and takes as its parameter
a pointer to a u32 containing the desired order of the HPT (log base 2
of the size in bytes), which is updated on successful return to the
actual order of the HPT which was allocated.

There must be no vcpus running at the time of this ioctl. To enforce
this, we now keep a count of the number of vcpus running in
kvm->arch.vcpus_running.

If the ioctl is called when a HPT has already been allocated, we don't
reallocate the HPT but just clear it out. We first clear the
kvm->arch.rma_setup_done flag, which has two effects: (a) since we hold
the kvm->lock mutex, it will prevent any vcpus from starting to run until
we're done, and (b) it means that the first vcpu to run after we're done
will re-establish the VRMA if necessary.

If userspace doesn't call this ioctl before running the first vcpu, the
kernel will allocate a default-sized HPT at that point. We do it then
rather than when creating the VM, as the code did previously, so that
userspace has a chance to do the ioctl if it wants.

When allocating the HPT, we can allocate either from the kernel page
allocator, or from the preallocated pool. If userspace is asking for
a different size from the preallocated HPTs, we first try to allocate
using the kernel page allocator. Then we try to allocate from the
preallocated pool, and then if that fails, we try allocating decreasing
sizes from the kernel page allocator, down to the minimum size allowed
(256kB). Note that the kernel page allocator limits allocations to
1 << CONFIG_FORCE_MAX_ZONEORDER pages, which by default corresponds to
16MB (on 64-bit powerpc, at least).

Signed-off-by: Paul Mackerras <paulus@samba.org>
[agraf: fix module compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>

Authored by Paul Mackerras; committed by Alexander Graf.
Commit 32fad281 (parent 2e1ae9c0)

+200 -54
+36
Documentation/virtual/kvm/api.txt
··· 1930 1930 PTE's RPN field (ie, it needs to be shifted left by 12 to OR it 1931 1931 into the hash PTE second double word). 1932 1932 1933 + 1934 + 4.75 KVM_PPC_ALLOCATE_HTAB 1935 + 1936 + Capability: KVM_CAP_PPC_ALLOC_HTAB 1937 + Architectures: powerpc 1938 + Type: vm ioctl 1939 + Parameters: Pointer to u32 containing hash table order (in/out) 1940 + Returns: 0 on success, -1 on error 1941 + 1942 + This requests the host kernel to allocate an MMU hash table for a 1943 + guest using the PAPR paravirtualization interface. This only does 1944 + anything if the kernel is configured to use the Book 3S HV style of 1945 + virtualization. Otherwise the capability doesn't exist and the ioctl 1946 + returns an ENOTTY error. The rest of this description assumes Book 3S 1947 + HV. 1948 + 1949 + There must be no vcpus running when this ioctl is called; if there 1950 + are, it will do nothing and return an EBUSY error. 1951 + 1952 + The parameter is a pointer to a 32-bit unsigned integer variable 1953 + containing the order (log base 2) of the desired size of the hash 1954 + table, which must be between 18 and 46. On successful return from the 1955 + ioctl, it will have been updated with the order of the hash table that 1956 + was allocated. 1957 + 1958 + If no hash table has been allocated when any vcpu is asked to run 1959 + (with the KVM_RUN ioctl), the host kernel will allocate a 1960 + default-sized hash table (16 MB). 1961 + 1962 + If this ioctl is called when a hash table has already been allocated, 1963 + the kernel will clear out the existing hash table (zero all HPTEs) and 1964 + return the hash table order in the parameter. (If the guest is using 1965 + the virtualized real-mode area (VRMA) facility, the kernel will 1966 + re-create the VRMA HPTEs on the next KVM_RUN of any vcpu.) 1967 + 1968 + 1933 1969 5. The kvm_run structure 1934 1970 ------------------------ 1935 1971
+2 -5
arch/powerpc/include/asm/kvm_book3s_64.h
··· 36 36 #define SPAPR_TCE_SHIFT 12 37 37 38 38 #ifdef CONFIG_KVM_BOOK3S_64_HV 39 - /* For now use fixed-size 16MB page table */ 40 - #define HPT_ORDER 24 41 - #define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ 42 - #define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */ 43 - #define HPT_HASH_MASK (HPT_NPTEG - 1) 39 + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ 40 + extern int kvm_hpt_order; /* order of preallocated HPTs */ 44 41 #endif 45 42 46 43 #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
+4
arch/powerpc/include/asm/kvm_host.h
··· 237 237 unsigned long vrma_slb_v; 238 238 int rma_setup_done; 239 239 int using_mmu_notifiers; 240 + u32 hpt_order; 241 + atomic_t vcpus_running; 242 + unsigned long hpt_npte; 243 + unsigned long hpt_mask; 240 244 spinlock_t slot_phys_lock; 241 245 unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; 242 246 int slot_npages[KVM_MEM_SLOTS_NUM];
+2 -1
arch/powerpc/include/asm/kvm_ppc.h
··· 119 119 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); 120 120 extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); 121 121 122 - extern long kvmppc_alloc_hpt(struct kvm *kvm); 122 + extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); 123 + extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); 123 124 extern void kvmppc_free_hpt(struct kvm *kvm); 124 125 extern long kvmppc_prepare_vrma(struct kvm *kvm, 125 126 struct kvm_userspace_memory_region *mem);
+95 -28
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 37 37 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ 38 38 #define MAX_LPID_970 63 39 39 40 - long kvmppc_alloc_hpt(struct kvm *kvm) 40 + /* Power architecture requires HPT is at least 256kB */ 41 + #define PPC_MIN_HPT_ORDER 18 42 + 43 + long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) 41 44 { 42 45 unsigned long hpt; 43 - long lpid; 44 46 struct revmap_entry *rev; 45 47 struct kvmppc_linear_info *li; 48 + long order = kvm_hpt_order; 46 49 47 - /* Allocate guest's hashed page table */ 48 - li = kvm_alloc_hpt(); 49 - if (li) { 50 - /* using preallocated memory */ 51 - hpt = (ulong)li->base_virt; 52 - kvm->arch.hpt_li = li; 53 - } else { 54 - /* using dynamic memory */ 50 + if (htab_orderp) { 51 + order = *htab_orderp; 52 + if (order < PPC_MIN_HPT_ORDER) 53 + order = PPC_MIN_HPT_ORDER; 54 + } 55 + 56 + /* 57 + * If the user wants a different size from default, 58 + * try first to allocate it from the kernel page allocator. 59 + */ 60 + hpt = 0; 61 + if (order != kvm_hpt_order) { 55 62 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| 56 - __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); 63 + __GFP_NOWARN, order - PAGE_SHIFT); 64 + if (!hpt) 65 + --order; 57 66 } 58 67 68 + /* Next try to allocate from the preallocated pool */ 59 69 if (!hpt) { 60 - pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); 61 - return -ENOMEM; 70 + li = kvm_alloc_hpt(); 71 + if (li) { 72 + hpt = (ulong)li->base_virt; 73 + kvm->arch.hpt_li = li; 74 + order = kvm_hpt_order; 75 + } 62 76 } 77 + 78 + /* Lastly try successively smaller sizes from the page allocator */ 79 + while (!hpt && order > PPC_MIN_HPT_ORDER) { 80 + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| 81 + __GFP_NOWARN, order - PAGE_SHIFT); 82 + if (!hpt) 83 + --order; 84 + } 85 + 86 + if (!hpt) 87 + return -ENOMEM; 88 + 63 89 kvm->arch.hpt_virt = hpt; 90 + kvm->arch.hpt_order = order; 91 + /* HPTEs are 2**4 bytes long */ 92 + kvm->arch.hpt_npte = 1ul << (order - 4); 93 + /* 128 (2**7) bytes in each HPTEG 
*/ 94 + kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; 64 95 65 96 /* Allocate reverse map array */ 66 - rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); 97 + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); 67 98 if (!rev) { 68 99 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); 69 100 goto out_freehpt; 70 101 } 71 102 kvm->arch.revmap = rev; 103 + kvm->arch.sdr1 = __pa(hpt) | (order - 18); 72 104 73 - lpid = kvmppc_alloc_lpid(); 74 - if (lpid < 0) 75 - goto out_freeboth; 105 + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", 106 + hpt, order, kvm->arch.lpid); 76 107 77 - kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); 78 - kvm->arch.lpid = lpid; 79 - 80 - pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); 108 + if (htab_orderp) 109 + *htab_orderp = order; 81 110 return 0; 82 111 83 - out_freeboth: 84 - vfree(rev); 85 112 out_freehpt: 86 - free_pages(hpt, HPT_ORDER - PAGE_SHIFT); 113 + if (kvm->arch.hpt_li) 114 + kvm_release_hpt(kvm->arch.hpt_li); 115 + else 116 + free_pages(hpt, order - PAGE_SHIFT); 87 117 return -ENOMEM; 118 + } 119 + 120 + long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) 121 + { 122 + long err = -EBUSY; 123 + long order; 124 + 125 + mutex_lock(&kvm->lock); 126 + if (kvm->arch.rma_setup_done) { 127 + kvm->arch.rma_setup_done = 0; 128 + /* order rma_setup_done vs. vcpus_running */ 129 + smp_mb(); 130 + if (atomic_read(&kvm->arch.vcpus_running)) { 131 + kvm->arch.rma_setup_done = 1; 132 + goto out; 133 + } 134 + } 135 + if (kvm->arch.hpt_virt) { 136 + order = kvm->arch.hpt_order; 137 + /* Set the entire HPT to 0, i.e. invalid HPTEs */ 138 + memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); 139 + /* 140 + * Set the whole last_vcpu array to an invalid vcpu number. 141 + * This ensures that each vcpu will flush its TLB on next entry. 
142 + */ 143 + memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); 144 + *htab_orderp = order; 145 + err = 0; 146 + } else { 147 + err = kvmppc_alloc_hpt(kvm, htab_orderp); 148 + order = *htab_orderp; 149 + } 150 + out: 151 + mutex_unlock(&kvm->lock); 152 + return err; 88 153 } 89 154 90 155 void kvmppc_free_hpt(struct kvm *kvm) ··· 159 94 if (kvm->arch.hpt_li) 160 95 kvm_release_hpt(kvm->arch.hpt_li); 161 96 else 162 - free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); 97 + free_pages(kvm->arch.hpt_virt, 98 + kvm->arch.hpt_order - PAGE_SHIFT); 163 99 } 164 100 165 101 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ ··· 185 119 unsigned long psize; 186 120 unsigned long hp0, hp1; 187 121 long ret; 122 + struct kvm *kvm = vcpu->kvm; 188 123 189 124 psize = 1ul << porder; 190 125 npages = memslot->npages >> (porder - PAGE_SHIFT); ··· 194 127 if (npages > 1ul << (40 - porder)) 195 128 npages = 1ul << (40 - porder); 196 129 /* Can't use more than 1 HPTE per HPTEG */ 197 - if (npages > HPT_NPTEG) 198 - npages = HPT_NPTEG; 130 + if (npages > kvm->arch.hpt_mask + 1) 131 + npages = kvm->arch.hpt_mask + 1; 199 132 200 133 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 201 134 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); ··· 205 138 for (i = 0; i < npages; ++i) { 206 139 addr = i << porder; 207 140 /* can't use hpt_hash since va > 64 bits */ 208 - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; 141 + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; 209 142 /* 210 143 * We assume that the hash table is empty and no 211 144 * vcpus are using it at this stage. Since we create
+28 -12
arch/powerpc/kvm/book3s_hv.c
··· 56 56 /* #define EXIT_DEBUG_INT */ 57 57 58 58 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 59 - static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); 59 + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 60 60 61 61 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 62 62 { ··· 1068 1068 return -EINTR; 1069 1069 } 1070 1070 1071 - /* On the first time here, set up VRMA or RMA */ 1071 + atomic_inc(&vcpu->kvm->arch.vcpus_running); 1072 + /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ 1073 + smp_mb(); 1074 + 1075 + /* On the first time here, set up HTAB and VRMA or RMA */ 1072 1076 if (!vcpu->kvm->arch.rma_setup_done) { 1073 - r = kvmppc_hv_setup_rma(vcpu); 1077 + r = kvmppc_hv_setup_htab_rma(vcpu); 1074 1078 if (r) 1075 - return r; 1079 + goto out; 1076 1080 } 1077 1081 1078 1082 flush_fp_to_thread(current); ··· 1094 1090 kvmppc_core_prepare_to_enter(vcpu); 1095 1091 } 1096 1092 } while (r == RESUME_GUEST); 1093 + 1094 + out: 1095 + atomic_dec(&vcpu->kvm->arch.vcpus_running); 1097 1096 return r; 1098 1097 } 1099 1098 ··· 1312 1305 { 1313 1306 } 1314 1307 1315 - static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) 1308 + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) 1316 1309 { 1317 1310 int err = 0; 1318 1311 struct kvm *kvm = vcpu->kvm; ··· 1330 1323 mutex_lock(&kvm->lock); 1331 1324 if (kvm->arch.rma_setup_done) 1332 1325 goto out; /* another vcpu beat us to it */ 1326 + 1327 + /* Allocate hashed page table (if not done already) and reset it */ 1328 + if (!kvm->arch.hpt_virt) { 1329 + err = kvmppc_alloc_hpt(kvm, NULL); 1330 + if (err) { 1331 + pr_err("KVM: Couldn't alloc HPT\n"); 1332 + goto out; 1333 + } 1334 + } 1333 1335 1334 1336 /* Look up the memslot for guest physical address 0 */ 1335 1337 memslot = gfn_to_memslot(kvm, 0); ··· 1451 1435 1452 1436 int kvmppc_core_init_vm(struct kvm *kvm) 1453 1437 { 1454 - long r; 1455 - unsigned long lpcr; 1438 + unsigned long lpcr, lpid; 1456 1439 1457 
- /* Allocate hashed page table */ 1458 - r = kvmppc_alloc_hpt(kvm); 1459 - if (r) 1460 - return r; 1440 + /* Allocate the guest's logical partition ID */ 1441 + 1442 + lpid = kvmppc_alloc_lpid(); 1443 + if (lpid < 0) 1444 + return -ENOMEM; 1445 + kvm->arch.lpid = lpid; 1461 1446 1462 1447 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1463 1448 ··· 1468 1451 1469 1452 if (cpu_has_feature(CPU_FTR_ARCH_201)) { 1470 1453 /* PPC970; HID4 is effectively the LPCR */ 1471 - unsigned long lpid = kvm->arch.lpid; 1472 1454 kvm->arch.host_lpid = 0; 1473 1455 kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); 1474 1456 lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+4 -1
arch/powerpc/kvm/book3s_hv_builtin.c
··· 25 25 static struct kvmppc_linear_info *kvm_alloc_linear(int type); 26 26 static void kvm_release_linear(struct kvmppc_linear_info *ri); 27 27 28 + int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; 29 + EXPORT_SYMBOL_GPL(kvm_hpt_order); 30 + 28 31 /*************** RMA *************/ 29 32 30 33 /* ··· 212 209 void __init kvm_linear_init(void) 213 210 { 214 211 /* HPT */ 215 - kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); 212 + kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); 216 213 217 214 /* RMA */ 218 215 /* Only do this on PPC970 in HV mode */
+8 -7
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 237 237 238 238 /* Find and lock the HPTEG slot to use */ 239 239 do_insert: 240 - if (pte_index >= HPT_NPTE) 240 + if (pte_index >= kvm->arch.hpt_npte) 241 241 return H_PARAMETER; 242 242 if (likely((flags & H_EXACT) == 0)) { 243 243 pte_index &= ~7UL; ··· 352 352 unsigned long v, r, rb; 353 353 struct revmap_entry *rev; 354 354 355 - if (pte_index >= HPT_NPTE) 355 + if (pte_index >= kvm->arch.hpt_npte) 356 356 return H_PARAMETER; 357 357 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); 358 358 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) ··· 419 419 i = 4; 420 420 break; 421 421 } 422 - if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { 422 + if (req != 1 || flags == 3 || 423 + pte_index >= kvm->arch.hpt_npte) { 423 424 /* parameter error */ 424 425 args[j] = ((0xa0 | flags) << 56) + pte_index; 425 426 ret = H_PARAMETER; ··· 522 521 struct revmap_entry *rev; 523 522 unsigned long v, r, rb, mask, bits; 524 523 525 - if (pte_index >= HPT_NPTE) 524 + if (pte_index >= kvm->arch.hpt_npte) 526 525 return H_PARAMETER; 527 526 528 527 hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); ··· 584 583 int i, n = 1; 585 584 struct revmap_entry *rev = NULL; 586 585 587 - if (pte_index >= HPT_NPTE) 586 + if (pte_index >= kvm->arch.hpt_npte) 588 587 return H_PARAMETER; 589 588 if (flags & H_READ_4) { 590 589 pte_index &= ~3; ··· 679 678 somask = (1UL << 28) - 1; 680 679 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; 681 680 } 682 - hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; 681 + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; 683 682 avpn = slb_v & ~(somask >> 16); /* also includes B */ 684 683 avpn |= (eaddr & somask) >> 16; 685 684 ··· 724 723 if (val & HPTE_V_SECONDARY) 725 724 break; 726 725 val |= HPTE_V_SECONDARY; 727 - hash = hash ^ HPT_HASH_MASK; 726 + hash = hash ^ kvm->arch.hpt_mask; 728 727 } 729 728 return -1; 730 729 }
+18
arch/powerpc/kvm/powerpc.c
··· 246 246 #endif 247 247 #ifdef CONFIG_PPC_BOOK3S_64 248 248 case KVM_CAP_SPAPR_TCE: 249 + case KVM_CAP_PPC_ALLOC_HTAB: 249 250 r = 1; 250 251 break; 251 252 #endif /* CONFIG_PPC_BOOK3S_64 */ ··· 801 800 r = kvm_vm_ioctl_allocate_rma(kvm, &rma); 802 801 if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) 803 802 r = -EFAULT; 803 + break; 804 + } 805 + 806 + case KVM_PPC_ALLOCATE_HTAB: { 807 + struct kvm *kvm = filp->private_data; 808 + u32 htab_order; 809 + 810 + r = -EFAULT; 811 + if (get_user(htab_order, (u32 __user *)argp)) 812 + break; 813 + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); 814 + if (r) 815 + break; 816 + r = -EFAULT; 817 + if (put_user(htab_order, (u32 __user *)argp)) 818 + break; 819 + r = 0; 804 820 break; 805 821 } 806 822 #endif /* CONFIG_KVM_BOOK3S_64_HV */
+3
include/linux/kvm.h
··· 617 617 #define KVM_CAP_SIGNAL_MSI 77 618 618 #define KVM_CAP_PPC_GET_SMMU_INFO 78 619 619 #define KVM_CAP_S390_COW 79 620 + #define KVM_CAP_PPC_ALLOC_HTAB 80 620 621 621 622 #ifdef KVM_CAP_IRQ_ROUTING 622 623 ··· 829 828 #define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) 830 829 /* Available with KVM_CAP_PPC_GET_SMMU_INFO */ 831 830 #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) 831 + /* Available with KVM_CAP_PPC_ALLOC_HTAB */ 832 + #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) 832 833 833 834 /* 834 835 * ioctls for vcpu fds