Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Accelerate H_PUT_TCE by implementing it in real mode

This improves I/O performance for guests using the PAPR
paravirtualization interface by implementing the H_PUT_TCE hcall in
real mode, which makes it faster. H_PUT_TCE is used for updating
virtual IOMMU tables, for both virtual I/O and real I/O in the PAPR
interface.

Since this moves the IOMMU tables into the kernel, we define a new
KVM_CREATE_SPAPR_TCE ioctl to allow qemu to create the tables. The
ioctl returns a file descriptor which can be used to mmap the newly
created table. The qemu driver models use such a table in the same way
as a userspace-managed table, but it can be updated directly by the
guest with a real-mode H_PUT_TCE implementation, reducing the number
of host/guest context switches during guest I/O.

There are certain circumstances where it is useful for userland qemu
to write to the TCE table even if the kernel H_PUT_TCE path is used
most of the time. Specifically, allowing this will avoid awkwardness
when we need to reset the table. More importantly, we will in the
future need to write the table in order to restore its state after a
checkpoint resume or migration.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>

authored by

David Gibson and committed by
Avi Kivity
54738c09 a8606e20

+268 -3
+35
Documentation/virtual/kvm/api.txt
··· 1350 1350 If datamatch flag is set, the event will be signaled only if the written value 1351 1351 to the registered address is equal to datamatch in struct kvm_ioeventfd. 1352 1352 1353 + 4.62 KVM_CREATE_SPAPR_TCE 1354 + 1355 + Capability: KVM_CAP_SPAPR_TCE 1356 + Architectures: powerpc 1357 + Type: vm ioctl 1358 + Parameters: struct kvm_create_spapr_tce (in) 1359 + Returns: file descriptor for manipulating the created TCE table 1360 + 1361 + This creates a virtual TCE (translation control entry) table, which 1362 + is an IOMMU for PAPR-style virtual I/O. It is used to translate 1363 + logical addresses used in virtual I/O into guest physical addresses, 1364 + and provides a scatter/gather capability for PAPR virtual I/O. 1365 + 1366 + /* for KVM_CAP_SPAPR_TCE */ 1367 + struct kvm_create_spapr_tce { 1368 + __u64 liobn; 1369 + __u32 window_size; 1370 + }; 1371 + 1372 + The liobn field gives the logical IO bus number for which to create a 1373 + TCE table. The window_size field specifies the size of the DMA window 1374 + which this TCE table will translate - the table will contain one 64 1375 + bit TCE entry for every 4kiB of the DMA window. 1376 + 1377 + When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE 1378 + table has been created using this ioctl(), the kernel will handle it 1379 + in real mode, updating the TCE table. H_PUT_TCE calls for other 1380 + liobns will cause a vm exit and must be handled by userspace. 1381 + 1382 + The return value is a file descriptor which can be passed to mmap(2) 1383 + to map the created TCE table into userspace. This lets userspace read 1384 + the entries written by kernel-handled H_PUT_TCE calls, and also lets 1385 + userspace update the TCE table directly which is useful in some 1386 + circumstances. 1387 + 1353 1388 5. The kvm_run structure 1354 1389 1355 1390 Application code obtains a pointer to the kvm_run structure by
+9
arch/powerpc/include/asm/kvm.h
··· 22 22 23 23 #include <linux/types.h> 24 24 25 + /* Select powerpc specific features in <linux/kvm.h> */ 26 + #define __KVM_HAVE_SPAPR_TCE 27 + 25 28 struct kvm_regs { 26 29 __u64 pc; 27 30 __u64 cr; ··· 274 271 #define KVM_INTERRUPT_SET -1U 275 272 #define KVM_INTERRUPT_UNSET -2U 276 273 #define KVM_INTERRUPT_SET_LEVEL -3U 274 + 275 + /* for KVM_CAP_SPAPR_TCE */ 276 + struct kvm_create_spapr_tce { 277 + __u64 liobn; 278 + __u32 window_size; 279 + }; 277 280 278 281 #endif /* __LINUX_KVM_POWERPC_H */
+2
arch/powerpc/include/asm/kvm_book3s_64.h
··· 27 27 } 28 28 #endif 29 29 30 + #define SPAPR_TCE_SHIFT 12 31 + 30 32 #endif /* __ASM_KVM_BOOK3S_64_H__ */
+9
arch/powerpc/include/asm/kvm_host.h
··· 144 144 atomic_t refcnt; 145 145 }; 146 146 147 + struct kvmppc_spapr_tce_table { 148 + struct list_head list; 149 + struct kvm *kvm; 150 + u64 liobn; 151 + u32 window_size; 152 + struct page *pages[0]; 153 + }; 154 + 147 155 struct kvm_arch { 148 156 #ifdef CONFIG_KVM_BOOK3S_64_HV 149 157 unsigned long hpt_virt; ··· 165 157 unsigned long sdr1; 166 158 unsigned long host_sdr1; 167 159 int tlbie_lock; 160 + struct list_head spapr_tce_tables; 168 161 unsigned short last_vcpu[NR_CPUS]; 169 162 #endif /* CONFIG_KVM_BOOK3S_64_HV */ 170 163 };
+2
arch/powerpc/include/asm/kvm_ppc.h
··· 119 119 extern void kvmppc_map_vrma(struct kvm *kvm, 120 120 struct kvm_userspace_memory_region *mem); 121 121 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); 122 + extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, 123 + struct kvm_create_spapr_tce *args); 122 124 extern int kvmppc_core_init_vm(struct kvm *kvm); 123 125 extern void kvmppc_core_destroy_vm(struct kvm *kvm); 124 126 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+2 -1
arch/powerpc/kvm/Makefile
··· 55 55 book3s_hv_interrupts.o \ 56 56 book3s_64_mmu_hv.o 57 57 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ 58 - book3s_hv_rm_mmu.o 58 + book3s_hv_rm_mmu.o \ 59 + book3s_64_vio_hv.o 59 60 60 61 kvm-book3s_64-module-objs := \ 61 62 ../../../virt/kvm/kvm_main.o \
+73
arch/powerpc/kvm/book3s_64_vio_hv.c
··· 1 + /* 2 + * This program is free software; you can redistribute it and/or modify 3 + * it under the terms of the GNU General Public License, version 2, as 4 + * published by the Free Software Foundation. 5 + * 6 + * This program is distributed in the hope that it will be useful, 7 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 + * GNU General Public License for more details. 10 + * 11 + * You should have received a copy of the GNU General Public License 12 + * along with this program; if not, write to the Free Software 13 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 14 + * 15 + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 16 + * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> 17 + */ 18 + 19 + #include <linux/types.h> 20 + #include <linux/string.h> 21 + #include <linux/kvm.h> 22 + #include <linux/kvm_host.h> 23 + #include <linux/highmem.h> 24 + #include <linux/gfp.h> 25 + #include <linux/slab.h> 26 + #include <linux/hugetlb.h> 27 + #include <linux/list.h> 28 + 29 + #include <asm/tlbflush.h> 30 + #include <asm/kvm_ppc.h> 31 + #include <asm/kvm_book3s.h> 32 + #include <asm/mmu-hash64.h> 33 + #include <asm/hvcall.h> 34 + #include <asm/synch.h> 35 + #include <asm/ppc-opcode.h> 36 + #include <asm/kvm_host.h> 37 + #include <asm/udbg.h> 38 + 39 + #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) 40 + 41 + long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, 42 + unsigned long ioba, unsigned long tce) 43 + { 44 + struct kvm *kvm = vcpu->kvm; 45 + struct kvmppc_spapr_tce_table *stt; 46 + 47 + /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ 48 + /* liobn, ioba, tce); */ 49 + 50 + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { 51 + if (stt->liobn == liobn) { 52 + unsigned long idx = ioba >> SPAPR_TCE_SHIFT; 53 + struct page *page; 54 + u64 *tbl; 55 + 56 + /* 
udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */ 57 + /* liobn, stt, stt->window_size); */ 58 + if (ioba >= stt->window_size) 59 + return H_PARAMETER; 60 + 61 + page = stt->pages[idx / TCES_PER_PAGE]; 62 + tbl = (u64 *)page_address(page); 63 + 64 + /* FIXME: Need to validate the TCE itself */ 65 + /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ 66 + tbl[idx % TCES_PER_PAGE] = tce; 67 + return H_SUCCESS; 68 + } 69 + } 70 + 71 + /* Didn't find the liobn, punt it to userspace */ 72 + return H_TOO_HARD; 73 + }
+115 -1
arch/powerpc/kvm/book3s_hv.c
··· 538 538 return r; 539 539 } 540 540 541 + static long kvmppc_stt_npages(unsigned long window_size) 542 + { 543 + return ALIGN((window_size >> SPAPR_TCE_SHIFT) 544 + * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; 545 + } 546 + 547 + static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) 548 + { 549 + struct kvm *kvm = stt->kvm; 550 + int i; 551 + 552 + mutex_lock(&kvm->lock); 553 + list_del(&stt->list); 554 + for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) 555 + __free_page(stt->pages[i]); 556 + kfree(stt); 557 + mutex_unlock(&kvm->lock); 558 + 559 + kvm_put_kvm(kvm); 560 + } 561 + 562 + static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 563 + { 564 + struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; 565 + struct page *page; 566 + 567 + if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) 568 + return VM_FAULT_SIGBUS; 569 + 570 + page = stt->pages[vmf->pgoff]; 571 + get_page(page); 572 + vmf->page = page; 573 + return 0; 574 + } 575 + 576 + static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { 577 + .fault = kvm_spapr_tce_fault, 578 + }; 579 + 580 + static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) 581 + { 582 + vma->vm_ops = &kvm_spapr_tce_vm_ops; 583 + return 0; 584 + } 585 + 586 + static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) 587 + { 588 + struct kvmppc_spapr_tce_table *stt = filp->private_data; 589 + 590 + release_spapr_tce_table(stt); 591 + return 0; 592 + } 593 + 594 + static struct file_operations kvm_spapr_tce_fops = { 595 + .mmap = kvm_spapr_tce_mmap, 596 + .release = kvm_spapr_tce_release, 597 + }; 598 + 599 + long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, 600 + struct kvm_create_spapr_tce *args) 601 + { 602 + struct kvmppc_spapr_tce_table *stt = NULL; 603 + long npages; 604 + int ret = -ENOMEM; 605 + int i; 606 + 607 + /* Check this LIOBN hasn't been previously allocated */ 608 + list_for_each_entry(stt, 
&kvm->arch.spapr_tce_tables, list) { 609 + if (stt->liobn == args->liobn) 610 + return -EBUSY; 611 + } 612 + 613 + npages = kvmppc_stt_npages(args->window_size); 614 + 615 + stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *), 616 + GFP_KERNEL); 617 + if (!stt) 618 + goto fail; 619 + 620 + stt->liobn = args->liobn; 621 + stt->window_size = args->window_size; 622 + stt->kvm = kvm; 623 + 624 + for (i = 0; i < npages; i++) { 625 + stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); 626 + if (!stt->pages[i]) 627 + goto fail; 628 + } 629 + 630 + kvm_get_kvm(kvm); 631 + 632 + mutex_lock(&kvm->lock); 633 + list_add(&stt->list, &kvm->arch.spapr_tce_tables); 634 + 635 + mutex_unlock(&kvm->lock); 636 + 637 + return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, 638 + stt, O_RDWR); 639 + 640 + fail: 641 + if (stt) { 642 + for (i = 0; i < npages; i++) 643 + if (stt->pages[i]) 644 + __free_page(stt->pages[i]); 645 + 646 + kfree(stt); 647 + } 648 + return ret; 649 + } 650 + 541 651 int kvmppc_core_prepare_memory_region(struct kvm *kvm, 542 652 struct kvm_userspace_memory_region *mem) 543 653 { ··· 669 559 670 560 /* Allocate hashed page table */ 671 561 r = kvmppc_alloc_hpt(kvm); 562 + if (r) 563 + return r; 672 564 673 - return r; 565 + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 566 + return 0; 674 567 } 675 568 676 569 void kvmppc_core_destroy_vm(struct kvm *kvm) 677 570 { 678 571 kvmppc_free_hpt(kvm); 572 + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); 679 573 } 680 574 681 575 /* These are stubs for now */
+1 -1
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 754 754 .long 0 /* 0x14 - H_CLEAR_REF */ 755 755 .long .kvmppc_h_protect - hcall_real_table 756 756 .long 0 /* 0x1c - H_GET_TCE */ 757 - .long 0 /* 0x20 - H_SET_TCE */ 757 + .long .kvmppc_h_put_tce - hcall_real_table 758 758 .long 0 /* 0x24 - H_SET_SPRG0 */ 759 759 .long .kvmppc_h_set_dabr - hcall_real_table 760 760 .long 0 /* 0x2c */
+18
arch/powerpc/kvm/powerpc.c
··· 203 203 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 204 204 break; 205 205 #endif 206 + #ifdef CONFIG_KVM_BOOK3S_64_HV 207 + case KVM_CAP_SPAPR_TCE: 208 + r = 1; 209 + break; 210 + #endif 206 211 default: 207 212 r = 0; 208 213 break; ··· 658 653 659 654 break; 660 655 } 656 + #ifdef CONFIG_KVM_BOOK3S_64_HV 657 + case KVM_CREATE_SPAPR_TCE: { 658 + struct kvm_create_spapr_tce create_tce; 659 + struct kvm *kvm = filp->private_data; 660 + 661 + r = -EFAULT; 662 + if (copy_from_user(&create_tce, argp, sizeof(create_tce))) 663 + goto out; 664 + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); 665 + goto out; 666 + } 667 + #endif /* CONFIG_KVM_BOOK3S_64_HV */ 668 + 661 669 default: 662 670 r = -ENOTTY; 663 671 }
+2
include/linux/kvm.h
··· 550 550 #define KVM_CAP_TSC_CONTROL 60 551 551 #define KVM_CAP_GET_TSC_KHZ 61 552 552 #define KVM_CAP_PPC_BOOKE_SREGS 62 553 + #define KVM_CAP_SPAPR_TCE 63 553 554 554 555 #ifdef KVM_CAP_IRQ_ROUTING 555 556 ··· 753 752 /* Available with KVM_CAP_XCRS */ 754 753 #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) 755 754 #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) 755 + #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) 756 756 757 757 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 758 758