Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: MMU: hypercall based pte updates and TLB flushes

Hypercall based pte updates are faster than faults, and also allow use
of the lazy MMU mode to batch operations.

Don't report the feature if two-dimensional paging is enabled.

[avi:
- one mmu_op hypercall instead of one per op
- allow 64-bit gpa on hypercall
- don't pass host errors (-ENOMEM) to guest]

[akpm: warning fix on i386]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>

Authored by Marcelo Tosatti.
Committed by Avi Kivity.
2f333bcb 9f811285

+190 -3
+135 -1
arch/x86/kvm/mmu.c
··· 28 28 #include <linux/module.h> 29 29 #include <linux/swap.h> 30 30 #include <linux/hugetlb.h> 31 + #include <linux/compiler.h> 31 32 32 33 #include <asm/page.h> 33 34 #include <asm/cmpxchg.h> ··· 41 40 * 2. while doing 1. it walks guest-physical to host-physical 42 41 * If the hardware supports that we don't need to do shadow paging. 43 42 */ 44 - static bool tdp_enabled = false; 43 + bool tdp_enabled = false; 45 44 46 45 #undef MMU_DEBUG 47 46 ··· 167 166 #define ACC_WRITE_MASK PT_WRITABLE_MASK 168 167 #define ACC_USER_MASK PT_USER_MASK 169 168 #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 169 + 170 + struct kvm_pv_mmu_op_buffer { 171 + void *ptr; 172 + unsigned len; 173 + unsigned processed; 174 + char buf[512] __aligned(sizeof(long)); 175 + }; 170 176 171 177 struct kvm_rmap_desc { 172 178 u64 *shadow_ptes[RMAP_EXT]; ··· 2009 2001 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); 2010 2002 2011 2003 return nr_mmu_pages; 2004 + } 2005 + 2006 + static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, 2007 + unsigned len) 2008 + { 2009 + if (len > buffer->len) 2010 + return NULL; 2011 + return buffer->ptr; 2012 + } 2013 + 2014 + static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, 2015 + unsigned len) 2016 + { 2017 + void *ret; 2018 + 2019 + ret = pv_mmu_peek_buffer(buffer, len); 2020 + if (!ret) 2021 + return ret; 2022 + buffer->ptr += len; 2023 + buffer->len -= len; 2024 + buffer->processed += len; 2025 + return ret; 2026 + } 2027 + 2028 + static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, 2029 + gpa_t addr, gpa_t value) 2030 + { 2031 + int bytes = 8; 2032 + int r; 2033 + 2034 + if (!is_long_mode(vcpu) && !is_pae(vcpu)) 2035 + bytes = 4; 2036 + 2037 + r = mmu_topup_memory_caches(vcpu); 2038 + if (r) 2039 + return r; 2040 + 2041 + if (!__emulator_write_phys(vcpu, addr, &value, bytes)) 2042 + return -EFAULT; 2043 + 2044 + return 1; 2045 + } 2046 + 2047 + static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2048 + { 2049 
+ kvm_x86_ops->tlb_flush(vcpu); 2050 + return 1; 2051 + } 2052 + 2053 + static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) 2054 + { 2055 + spin_lock(&vcpu->kvm->mmu_lock); 2056 + mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); 2057 + spin_unlock(&vcpu->kvm->mmu_lock); 2058 + return 1; 2059 + } 2060 + 2061 + static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, 2062 + struct kvm_pv_mmu_op_buffer *buffer) 2063 + { 2064 + struct kvm_mmu_op_header *header; 2065 + 2066 + header = pv_mmu_peek_buffer(buffer, sizeof *header); 2067 + if (!header) 2068 + return 0; 2069 + switch (header->op) { 2070 + case KVM_MMU_OP_WRITE_PTE: { 2071 + struct kvm_mmu_op_write_pte *wpte; 2072 + 2073 + wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); 2074 + if (!wpte) 2075 + return 0; 2076 + return kvm_pv_mmu_write(vcpu, wpte->pte_phys, 2077 + wpte->pte_val); 2078 + } 2079 + case KVM_MMU_OP_FLUSH_TLB: { 2080 + struct kvm_mmu_op_flush_tlb *ftlb; 2081 + 2082 + ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); 2083 + if (!ftlb) 2084 + return 0; 2085 + return kvm_pv_mmu_flush_tlb(vcpu); 2086 + } 2087 + case KVM_MMU_OP_RELEASE_PT: { 2088 + struct kvm_mmu_op_release_pt *rpt; 2089 + 2090 + rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); 2091 + if (!rpt) 2092 + return 0; 2093 + return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); 2094 + } 2095 + default: return 0; 2096 + } 2097 + } 2098 + 2099 + int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 2100 + gpa_t addr, unsigned long *ret) 2101 + { 2102 + int r; 2103 + struct kvm_pv_mmu_op_buffer buffer; 2104 + 2105 + down_read(&vcpu->kvm->slots_lock); 2106 + down_read(&current->mm->mmap_sem); 2107 + 2108 + buffer.ptr = buffer.buf; 2109 + buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); 2110 + buffer.processed = 0; 2111 + 2112 + r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); 2113 + if (r) 2114 + goto out; 2115 + 2116 + while (buffer.len) { 2117 + r = kvm_pv_mmu_op_one(vcpu, &buffer); 2118 + if (r < 0) 2119 + goto 
out; 2120 + if (r == 0) 2121 + break; 2122 + } 2123 + 2124 + r = 1; 2125 + out: 2126 + *ret = buffer.processed; 2127 + up_read(&current->mm->mmap_sem); 2128 + up_read(&vcpu->kvm->slots_lock); 2129 + return r; 2012 2130 } 2013 2131 2014 2132 #ifdef AUDIT
+17 -1
arch/x86/kvm/x86.c
··· 832 832 case KVM_CAP_NR_MEMSLOTS: 833 833 r = KVM_MEMORY_SLOTS; 834 834 break; 835 + case KVM_CAP_PV_MMU: 836 + r = !tdp_enabled; 837 + break; 835 838 default: 836 839 r = 0; 837 840 break; ··· 2455 2452 } 2456 2453 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2457 2454 2455 + static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2456 + unsigned long a1) 2457 + { 2458 + if (is_long_mode(vcpu)) 2459 + return a0; 2460 + else 2461 + return a0 | ((gpa_t)a1 << 32); 2462 + } 2463 + 2458 2464 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2459 2465 { 2460 2466 unsigned long nr, a0, a1, a2, a3, ret; 2467 + int r = 1; 2461 2468 2462 2469 kvm_x86_ops->cache_regs(vcpu); 2463 2470 ··· 2489 2476 case KVM_HC_VAPIC_POLL_IRQ: 2490 2477 ret = 0; 2491 2478 break; 2479 + case KVM_HC_MMU_OP: 2480 + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 2481 + break; 2492 2482 default: 2493 2483 ret = -KVM_ENOSYS; 2494 2484 break; ··· 2499 2483 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2500 2484 kvm_x86_ops->decache_regs(vcpu); 2501 2485 ++vcpu->stat.hypercalls; 2502 - return 0; 2486 + return r; 2503 2487 } 2504 2488 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2505 2489
+4
include/asm-x86/kvm_host.h
··· 434 434 435 435 int __emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 436 436 const void *val, int bytes); 437 + int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 438 + gpa_t addr, unsigned long *ret); 439 + 440 + extern bool tdp_enabled; 437 441 438 442 enum emulation_result { 439 443 EMULATE_DONE, /* no further processing */
+29
include/asm-x86/kvm_para.h
··· 12 12 #define KVM_CPUID_FEATURES 0x40000001 13 13 #define KVM_FEATURE_CLOCKSOURCE 0 14 14 #define KVM_FEATURE_NOP_IO_DELAY 1 15 + #define KVM_FEATURE_MMU_OP 2 15 16 16 17 #define MSR_KVM_WALL_CLOCK 0x11 17 18 #define MSR_KVM_SYSTEM_TIME 0x12 19 + 20 + #define KVM_MAX_MMU_OP_BATCH 32 21 + 22 + /* Operations for KVM_HC_MMU_OP */ 23 + #define KVM_MMU_OP_WRITE_PTE 1 24 + #define KVM_MMU_OP_FLUSH_TLB 2 25 + #define KVM_MMU_OP_RELEASE_PT 3 26 + 27 + /* Payload for KVM_HC_MMU_OP */ 28 + struct kvm_mmu_op_header { 29 + __u32 op; 30 + __u32 pad; 31 + }; 32 + 33 + struct kvm_mmu_op_write_pte { 34 + struct kvm_mmu_op_header header; 35 + __u64 pte_phys; 36 + __u64 pte_val; 37 + }; 38 + 39 + struct kvm_mmu_op_flush_tlb { 40 + struct kvm_mmu_op_header header; 41 + }; 42 + 43 + struct kvm_mmu_op_release_pt { 44 + struct kvm_mmu_op_header header; 45 + __u64 pt_phys; 46 + }; 18 47 19 48 #ifdef __KERNEL__ 20 49 #include <asm/processor.h>
+1
include/linux/kvm.h
··· 238 238 #define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */ 239 239 #define KVM_CAP_PIT 11 240 240 #define KVM_CAP_NOP_IO_DELAY 12 241 + #define KVM_CAP_PV_MMU 13 241 242 242 243 /* 243 244 * ioctls for VM fds
+4 -1
include/linux/kvm_para.h
··· 11 11 12 12 /* Return values for hypercalls */ 13 13 #define KVM_ENOSYS 1000 14 + #define KVM_EFAULT EFAULT 15 + #define KVM_E2BIG E2BIG 14 16 15 - #define KVM_HC_VAPIC_POLL_IRQ 1 17 + #define KVM_HC_VAPIC_POLL_IRQ 1 18 + #define KVM_HC_MMU_OP 2 16 19 17 20 /* 18 21 * hypercalls use architecture specific