Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v2.6.22-rc3 · 3130 lines · 70 kB

/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "kvm.h"

#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/magic.h>
#include <asm/processor.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <asm/msr.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <linux/reboot.h>
#include <asm/io.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/desc.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/sched.h>

#include "x86_emulate.h"
#include "segment_descriptor.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static DEFINE_SPINLOCK(kvm_lock);
static LIST_HEAD(vm_list);

struct kvm_arch_ops *kvm_arch_ops;

#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)

static struct kvm_stats_debugfs_item {
	const char *name;
	int offset;
	struct dentry *dentry;
} debugfs_entries[] = {
	{ "pf_fixed", STAT_OFFSET(pf_fixed) },
	{ "pf_guest", STAT_OFFSET(pf_guest) },
	{ "tlb_flush", STAT_OFFSET(tlb_flush) },
	{ "invlpg", STAT_OFFSET(invlpg) },
	{ "exits", STAT_OFFSET(exits) },
	{ "io_exits", STAT_OFFSET(io_exits) },
	{ "mmio_exits", STAT_OFFSET(mmio_exits) },
	{ "signal_exits", STAT_OFFSET(signal_exits) },
	{ "irq_window", STAT_OFFSET(irq_window_exits) },
	{ "halt_exits", STAT_OFFSET(halt_exits) },
	{ "request_irq", STAT_OFFSET(request_irq_exits) },
	{ "irq_exits", STAT_OFFSET(irq_exits) },
	{ NULL }
};

static struct dentry *debugfs_dir;

struct vfsmount *kvmfs_mnt;

#define MAX_IO_MSRS 256

#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
#define LMSW_GUEST_MASK 0x0eULL
#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
#define CR8_RESEVED_BITS (~0x0fULL)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe

#ifdef CONFIG_X86_64
// LDT or TSS descriptor in the GDT. 16 bytes.
struct segment_descriptor_64 {
	struct segment_descriptor s;
	u32 base_higher;
	u32 pad_zero;
};

#endif

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

static struct inode *kvmfs_inode(struct file_operations *fops)
{
	int error = -ENOMEM;
	struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);

	if (!inode)
		goto eexit_1;

	inode->i_fop = fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because mark_inode_dirty() will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IRUSR | S_IWUSR;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	return inode;

eexit_1:
	return ERR_PTR(error);
}

static struct file *kvmfs_file(struct inode *inode, void *private_data)
{
	struct file *file = get_empty_filp();

	if (!file)
		return ERR_PTR(-ENFILE);

	file->f_path.mnt = mntget(kvmfs_mnt);
	file->f_path.dentry = d_alloc_anon(inode);
	if (!file->f_path.dentry)
		return ERR_PTR(-ENOMEM);
	file->f_mapping = inode->i_mapping;

	file->f_pos = 0;
	file->f_flags = O_RDWR;
	file->f_op = inode->i_fop;
	file->f_mode = FMODE_READ | FMODE_WRITE;
	file->f_version = 0;
	file->private_data = private_data;
	return file;
}

unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct segment_descriptor *d;
	unsigned long table_base;
	typedef unsigned long ul;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm ("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) { /* from ldt */
		u16 ldt_selector;

		asm ("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct segment_descriptor *)(table_base + (selector & ~7));
	v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
#ifdef CONFIG_X86_64
	if (d->system == 0
	    && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}

int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
		   void *dest)
{
	unsigned char *host_buf = dest;
	unsigned long req_size = size;

	while (size) {
		hpa_t paddr;
		unsigned now;
		unsigned offset;
		hva_t guest_buf;

		paddr = gva_to_hpa(vcpu, addr);

		if (is_error_hpa(paddr))
			break;

		guest_buf = (hva_t)kmap_atomic(
					pfn_to_page(paddr >> PAGE_SHIFT),
					KM_USER0);
		offset = addr & ~PAGE_MASK;
		guest_buf |= offset;
		now = min(size, PAGE_SIZE - offset);
		memcpy(host_buf, (void*)guest_buf, now);
		host_buf += now;
		addr += now;
		size -= now;
		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
	}
	return req_size - size;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
		    void *data)
{
	unsigned char *host_buf = data;
	unsigned long req_size = size;

	while (size) {
		hpa_t paddr;
		unsigned now;
		unsigned offset;
		hva_t guest_buf;
		gfn_t gfn;

		paddr = gva_to_hpa(vcpu, addr);

		if (is_error_hpa(paddr))
			break;

		gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
		mark_page_dirty(vcpu->kvm, gfn);
		guest_buf = (hva_t)kmap_atomic(
				pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
		offset = addr & ~PAGE_MASK;
		guest_buf |= offset;
		now = min(size, PAGE_SIZE - offset);
		memcpy((void*)guest_buf, host_buf, now);
		host_buf += now;
		addr += now;
		size -= now;
		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
	}
	return req_size - size;
}
EXPORT_SYMBOL_GPL(kvm_write_guest);
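
/*
 * Note: both helpers above walk the guest page tables one page at a time
 * via kmap_atomic() and stop early on a failed translation, so the return
 * value is the number of bytes actually transferred, which may be less
 * than the requested size.
 */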

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
static void vcpu_load(struct kvm_vcpu *vcpu)
{
	mutex_lock(&vcpu->mutex);
	kvm_arch_ops->vcpu_load(vcpu);
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL
 * if the slot is not populated.
 */
static struct kvm_vcpu *vcpu_load_slot(struct kvm *kvm, int slot)
{
	struct kvm_vcpu *vcpu = &kvm->vcpus[slot];

	mutex_lock(&vcpu->mutex);
	if (!vcpu->vmcs) {
		mutex_unlock(&vcpu->mutex);
		return NULL;
	}
	kvm_arch_ops->vcpu_load(vcpu);
	return vcpu;
}

static void vcpu_put(struct kvm_vcpu *vcpu)
{
	kvm_arch_ops->vcpu_put(vcpu);
	mutex_unlock(&vcpu->mutex);
}

static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
	int i;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kvm->lock);
	INIT_LIST_HEAD(&kvm->active_mmu_pages);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		struct kvm_vcpu *vcpu = &kvm->vcpus[i];

		mutex_init(&vcpu->mutex);
		vcpu->cpu = -1;
		vcpu->kvm = kvm;
		vcpu->mmu.root_hpa = INVALID_PAGE;
		INIT_LIST_HEAD(&vcpu->free_pages);
		spin_lock(&kvm_lock);
		list_add(&kvm->vm_list, &vm_list);
		spin_unlock(&kvm_lock);
	}
	return kvm;
}

static int kvm_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	int i;

	if (!dont || free->phys_mem != dont->phys_mem)
		if (free->phys_mem) {
			for (i = 0; i < free->npages; ++i)
				if (free->phys_mem[i])
					__free_page(free->phys_mem[i]);
			vfree(free->phys_mem);
		}

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	free->phys_mem = NULL;
	free->npages = 0;
	free->dirty_bitmap = NULL;
}

static void kvm_free_physmem(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}

static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
{
	int i;

	for (i = 0; i < 2; ++i)
		if (vcpu->pio.guest_pages[i]) {
			__free_page(vcpu->pio.guest_pages[i]);
			vcpu->pio.guest_pages[i] = NULL;
		}
}

static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->vmcs)
		return;

	vcpu_load(vcpu);
	kvm_mmu_destroy(vcpu);
	vcpu_put(vcpu);
	kvm_arch_ops->vcpu_free(vcpu);
	free_page((unsigned long)vcpu->run);
	vcpu->run = NULL;
	free_page((unsigned long)vcpu->pio_data);
	vcpu->pio_data = NULL;
	free_pio_guest_pages(vcpu);
}

static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;

	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		kvm_free_vcpu(&kvm->vcpus[i]);
}

static int kvm_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	kfree(kvm);
}

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_destroy_vm(kvm);
	return 0;
}

static void inject_gp(struct kvm_vcpu *vcpu)
{
	kvm_arch_ops->inject_gp(vcpu, 0);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	u64 pdpte;
	u64 *pdpt;
	int ret;
	struct page *page;

	spin_lock(&vcpu->kvm->lock);
	page = gfn_to_page(vcpu->kvm, pdpt_gfn);
	/* FIXME: !page - emulate? 0xff? */
	pdpt = kmap_atomic(page, KM_USER0);

	ret = 1;
	for (i = 0; i < 4; ++i) {
		pdpte = pdpt[offset + i];
		if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
			ret = 0;
			goto out;
		}
	}

	for (i = 0; i < 4; ++i)
		vcpu->pdptrs[i] = pdpt[offset + i];

out:
	kunmap_atomic(pdpt, KM_USER0);
	spin_unlock(&vcpu->kvm->lock);

	return ret;
}

void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESEVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->cr0);
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		inject_gp(vcpu);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
#ifdef CONFIG_X86_64
		if ((vcpu->shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				inject_gp(vcpu);
				return;
			}
			kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				inject_gp(vcpu);
				return;

			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			inject_gp(vcpu);
			return;
		}

	}

	kvm_arch_ops->set_cr0(vcpu, cr0);
	vcpu->cr0 = cr0;

	spin_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	spin_unlock(&vcpu->kvm->lock);
	return;
}
EXPORT_SYMBOL_GPL(set_cr0);

void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(lmsw);

void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & CR4_RESEVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & CR4_PAE_MASK)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			inject_gp(vcpu);
			return;
		}
	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
		   && !load_pdptrs(vcpu, vcpu->cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		inject_gp(vcpu);
	}

	if (cr4 & CR4_VMXE_MASK) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		inject_gp(vcpu);
		return;
	}
	kvm_arch_ops->set_cr4(vcpu, cr4);
	spin_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	spin_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);

void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESEVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	} else {
		if (cr3 & CR3_RESEVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			return;
		}
		if (is_paging(vcpu) && is_pae(vcpu) &&
		    !load_pdptrs(vcpu, cr3)) {
			printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
			       "reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	}

	vcpu->cr3 = cr3;
	spin_lock(&vcpu->kvm->lock);
	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		inject_gp(vcpu);
	else
		vcpu->mmu.new_cr3(vcpu);
	spin_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr3);

void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESEVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		inject_gp(vcpu);
		return;
	}
	vcpu->cr8 = cr8;
}
EXPORT_SYMBOL_GPL(set_cr8);

void fx_init(struct kvm_vcpu *vcpu)
{
	struct __attribute__ ((__packed__)) fx_image_s {
		u16 control; //fcw
		u16 status; //fsw
		u16 tag; // ftw
		u16 opcode; //fop
		u64 ip; // fpu ip
		u64 operand;// fpu dp
		u32 mxcsr;
		u32 mxcsr_mask;

	} *fx_image;

	fx_save(vcpu->host_fx_image);
	fpu_init();
	fx_save(vcpu->guest_fx_image);
	fx_restore(vcpu->host_fx_image);

	fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
	fx_image->mxcsr = 0x1f80;
	memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
	       0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
}
EXPORT_SYMBOL_GPL(fx_init);

static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
{
	spin_lock(&vcpu->kvm->lock);
	kvm_mmu_slot_remove_write_access(vcpu, slot);
	spin_unlock(&vcpu->kvm->lock);
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 */
static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
					  struct kvm_memory_region *mem)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;
	int memory_config_version;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

raced:
	spin_lock(&kvm->lock);

	memory_config_version = kvm->memory_config_version;
	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;
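
	/*
	 * Note: the allocations further down happen with kvm->lock dropped;
	 * the memory_config_version snapshot taken above is re-checked once
	 * the lock is re-acquired, and on mismatch the half-built slot is
	 * freed and the whole operation restarts at the "raced" label.
	 */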

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_unlock;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_unlock;
	}
	/*
	 * Do memory allocations outside lock.  memory_config_version will
	 * detect any races.
	 */
	spin_unlock(&kvm->lock);

	/* Deallocate if slot is being removed */
	if (!npages)
		new.phys_mem = NULL;

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
	if (npages && !new.phys_mem) {
		new.phys_mem = vmalloc(npages * sizeof(struct page *));

		if (!new.phys_mem)
			goto out_free;

		memset(new.phys_mem, 0, npages * sizeof(struct page *));
		for (i = 0; i < npages; ++i) {
			new.phys_mem[i] = alloc_page(GFP_HIGHUSER
						     | __GFP_ZERO);
			if (!new.phys_mem[i])
				goto out_free;
			set_page_private(new.phys_mem[i],0);
		}
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}

	spin_lock(&kvm->lock);

	if (memory_config_version != kvm->memory_config_version) {
		spin_unlock(&kvm->lock);
		kvm_free_physmem_slot(&new, &old);
		goto raced;
	}

	r = -EAGAIN;
	if (kvm->busy)
		goto out_unlock;

	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

	*memslot = new;
	++kvm->memory_config_version;

	spin_unlock(&kvm->lock);

	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		struct kvm_vcpu *vcpu;

		vcpu = vcpu_load_slot(kvm, i);
		if (!vcpu)
			continue;
		if (new.flags & KVM_MEM_LOG_DIRTY_PAGES)
			do_remove_write_access(vcpu, mem->slot);
		kvm_mmu_reset_context(vcpu);
		vcpu_put(vcpu);
	}

	kvm_free_physmem_slot(&old, &new);
	return 0;

out_unlock:
	spin_unlock(&kvm->lock);
out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
}

/*
 * Get (and clear) the dirty memory log for a memory slot.
 */
static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
				      struct kvm_dirty_log *log)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	int cleared;
	unsigned long any = 0;

	spin_lock(&kvm->lock);

	/*
	 * Prevent changes to guest memory configuration even while the lock
	 * is not taken.
	 */
	++kvm->busy;
	spin_unlock(&kvm->lock);
	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any) {
		cleared = 0;
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			struct kvm_vcpu *vcpu;

			vcpu = vcpu_load_slot(kvm, i);
			if (!vcpu)
				continue;
			if (!cleared) {
				do_remove_write_access(vcpu, log->slot);
				memset(memslot->dirty_bitmap, 0, n);
				cleared = 1;
			}
			kvm_arch_ops->tlb_flush(vcpu);
			vcpu_put(vcpu);
		}
	}

	r = 0;

out:
	spin_lock(&kvm->lock);
	--kvm->busy;
	spin_unlock(&kvm->lock);
	return r;
}

/*
 * Set a new alias region.  Aliases map a portion of physical memory into
 * another portion.  This is useful for memory windows, for example the PC
 * VGA region.
 */
static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
					 struct kvm_memory_alias *alias)
{
	int r, n;
	struct kvm_mem_alias *p;

	r = -EINVAL;
	/* General sanity checks */
	if (alias->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (alias->slot >= KVM_ALIAS_SLOTS)
		goto out;
	if (alias->guest_phys_addr + alias->memory_size
	    < alias->guest_phys_addr)
		goto out;
	if (alias->target_phys_addr + alias->memory_size
	    < alias->target_phys_addr)
		goto out;

	spin_lock(&kvm->lock);

	p = &kvm->aliases[alias->slot];
	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
	p->npages = alias->memory_size >> PAGE_SHIFT;
	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;

	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
		if (kvm->aliases[n - 1].npages)
			break;
	kvm->naliases = n;

	spin_unlock(&kvm->lock);

	vcpu_load(&kvm->vcpus[0]);
	spin_lock(&kvm->lock);
	kvm_mmu_zap_all(&kvm->vcpus[0]);
	spin_unlock(&kvm->lock);
	vcpu_put(&kvm->vcpus[0]);

	return 0;

out:
	return r;
}

static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_mem_alias *alias;

	for (i = 0; i < kvm->naliases; ++i) {
		alias = &kvm->aliases[i];
		if (gfn >= alias->base_gfn
		    && gfn < alias->base_gfn + alias->npages)
			return alias->target_gfn + gfn - alias->base_gfn;
	}
	return gfn;
}

static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return __gfn_to_memslot(kvm, gfn);
}

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	if (!slot)
		return NULL;
	return slot->phys_mem[gfn - slot->base_gfn];
}
EXPORT_SYMBOL_GPL(gfn_to_page);

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memory_slot *memslot = NULL;
	unsigned long rel_gfn;

	for (i = 0; i < kvm->nmemslots; ++i) {
		memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages) {

			if (!memslot || !memslot->dirty_bitmap)
				return;

			rel_gfn = gfn - memslot->base_gfn;

			/* avoid RMW */
			if (!test_bit(rel_gfn, memslot->dirty_bitmap))
				set_bit(rel_gfn, memslot->dirty_bitmap);
			return;
		}
	}
}

static int emulator_read_std(unsigned long addr,
			     void *val,
			     unsigned int bytes,
			     struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;
	void *data = val;

	while (bytes) {
		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
		unsigned offset = addr & (PAGE_SIZE-1);
		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
		unsigned long pfn;
		struct page *page;
		void *page_virt;

		if (gpa == UNMAPPED_GVA)
			return X86EMUL_PROPAGATE_FAULT;
		pfn = gpa >> PAGE_SHIFT;
		page = gfn_to_page(vcpu->kvm, pfn);
		if (!page)
			return X86EMUL_UNHANDLEABLE;
		page_virt = kmap_atomic(page, KM_USER0);

		memcpy(data, page_virt + offset, tocopy);

		kunmap_atomic(page_virt, KM_USER0);

		bytes -= tocopy;
		data += tocopy;
		addr += tocopy;
	}

	return X86EMUL_CONTINUE;
}

static int emulator_write_std(unsigned long addr,
			      const void *val,
			      unsigned int bytes,
			      struct x86_emulate_ctxt *ctxt)
{
	printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
	       addr, bytes);
	return X86EMUL_UNHANDLEABLE;
}

static int emulator_read_emulated(unsigned long addr,
				  void *val,
				  unsigned int bytes,
				  struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;

	if (vcpu->mmio_read_completed) {
		memcpy(val, vcpu->mmio_data, bytes);
		vcpu->mmio_read_completed = 0;
		return X86EMUL_CONTINUE;
	} else if (emulator_read_std(addr, val, bytes, ctxt)
		   == X86EMUL_CONTINUE)
		return X86EMUL_CONTINUE;
	else {
		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);

		if (gpa == UNMAPPED_GVA)
			return X86EMUL_PROPAGATE_FAULT;
		vcpu->mmio_needed = 1;
		vcpu->mmio_phys_addr = gpa;
		vcpu->mmio_size = bytes;
		vcpu->mmio_is_write = 0;

		return X86EMUL_UNHANDLEABLE;
	}
}

static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
			       const void *val, int bytes)
{
	struct page *page;
	void *virt;

	if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
		return 0;
	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (!page)
		return 0;
	kvm_mmu_pre_write(vcpu, gpa, bytes);
	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
	virt = kmap_atomic(page, KM_USER0);
	memcpy(virt + offset_in_page(gpa), val, bytes);
	kunmap_atomic(virt, KM_USER0);
	kvm_mmu_post_write(vcpu, gpa, bytes);
	return 1;
}
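
/*
 * MMIO handshake: when an emulated access misses guest RAM, the callbacks
 * above and below record it in vcpu->mmio_* and the run loop exits to
 * userspace with KVM_EXIT_MMIO; on the next KVM_RUN, read results are fed
 * back in through vcpu->mmio_data (see kvm_vcpu_ioctl_run() below).
 */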

static int emulator_write_emulated(unsigned long addr,
				   const void *val,
				   unsigned int bytes,
				   struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);

	if (gpa == UNMAPPED_GVA) {
		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
		return X86EMUL_PROPAGATE_FAULT;
	}

	if (emulator_write_phys(vcpu, gpa, val, bytes))
		return X86EMUL_CONTINUE;

	vcpu->mmio_needed = 1;
	vcpu->mmio_phys_addr = gpa;
	vcpu->mmio_size = bytes;
	vcpu->mmio_is_write = 1;
	memcpy(vcpu->mmio_data, val, bytes);

	return X86EMUL_CONTINUE;
}

static int emulator_cmpxchg_emulated(unsigned long addr,
				     const void *old,
				     const void *new,
				     unsigned int bytes,
				     struct x86_emulate_ctxt *ctxt)
{
	static int reported;

	if (!reported) {
		reported = 1;
		printk(KERN_WARNING "kvm: emulating exchange as write\n");
	}
	return emulator_write_emulated(addr, new, bytes, ctxt);
}

static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	return kvm_arch_ops->get_segment_base(vcpu, seg);
}

int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
	return X86EMUL_CONTINUE;
}

int emulate_clts(struct kvm_vcpu *vcpu)
{
	unsigned long cr0;

	cr0 = vcpu->cr0 & ~CR0_TS_MASK;
	kvm_arch_ops->set_cr0(vcpu, cr0);
	return X86EMUL_CONTINUE;
}

int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;

	switch (dr) {
	case 0 ... 3:
		*dest = kvm_arch_ops->get_dr(vcpu, dr);
		return X86EMUL_CONTINUE;
	default:
		printk(KERN_DEBUG "%s: unexpected dr %u\n",
		       __FUNCTION__, dr);
		return X86EMUL_UNHANDLEABLE;
	}
}

int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
	int exception;

	kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}

static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
{
	static int reported;
	u8 opcodes[4];
	unsigned long rip = ctxt->vcpu->rip;
	unsigned long rip_linear;

	rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);

	if (reported)
		return;

	emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);

	printk(KERN_ERR "emulation failed but !mmio_needed?"
	       " rip %lx %02x %02x %02x %02x\n",
	       rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
	reported = 1;
}

struct x86_emulate_ops emulate_ops = {
	.read_std = emulator_read_std,
	.write_std = emulator_write_std,
	.read_emulated = emulator_read_emulated,
	.write_emulated = emulator_write_emulated,
	.cmpxchg_emulated = emulator_cmpxchg_emulated,
};

int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code)
{
	struct x86_emulate_ctxt emulate_ctxt;
	int r;
	int cs_db, cs_l;

	vcpu->mmio_fault_cr2 = cr2;
	kvm_arch_ops->cache_regs(vcpu);

	kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

	emulate_ctxt.vcpu = vcpu;
	emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
	emulate_ctxt.cr2 = cr2;
	emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
		? X86EMUL_MODE_REAL : cs_l
		? X86EMUL_MODE_PROT64 : cs_db
		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;

	if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
		emulate_ctxt.cs_base = 0;
		emulate_ctxt.ds_base = 0;
		emulate_ctxt.es_base = 0;
		emulate_ctxt.ss_base = 0;
	} else {
		emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
		emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
		emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
		emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
	}

	emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
	emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);

	vcpu->mmio_is_write = 0;
	r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);

	if ((r || vcpu->mmio_is_write) && run) {
		run->mmio.phys_addr = vcpu->mmio_phys_addr;
		memcpy(run->mmio.data, vcpu->mmio_data, 8);
		run->mmio.len = vcpu->mmio_size;
		run->mmio.is_write = vcpu->mmio_is_write;
	}

	if (r) {
		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
			return EMULATE_DONE;
		if (!vcpu->mmio_needed) {
			report_emulation_failure(&emulate_ctxt);
			return EMULATE_FAIL;
		}
		return EMULATE_DO_MMIO;
	}

	kvm_arch_ops->decache_regs(vcpu);
	kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);

	if (vcpu->mmio_is_write) {
		vcpu->mmio_needed = 0;
		return EMULATE_DO_MMIO;
	}

	return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(emulate_instruction);

int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long nr, a0, a1, a2, a3, a4, a5, ret;

	kvm_arch_ops->cache_regs(vcpu);
	ret = -KVM_EINVAL;
#ifdef CONFIG_X86_64
	if (is_long_mode(vcpu)) {
		nr = vcpu->regs[VCPU_REGS_RAX];
		a0 = vcpu->regs[VCPU_REGS_RDI];
		a1 = vcpu->regs[VCPU_REGS_RSI];
		a2 = vcpu->regs[VCPU_REGS_RDX];
		a3 = vcpu->regs[VCPU_REGS_RCX];
		a4 = vcpu->regs[VCPU_REGS_R8];
		a5 = vcpu->regs[VCPU_REGS_R9];
	} else
#endif
	{
		nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
		a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
		a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
		a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
		a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
		a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
		a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
	}
	switch (nr) {
	default:
		run->hypercall.args[0] = a0;
		run->hypercall.args[1] = a1;
		run->hypercall.args[2] = a2;
		run->hypercall.args[3] = a3;
		run->hypercall.args[4] = a4;
		run->hypercall.args[5] = a5;
		run->hypercall.ret = ret;
		run->hypercall.longmode = is_long_mode(vcpu);
		kvm_arch_ops->decache_regs(vcpu);
		return 0;
	}
	vcpu->regs[VCPU_REGS_RAX] = ret;
	kvm_arch_ops->decache_regs(vcpu);
	return 1;
}
EXPORT_SYMBOL_GPL(kvm_hypercall);

static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}

void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
	struct descriptor_table dt = { limit, base };

	kvm_arch_ops->set_gdt(vcpu, &dt);
}

void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
	struct descriptor_table dt = { limit, base };

	kvm_arch_ops->set_idt(vcpu, &dt);
}

void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
		   unsigned long *rflags)
{
	lmsw(vcpu, msw);
	*rflags = kvm_arch_ops->get_rflags(vcpu);
}

unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
{
	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
	switch (cr) {
	case 0:
		return vcpu->cr0;
	case 2:
		return vcpu->cr2;
	case 3:
		return vcpu->cr3;
	case 4:
		return vcpu->cr4;
	default:
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
		return 0;
	}
}

void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
		     unsigned long *rflags)
{
	switch (cr) {
	case 0:
		set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
		*rflags = kvm_arch_ops->get_rflags(vcpu);
		break;
	case 2:
		vcpu->cr2 = val;
		break;
	case 3:
		set_cr3(vcpu, val);
		break;
	case 4:
		set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
		break;
	default:
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
	}
}
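
/*
 * Paravirtual registration handshake: the guest probes for KVM by writing
 * the guest-physical address of its para_state page to MSR_KVM_API_MAGIC
 * (see kvm_set_msr_common() below), which lands in vcpu_register_para().
 */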

/*
 * Register the para guest with the host:
 */
static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
{
	struct kvm_vcpu_para_state *para_state;
	hpa_t para_state_hpa, hypercall_hpa;
	struct page *para_state_page;
	unsigned char *hypercall;
	gpa_t hypercall_gpa;

	printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
	printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);

	/*
	 * Needs to be page aligned:
	 */
	if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
		goto err_gp;

	para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
	printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
	if (is_error_hpa(para_state_hpa))
		goto err_gp;

	mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
	para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
	para_state = kmap_atomic(para_state_page, KM_USER0);

	printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
	printk(KERN_DEBUG ".... size: %d\n", para_state->size);

	para_state->host_version = KVM_PARA_API_VERSION;
	/*
	 * We cannot support guests that try to register themselves
	 * with a newer API version than the host supports:
	 */
	if (para_state->guest_version > KVM_PARA_API_VERSION) {
		para_state->ret = -KVM_EINVAL;
		goto err_kunmap_skip;
	}

	hypercall_gpa = para_state->hypercall_gpa;
	hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
	printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
	if (is_error_hpa(hypercall_hpa)) {
		para_state->ret = -KVM_EINVAL;
		goto err_kunmap_skip;
	}

	printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
	vcpu->para_state_page = para_state_page;
	vcpu->para_state_gpa = para_state_gpa;
	vcpu->hypercall_gpa = hypercall_gpa;

	mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
	hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
				KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
	kvm_arch_ops->patch_hypercall(vcpu, hypercall);
	kunmap_atomic(hypercall, KM_USER1);

	para_state->ret = 0;
err_kunmap_skip:
	kunmap_atomic(para_state, KM_USER0);
	return 0;
err_gp:
	return 1;
}

int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;

	switch (msr) {
	case 0xc0010010: /* SYSCFG */
	case 0xc0010015: /* HWCR */
	case MSR_IA32_PLATFORM_ID:
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MC0_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MC0_MISC:
	case MSR_IA32_MC0_MISC+4:
	case MSR_IA32_MC0_MISC+8:
	case MSR_IA32_MC0_MISC+12:
	case MSR_IA32_MC0_MISC+16:
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_PERF_STATUS:
	/* MTRR registers */
	case 0xfe:
	case 0x200 ... 0x2ff:
		data = 0;
		break;
	case 0xcd: /* fsb frequency */
		data = 3;
		break;
	case MSR_IA32_APICBASE:
		data = vcpu->apic_base;
		break;
	case MSR_IA32_MISC_ENABLE:
		data = vcpu->ia32_misc_enable_msr;
		break;
#ifdef CONFIG_X86_64
	case MSR_EFER:
		data = vcpu->shadow_efer;
		break;
#endif
	default:
		printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
		return 1;
	}
	*pdata = data;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);

/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
}

#ifdef CONFIG_X86_64

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & EFER_RESERVED_BITS) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		inject_gp(vcpu);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		inject_gp(vcpu);
		return;
	}

	kvm_arch_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->shadow_efer & EFER_LMA;

	vcpu->shadow_efer = efer;
}

#endif

int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
#ifdef CONFIG_X86_64
	case MSR_EFER:
		set_efer(vcpu, data);
		break;
#endif
	case MSR_IA32_MC0_STATUS:
		printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
		       __FUNCTION__, data);
		break;
	case MSR_IA32_MCG_STATUS:
		printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
		       __FUNCTION__, data);
		break;
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case 0x200 ... 0x2ff: /* MTRRs */
		break;
	case MSR_IA32_APICBASE:
		vcpu->apic_base = data;
		break;
	case MSR_IA32_MISC_ENABLE:
		vcpu->ia32_misc_enable_msr = data;
		break;
	/*
	 * This is the 'probe whether the host is KVM' logic:
	 */
	case MSR_KVM_API_MAGIC:
		return vcpu_register_para(vcpu, data);

	default:
		printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);

/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_arch_ops->set_msr(vcpu, msr_index, data);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	vcpu_put(vcpu);
	cond_resched();
	vcpu_load(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_resched);

void load_msrs(struct vmx_msr_entry *e, int n)
{
	int i;

	for (i = 0; i < n; ++i)
		wrmsrl(e[i].index, e[i].data);
}
EXPORT_SYMBOL_GPL(load_msrs);

void save_msrs(struct vmx_msr_entry *e, int n)
{
	int i;

	for (i = 0; i < n; ++i)
		rdmsrl(e[i].index, e[i].data);
}
EXPORT_SYMBOL_GPL(save_msrs);

void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	int i;
	u32 function;
	struct kvm_cpuid_entry *e, *best;

	kvm_arch_ops->cache_regs(vcpu);
	function = vcpu->regs[VCPU_REGS_RAX];
	vcpu->regs[VCPU_REGS_RAX] = 0;
	vcpu->regs[VCPU_REGS_RBX] = 0;
	vcpu->regs[VCPU_REGS_RCX] = 0;
	vcpu->regs[VCPU_REGS_RDX] = 0;
	best = NULL;
	for (i = 0; i < vcpu->cpuid_nent; ++i) {
		e = &vcpu->cpuid_entries[i];
		if (e->function == function) {
			best = e;
			break;
		}
		/*
		 * Both basic or both extended?
		 */
		if (((e->function ^ function) & 0x80000000) == 0)
			if (!best || e->function > best->function)
				best = e;
	}
	if (best) {
		vcpu->regs[VCPU_REGS_RAX] = best->eax;
		vcpu->regs[VCPU_REGS_RBX] = best->ebx;
		vcpu->regs[VCPU_REGS_RCX] = best->ecx;
		vcpu->regs[VCPU_REGS_RDX] = best->edx;
	}
	kvm_arch_ops->decache_regs(vcpu);
	kvm_arch_ops->skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
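
/*
 * String PIO data travels through the pio_data page shared with userspace
 * (mapped at KVM_PIO_PAGE_OFFSET, see kvm_vcpu_nopage() below);
 * pio_copy_data() maps the pinned guest pages with vmap() and copies in
 * whichever direction the I/O calls for.
 */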

static int pio_copy_data(struct kvm_vcpu *vcpu)
{
	void *p = vcpu->pio_data;
	void *q;
	unsigned bytes;
	int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;

	kvm_arch_ops->vcpu_put(vcpu);
	q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
		 PAGE_KERNEL);
	if (!q) {
		kvm_arch_ops->vcpu_load(vcpu);
		free_pio_guest_pages(vcpu);
		return -ENOMEM;
	}
	q += vcpu->pio.guest_page_offset;
	bytes = vcpu->pio.size * vcpu->pio.cur_count;
	if (vcpu->pio.in)
		memcpy(q, p, bytes);
	else
		memcpy(p, q, bytes);
	q -= vcpu->pio.guest_page_offset;
	vunmap(q);
	kvm_arch_ops->vcpu_load(vcpu);
	free_pio_guest_pages(vcpu);
	return 0;
}

static int complete_pio(struct kvm_vcpu *vcpu)
{
	struct kvm_pio_request *io = &vcpu->pio;
	long delta;
	int r;

	kvm_arch_ops->cache_regs(vcpu);

	if (!io->string) {
		if (io->in)
			memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
			       io->size);
	} else {
		if (io->in) {
			r = pio_copy_data(vcpu);
			if (r) {
				kvm_arch_ops->cache_regs(vcpu);
				return r;
			}
		}

		delta = 1;
		if (io->rep) {
			delta *= io->cur_count;
			/*
			 * The size of the register should really depend on
			 * current address size.
			 */
			vcpu->regs[VCPU_REGS_RCX] -= delta;
		}
		if (io->down)
			delta = -delta;
		delta *= io->size;
		if (io->in)
			vcpu->regs[VCPU_REGS_RDI] += delta;
		else
			vcpu->regs[VCPU_REGS_RSI] += delta;
	}

	kvm_arch_ops->decache_regs(vcpu);

	io->count -= io->cur_count;
	io->cur_count = 0;

	if (!io->count)
		kvm_arch_ops->skip_emulated_instruction(vcpu);
	return 0;
}

int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
		  int size, unsigned long count, int string, int down,
		  gva_t address, int rep, unsigned port)
{
	unsigned now, in_page;
	int i;
	int nr_pages = 1;
	struct page *page;

	vcpu->run->exit_reason = KVM_EXIT_IO;
	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
	vcpu->run->io.size = size;
	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
	vcpu->run->io.count = count;
	vcpu->run->io.port = port;
	vcpu->pio.count = count;
	vcpu->pio.cur_count = count;
	vcpu->pio.size = size;
	vcpu->pio.in = in;
	vcpu->pio.string = string;
	vcpu->pio.down = down;
	vcpu->pio.guest_page_offset = offset_in_page(address);
	vcpu->pio.rep = rep;

	if (!string) {
		kvm_arch_ops->cache_regs(vcpu);
		memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
		kvm_arch_ops->decache_regs(vcpu);
		return 0;
	}

	if (!count) {
		kvm_arch_ops->skip_emulated_instruction(vcpu);
		return 1;
	}

	now = min(count, PAGE_SIZE / size);

	if (!down)
		in_page = PAGE_SIZE - offset_in_page(address);
	else
		in_page = offset_in_page(address) + size;
	now = min(count, (unsigned long)in_page / size);
	if (!now) {
		/*
		 * String I/O straddles page boundary.  Pin two guest pages
		 * so that we satisfy atomicity constraints.  Do just one
		 * transaction to avoid complexity.
		 */
		nr_pages = 2;
		now = 1;
	}
	if (down) {
		/*
		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
		 */
		printk(KERN_ERR "kvm: guest string pio down\n");
		inject_gp(vcpu);
		return 1;
	}
	vcpu->run->io.count = now;
	vcpu->pio.cur_count = now;

	for (i = 0; i < nr_pages; ++i) {
		spin_lock(&vcpu->kvm->lock);
		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
		if (page)
			get_page(page);
		vcpu->pio.guest_pages[i] = page;
		spin_unlock(&vcpu->kvm->lock);
		if (!page) {
			inject_gp(vcpu);
			free_pio_guest_pages(vcpu);
			return 1;
		}
	}

	if (!vcpu->pio.in)
		return pio_copy_data(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_setup_pio);

static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	int r;
	sigset_t sigsaved;

	vcpu_load(vcpu);

	if (vcpu->sigset_active)
		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

	/* re-sync apic's tpr */
	vcpu->cr8 = kvm_run->cr8;

	if (vcpu->pio.cur_count) {
		r = complete_pio(vcpu);
		if (r)
			goto out;
	}

	if (vcpu->mmio_needed) {
		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
		vcpu->mmio_read_completed = 1;
		vcpu->mmio_needed = 0;
		r = emulate_instruction(vcpu, kvm_run,
					vcpu->mmio_fault_cr2, 0);
		if (r == EMULATE_DO_MMIO) {
			/*
			 * Read-modify-write.  Back to userspace.
			 */
			kvm_run->exit_reason = KVM_EXIT_MMIO;
			r = 0;
			goto out;
		}
	}

	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
		kvm_arch_ops->cache_regs(vcpu);
		vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
		kvm_arch_ops->decache_regs(vcpu);
	}

	r = kvm_arch_ops->run(vcpu, kvm_run);

out:
	if (vcpu->sigset_active)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	vcpu_put(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
				   struct kvm_regs *regs)
{
	vcpu_load(vcpu);

	kvm_arch_ops->cache_regs(vcpu);

	regs->rax = vcpu->regs[VCPU_REGS_RAX];
	regs->rbx = vcpu->regs[VCPU_REGS_RBX];
	regs->rcx = vcpu->regs[VCPU_REGS_RCX];
	regs->rdx = vcpu->regs[VCPU_REGS_RDX];
	regs->rsi = vcpu->regs[VCPU_REGS_RSI];
	regs->rdi = vcpu->regs[VCPU_REGS_RDI];
	regs->rsp = vcpu->regs[VCPU_REGS_RSP];
	regs->rbp = vcpu->regs[VCPU_REGS_RBP];
#ifdef CONFIG_X86_64
	regs->r8 = vcpu->regs[VCPU_REGS_R8];
	regs->r9 = vcpu->regs[VCPU_REGS_R9];
	regs->r10 = vcpu->regs[VCPU_REGS_R10];
	regs->r11 = vcpu->regs[VCPU_REGS_R11];
	regs->r12 = vcpu->regs[VCPU_REGS_R12];
	regs->r13 = vcpu->regs[VCPU_REGS_R13];
	regs->r14 = vcpu->regs[VCPU_REGS_R14];
	regs->r15 = vcpu->regs[VCPU_REGS_R15];
#endif

	regs->rip = vcpu->rip;
	regs->rflags = kvm_arch_ops->get_rflags(vcpu);

	/*
	 * Don't leak debug flags in case they were set for guest debugging
	 */
	if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);

	vcpu_put(vcpu);

	return 0;
}

static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
				   struct kvm_regs *regs)
{
	vcpu_load(vcpu);

	vcpu->regs[VCPU_REGS_RAX] = regs->rax;
	vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
	vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
	vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
	vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
	vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
	vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
	vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
#ifdef CONFIG_X86_64
	vcpu->regs[VCPU_REGS_R8] = regs->r8;
	vcpu->regs[VCPU_REGS_R9] = regs->r9;
	vcpu->regs[VCPU_REGS_R10] = regs->r10;
	vcpu->regs[VCPU_REGS_R11] = regs->r11;
	vcpu->regs[VCPU_REGS_R12] = regs->r12;
	vcpu->regs[VCPU_REGS_R13] = regs->r13;
	vcpu->regs[VCPU_REGS_R14] = regs->r14;
	vcpu->regs[VCPU_REGS_R15] = regs->r15;
#endif

	vcpu->rip = regs->rip;
	kvm_arch_ops->set_rflags(vcpu, regs->rflags);

	kvm_arch_ops->decache_regs(vcpu);

	vcpu_put(vcpu);

	return 0;
}

static void get_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
{
	return kvm_arch_ops->get_segment(vcpu, var, seg);
}

static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				    struct kvm_sregs *sregs)
{
	struct descriptor_table dt;

	vcpu_load(vcpu);

	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	kvm_arch_ops->get_idt(vcpu, &dt);
	sregs->idt.limit = dt.limit;
	sregs->idt.base = dt.base;
	kvm_arch_ops->get_gdt(vcpu, &dt);
	sregs->gdt.limit = dt.limit;
	sregs->gdt.base = dt.base;

	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
	sregs->cr0 = vcpu->cr0;
	sregs->cr2 = vcpu->cr2;
	sregs->cr3 = vcpu->cr3;
	sregs->cr4 = vcpu->cr4;
	sregs->cr8 = vcpu->cr8;
	sregs->efer = vcpu->shadow_efer;
	sregs->apic_base = vcpu->apic_base;

	memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
	       sizeof sregs->interrupt_bitmap);

	vcpu_put(vcpu);

	return 0;
}

static void set_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
{
	return kvm_arch_ops->set_segment(vcpu, var, seg);
}

static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				    struct kvm_sregs *sregs)
{
	int mmu_reset_needed = 0;
	int i;
	struct descriptor_table dt;

	vcpu_load(vcpu);

	dt.limit = sregs->idt.limit;
	dt.base = sregs->idt.base;
	kvm_arch_ops->set_idt(vcpu, &dt);
	dt.limit = sregs->gdt.limit;
	dt.base = sregs->gdt.base;
	kvm_arch_ops->set_gdt(vcpu, &dt);

	vcpu->cr2 = sregs->cr2;
	mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
	vcpu->cr3 = sregs->cr3;

	vcpu->cr8 = sregs->cr8;

	mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
#ifdef CONFIG_X86_64
	kvm_arch_ops->set_efer(vcpu, sregs->efer);
#endif
	vcpu->apic_base = sregs->apic_base;

	kvm_arch_ops->decache_cr4_guest_bits(vcpu);

	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
	kvm_arch_ops->set_cr0(vcpu, sregs->cr0);

	mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
	kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
	if (!is_long_mode(vcpu) && is_pae(vcpu))
		load_pdptrs(vcpu, vcpu->cr3);

	if (mmu_reset_needed)
		kvm_mmu_reset_context(vcpu);

	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
	       sizeof vcpu->irq_pending);
	vcpu->irq_summary = 0;
	for (i = 0; i < NR_IRQ_WORDS; ++i)
		if (vcpu->irq_pending[i])
			__set_bit(i, &vcpu->irq_summary);

	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	vcpu_put(vcpu);

	return 0;
}

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu.
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TIME_STAMP_COUNTER,
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

static __init void kvm_init_msr_list(void)
{
	u32 dummy[2];
	unsigned i, j;

	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return set_msr(vcpu, index, *data);
}

/*
 * Read or write a bunch of msrs.  All parameters are kernel addresses.
 *
 * @return number of msrs set successfully.
 */
static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
		    struct kvm_msr_entry *entries,
		    int (*do_msr)(struct kvm_vcpu *vcpu,
				  unsigned index, u64 *data))
{
	int i;

	vcpu_load(vcpu);

	for (i = 0; i < msrs->nmsrs; ++i)
		if (do_msr(vcpu, entries[i].index, &entries[i].data))
			break;

	vcpu_put(vcpu);

	return i;
}

/*
 * Read or write a bunch of msrs.  Parameters are user addresses.
 *
 * @return number of msrs set successfully.
 */
static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
		  int (*do_msr)(struct kvm_vcpu *vcpu,
				unsigned index, u64 *data),
		  int writeback)
{
	struct kvm_msrs msrs;
	struct kvm_msr_entry *entries;
	int r, n;
	unsigned size;

	r = -EFAULT;
	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
		goto out;

	r = -E2BIG;
	if (msrs.nmsrs >= MAX_IO_MSRS)
		goto out;

	r = -ENOMEM;
	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
	entries = vmalloc(size);
	if (!entries)
		goto out;

	r = -EFAULT;
	if (copy_from_user(entries, user_msrs->entries, size))
		goto out_free;

	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
	if (r < 0)
		goto out_free;

	r = -EFAULT;
	if (writeback && copy_to_user(user_msrs->entries, entries, size))
		goto out_free;

	r = n;

out_free:
	vfree(entries);
out:
	return r;
}
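
/*
 * Note the bounce-buffer pattern above: the MSR entry array is copied in
 * from userspace, processed with the vcpu loaded, and copied back only
 * for reads (writeback != 0); the return value is the count processed.
 */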
2151 */ 2152static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 2153 struct kvm_translation *tr) 2154{ 2155 unsigned long vaddr = tr->linear_address; 2156 gpa_t gpa; 2157 2158 vcpu_load(vcpu); 2159 spin_lock(&vcpu->kvm->lock); 2160 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); 2161 tr->physical_address = gpa; 2162 tr->valid = gpa != UNMAPPED_GVA; 2163 tr->writeable = 1; 2164 tr->usermode = 0; 2165 spin_unlock(&vcpu->kvm->lock); 2166 vcpu_put(vcpu); 2167 2168 return 0; 2169} 2170 2171static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2172 struct kvm_interrupt *irq) 2173{ 2174 if (irq->irq < 0 || irq->irq >= 256) 2175 return -EINVAL; 2176 vcpu_load(vcpu); 2177 2178 set_bit(irq->irq, vcpu->irq_pending); 2179 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); 2180 2181 vcpu_put(vcpu); 2182 2183 return 0; 2184} 2185 2186static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 2187 struct kvm_debug_guest *dbg) 2188{ 2189 int r; 2190 2191 vcpu_load(vcpu); 2192 2193 r = kvm_arch_ops->set_guest_debug(vcpu, dbg); 2194 2195 vcpu_put(vcpu); 2196 2197 return r; 2198} 2199 2200static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, 2201 unsigned long address, 2202 int *type) 2203{ 2204 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 2205 unsigned long pgoff; 2206 struct page *page; 2207 2208 *type = VM_FAULT_MINOR; 2209 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2210 if (pgoff == 0) 2211 page = virt_to_page(vcpu->run); 2212 else if (pgoff == KVM_PIO_PAGE_OFFSET) 2213 page = virt_to_page(vcpu->pio_data); 2214 else 2215 return NOPAGE_SIGBUS; 2216 get_page(page); 2217 return page; 2218} 2219 2220static struct vm_operations_struct kvm_vcpu_vm_ops = { 2221 .nopage = kvm_vcpu_nopage, 2222}; 2223 2224static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2225{ 2226 vma->vm_ops = &kvm_vcpu_vm_ops; 2227 return 0; 2228} 2229 2230static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2231{ 2232 struct kvm_vcpu *vcpu = filp->private_data; 2233 2234 fput(vcpu->kvm->filp); 2235 return 0; 2236} 2237 2238static struct file_operations kvm_vcpu_fops = { 2239 .release = kvm_vcpu_release, 2240 .unlocked_ioctl = kvm_vcpu_ioctl, 2241 .compat_ioctl = kvm_vcpu_ioctl, 2242 .mmap = kvm_vcpu_mmap, 2243}; 2244 2245/* 2246 * Allocates an inode for the vcpu. 2247 */ 2248static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2249{ 2250 int fd, r; 2251 struct inode *inode; 2252 struct file *file; 2253 2254 atomic_inc(&vcpu->kvm->filp->f_count); 2255 inode = kvmfs_inode(&kvm_vcpu_fops); 2256 if (IS_ERR(inode)) { 2257 r = PTR_ERR(inode); 2258 goto out1; 2259 } 2260 2261 file = kvmfs_file(inode, vcpu); 2262 if (IS_ERR(file)) { 2263 r = PTR_ERR(file); 2264 goto out2; 2265 } 2266 2267 r = get_unused_fd(); 2268 if (r < 0) 2269 goto out3; 2270 fd = r; 2271 fd_install(fd, file); 2272 2273 return fd; 2274 2275out3: 2276 fput(file); 2277out2: 2278 iput(inode); 2279out1: 2280 fput(vcpu->kvm->filp); 2281 return r; 2282} 2283 2284/* 2285 * Creates some virtual cpus. Good luck creating more than one. 
/*
 * Creates some virtual cpus. Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;
	struct page *page;

	r = -EINVAL;
	if (!valid_vcpu(n))
		goto out;

	vcpu = &kvm->vcpus[n];

	mutex_lock(&vcpu->mutex);

	if (vcpu->vmcs) {
		mutex_unlock(&vcpu->mutex);
		return -EEXIST;
	}

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	r = -ENOMEM;
	if (!page)
		goto out_unlock;
	vcpu->run = page_address(page);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	r = -ENOMEM;
	if (!page)
		goto out_free_run;
	vcpu->pio_data = page_address(page);

	vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
					   FX_IMAGE_ALIGN);
	vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
	vcpu->cr0 = 0x10;	/* CR0.ET */

	r = kvm_arch_ops->vcpu_create(vcpu);
	if (r < 0)
		goto out_free_vcpus;

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto out_free_vcpus;

	kvm_arch_ops->vcpu_load(vcpu);
	r = kvm_mmu_setup(vcpu);
	if (r >= 0)
		r = kvm_arch_ops->vcpu_setup(vcpu);
	vcpu_put(vcpu);

	if (r < 0)
		goto out_free_vcpus;

	r = create_vcpu_fd(vcpu);
	if (r < 0)
		goto out_free_vcpus;

	return r;

out_free_vcpus:
	kvm_free_vcpu(vcpu);
out_free_run:
	free_page((unsigned long)vcpu->run);
	vcpu->run = NULL;
out_unlock:
	mutex_unlock(&vcpu->mutex);
out:
	return r;
}

static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
				    struct kvm_cpuid *cpuid,
				    struct kvm_cpuid_entry __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -EFAULT;
	if (copy_from_user(&vcpu->cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
		goto out;
	vcpu->cpuid_nent = cpuid->nent;
	return 0;

out:
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

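/*
 * Illustrative sketch (not part of this module): how userspace might
 * use KVM_SET_SIGNAL_MASK, which lands in kvm_vcpu_ioctl_set_sigmask()
 * above.  The mask given here is the one in effect *while* KVM_RUN
 * executes, so a signal left unblocked in it (but blocked in the
 * thread's normal mask) reliably kicks the vcpu out of guest mode.
 * Note that len must match the kernel's sigset_t size, which can be
 * smaller than glibc's sigset_t.  Error handling elided.
 *
 *	struct {
 *		struct kvm_signal_mask hdr;
 *		sigset_t set;
 *	} mask = { .hdr = { .len = 8 } };	// kernel sigset_t size on x86-64
 *
 *	sigfillset(&mask.set);
 *	sigdelset(&mask.set, SIGUSR1);	// deliverable only inside KVM_RUN
 *	ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, &mask);
 */
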
/*
 * fxsave fpu state.  Taken from x86_64/processor.h; to be removed once
 * a unified asm/x86/processor.h exists.
 */
struct fxsave {
	u16 cwd;
	u16 swd;
	u16 twd;
	u16 fop;
	u64 rip;
	u64 rdp;
	u32 mxcsr;
	u32 mxcsr_mask;
	u32 st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
#ifdef CONFIG_X86_64
	u32 xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
#else
	u32 xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
#endif
};

static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fpu->fpr, fxsave->st_space, 128);
	fpu->fcw = fxsave->cwd;
	fpu->fsw = fxsave->swd;
	fpu->ftwx = fxsave->twd;
	fpu->last_opcode = fxsave->fop;
	fpu->last_ip = fxsave->rip;
	fpu->last_dp = fxsave->rdp;
	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fxsave->st_space, fpu->fpr, 128);
	fxsave->cwd = fpu->fcw;
	fxsave->swd = fpu->fsw;
	fxsave->twd = fpu->ftwx;
	fxsave->fop = fpu->last_opcode;
	fxsave->rip = fpu->last_ip;
	fxsave->rdp = fpu->last_dp;
	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

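/*
 * Illustrative sketch (not part of this module): reading the guest fpu
 * state that kvm_vcpu_ioctl_get_fpu() above assembles from the fxsave
 * image.  Assumes a vcpu fd; error handling elided.
 *
 *	struct kvm_fpu fpu;
 *
 *	if (ioctl(vcpu_fd, KVM_GET_FPU, &fpu) == 0)
 *		printf("fcw=%04x fsw=%04x\n", fpu.fcw, fpu.fsw);
 */
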
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r = -EINVAL;

	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs kvm_regs;

		memset(&kvm_regs, 0, sizeof kvm_regs);
		r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs kvm_regs;

		r = -EFAULT;
		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
			goto out;
		r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

		memset(&kvm_sregs, 0, sizeof kvm_sregs);
		r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
			goto out;
		r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_INTERRUPT: {
		struct kvm_interrupt irq;

		r = -EFAULT;
		if (copy_from_user(&irq, argp, sizeof irq))
			goto out;
		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MSRS:
		r = msr_io(vcpu, argp, get_msr, 1);
		break;
	case KVM_SET_MSRS:
		r = msr_io(vcpu, argp, do_set_msr, 0);
		break;
	case KVM_SET_CPUID: {
		struct kvm_cpuid __user *cpuid_arg = argp;
		struct kvm_cpuid cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
		if (r)
			goto out;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		/* p stays NULL when argp is NULL, which clears the mask */
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		struct kvm_fpu fpu;

		memset(&fpu, 0, sizeof fpu);
		r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &fpu, sizeof fpu))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		struct kvm_fpu fpu;

		r = -EFAULT;
		if (copy_from_user(&fpu, argp, sizeof fpu))
			goto out;
		r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		;	/* r stays -EINVAL */
	}
out:
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r = -EINVAL;

	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_MEMORY_REGION: {
		struct kvm_memory_region kvm_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
			goto out;
		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
	case KVM_SET_MEMORY_ALIAS: {
		struct kvm_memory_alias alias;

		r = -EFAULT;
		if (copy_from_user(&alias, argp, sizeof alias))
			goto out;
		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
		if (r)
			goto out;
		break;
	}
	default:
		;	/* r stays -EINVAL */
	}
out:
	return r;
}

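/*
 * Illustrative sketch (not part of this module): the vm-level ioctls
 * dispatched above, as userspace typically issues them when bringing
 * up a guest.  ram_size is a hypothetical byte count; error handling
 * elided.
 *
 *	struct kvm_memory_region mem = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = ram_size,
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_MEMORY_REGION, &mem);
 *	ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 */
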
static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
				  unsigned long address,
				  int *type)
{
	struct kvm *kvm = vma->vm_file->private_data;
	unsigned long pgoff;
	struct page *page;

	*type = VM_FAULT_MINOR;
	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	page = gfn_to_page(kvm, pgoff);
	if (!page)
		return NOPAGE_SIGBUS;
	get_page(page);
	return page;
}

static struct vm_operations_struct kvm_vm_vm_ops = {
	.nopage = kvm_vm_nopage,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
	int fd, r;
	struct inode *inode;
	struct file *file;
	struct kvm *kvm;

	inode = kvmfs_inode(&kvm_vm_fops);
	if (IS_ERR(inode)) {
		r = PTR_ERR(inode);
		goto out1;
	}

	kvm = kvm_create_vm();
	if (IS_ERR(kvm)) {
		r = PTR_ERR(kvm);
		goto out2;
	}

	file = kvmfs_file(inode, kvm);
	if (IS_ERR(file)) {
		r = PTR_ERR(file);
		goto out3;
	}
	kvm->filp = file;

	r = get_unused_fd();
	if (r < 0)
		goto out4;
	fd = r;
	fd_install(fd, file);

	return fd;

out4:
	fput(file);
out3:
	kvm_destroy_vm(kvm);
out2:
	iput(inode);
out1:
	return r;
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_GET_MSR_INDEX_LIST: {
		struct kvm_msr_list __user *user_msr_list = argp;
		struct kvm_msr_list msr_list;
		unsigned n;

		r = -EFAULT;
		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
			goto out;
		n = msr_list.nmsrs;
		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
			goto out;
		r = -E2BIG;
		if (n < msr_list.nmsrs)
			goto out;
		r = -EFAULT;
		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				 num_msrs_to_save * sizeof(u32)))
			goto out;
		/*
		 * indices is an array of u32, so pointer arithmetic
		 * already scales by the element size.
		 */
		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
				 &emulated_msrs,
				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
			goto out;
		r = 0;
		break;
	}
	case KVM_CHECK_EXTENSION:
		/*
		 * No extensions defined at present.
		 */
		r = 0;
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = 2 * PAGE_SIZE;
		break;
	default:
		;	/* r stays -EINVAL */
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.open           = kvm_dev_open,
	.release        = kvm_dev_release,
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

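/*
 * Illustrative sketch (not part of this module): the /dev/kvm character
 * device defined above (registered in kvm_init_arch() below) is the
 * entry point for everything else.  A typical userspace bootstrap, with
 * error handling elided:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *
 *	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		exit(1);	// incompatible kernel module
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 */
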
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	if (val == SYS_RESTART) {
		/*
		 * Some (well, at least mine) BIOSes hang on reboot if
		 * the cpu is in vmx root mode.
		 */
		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
		on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

/*
 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
 * cached on it.
 */
static void decache_vcpus_on_cpu(int cpu)
{
	struct kvm *vm;
	struct kvm_vcpu *vcpu;
	int i;

	spin_lock(&kvm_lock);
	list_for_each_entry(vm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = &vm->vcpus[i];
			/*
			 * If the vcpu is locked, then it is running on some
			 * other cpu and therefore it is not cached on the
			 * cpu in question.
			 *
			 * If it's not locked, check the last cpu it executed
			 * on.
			 */
			if (mutex_trylock(&vcpu->mutex)) {
				if (vcpu->cpu == cpu) {
					kvm_arch_ops->vcpu_decache(vcpu);
					vcpu->cpu = -1;
				}
				mutex_unlock(&vcpu->mutex);
			}
		}
	spin_unlock(&kvm_lock);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	switch (val) {
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		decache_vcpus_on_cpu(cpu);
		smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
					 NULL, 0, 1);
		break;
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
					 NULL, 0, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static u64 stat_get(void *_offset)
{
	unsigned offset = (long)_offset;
	u64 total = 0;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = &kvm->vcpus[i];
			total += *(u32 *)((void *)vcpu + offset);
		}
	spin_unlock(&kvm_lock);
	return total;
}

static void stat_set(void *offset, u64 val)
{
	/* The stats are read-only; writes are silently ignored. */
}

DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");

static __init void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
						(void *)(long)p->offset,
						&stat_fops);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	decache_vcpus_on_cpu(raw_smp_processor_id());
	on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
	return 0;
}

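/*
 * Illustrative sketch (not part of this module): the counters exported
 * by kvm_init_debug() above appear as read-only decimal files, one per
 * debugfs_entries[] slot, summed over all vcpus by stat_get().  With
 * debugfs mounted at /sys/kernel/debug (an assumption about the host's
 * setup), userspace can poll one like this; error handling elided:
 *
 *	char buf[32];
 *	int fd = open("/sys/kernel/debug/kvm/exits", O_RDONLY);
 *
 *	read(fd, buf, sizeof buf - 1);	// e.g. "123456\n"
 */
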
static struct sysdev_class kvm_sysdev_class = {
	set_kset_name("kvm"),
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

hpa_t bad_page_address;

static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
			const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
}

static struct file_system_type kvm_fs_type = {
	.name		= "kvmfs",
	.get_sb		= kvmfs_get_sb,
	.kill_sb	= kill_anon_super,
};

int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
{
	int r;

	if (kvm_arch_ops) {
		printk(KERN_ERR "kvm: already loaded the other module\n");
		return -EEXIST;
	}

	if (!ops->cpu_has_kvm_support()) {
		printk(KERN_ERR "kvm: no hardware support\n");
		return -EOPNOTSUPP;
	}
	if (ops->disabled_by_bios()) {
		printk(KERN_ERR "kvm: disabled by bios\n");
		return -EOPNOTSUPP;
	}

	kvm_arch_ops = ops;

	r = kvm_arch_ops->hardware_setup();
	if (r < 0)
		goto out;

	on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_1;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_2;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_3;

	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	return r;

out_free:
	sysdev_unregister(&kvm_sysdev);
out_free_3:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_2:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_1:
	on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
	kvm_arch_ops->hardware_unsetup();
out:
	kvm_arch_ops = NULL;
	return r;
}

void kvm_exit_arch(void)
{
	misc_deregister(&kvm_dev);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
	kvm_arch_ops->hardware_unsetup();
	kvm_arch_ops = NULL;
}

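/*
 * Illustrative sketch (not part of this module): kvm_init_arch() and
 * kvm_exit_arch() above are meant to be called from an arch module's
 * init/exit hooks, roughly the way kvm-intel's vmx.c does it:
 *
 *	static int __init vmx_init(void)
 *	{
 *		return kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
 *	}
 *
 *	static void __exit vmx_exit(void)
 *	{
 *		kvm_exit_arch();
 *	}
 */
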
static __init int kvm_init(void)
{
	static struct page *bad_page;
	int r;

	r = kvm_mmu_module_init();
	if (r)
		goto out4;

	r = register_filesystem(&kvm_fs_type);
	if (r)
		goto out3;

	kvmfs_mnt = kern_mount(&kvm_fs_type);
	r = PTR_ERR(kvmfs_mnt);
	if (IS_ERR(kvmfs_mnt))
		goto out2;
	kvm_init_debug();

	kvm_init_msr_list();

	if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
	memset(__va(bad_page_address), 0, PAGE_SIZE);

	return 0;

out:
	kvm_exit_debug();
	mntput(kvmfs_mnt);
out2:
	unregister_filesystem(&kvm_fs_type);
out3:
	kvm_mmu_module_exit();
out4:
	return r;
}

static __exit void kvm_exit(void)
{
	kvm_exit_debug();
	__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
	mntput(kvmfs_mnt);
	unregister_filesystem(&kvm_fs_type);
	kvm_mmu_module_exit();
}

module_init(kvm_init)
module_exit(kvm_exit)

EXPORT_SYMBOL_GPL(kvm_init_arch);
EXPORT_SYMBOL_GPL(kvm_exit_arch);