Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.2 10450 lines 302 kB view raw
1/* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19#include "irq.h" 20#include "mmu.h" 21#include "cpuid.h" 22 23#include <linux/kvm_host.h> 24#include <linux/module.h> 25#include <linux/kernel.h> 26#include <linux/mm.h> 27#include <linux/highmem.h> 28#include <linux/sched.h> 29#include <linux/moduleparam.h> 30#include <linux/mod_devicetable.h> 31#include <linux/trace_events.h> 32#include <linux/slab.h> 33#include <linux/tboot.h> 34#include <linux/hrtimer.h> 35#include "kvm_cache_regs.h" 36#include "x86.h" 37 38#include <asm/io.h> 39#include <asm/desc.h> 40#include <asm/vmx.h> 41#include <asm/virtext.h> 42#include <asm/mce.h> 43#include <asm/fpu/internal.h> 44#include <asm/perf_event.h> 45#include <asm/debugreg.h> 46#include <asm/kexec.h> 47#include <asm/apic.h> 48 49#include "trace.h" 50#include "pmu.h" 51 52#define __ex(x) __kvm_handle_fault_on_reboot(x) 53#define __ex_clear(x, reg) \ 54 ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) 55 56MODULE_AUTHOR("Qumranet"); 57MODULE_LICENSE("GPL"); 58 59static const struct x86_cpu_id vmx_cpu_id[] = { 60 X86_FEATURE_MATCH(X86_FEATURE_VMX), 61 {} 62}; 63MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 64 65static bool __read_mostly enable_vpid = 1; 66module_param_named(vpid, enable_vpid, bool, 0444); 67 68static bool __read_mostly flexpriority_enabled = 1; 69module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); 70 71static bool __read_mostly enable_ept = 1; 72module_param_named(ept, enable_ept, bool, S_IRUGO); 73 74static bool __read_mostly enable_unrestricted_guest = 1; 75module_param_named(unrestricted_guest, 76 enable_unrestricted_guest, bool, S_IRUGO); 77 78static bool __read_mostly enable_ept_ad_bits = 1; 79module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); 80 81static bool __read_mostly emulate_invalid_guest_state = true; 82module_param(emulate_invalid_guest_state, bool, S_IRUGO); 83 84static bool __read_mostly vmm_exclusive = 1; 85module_param(vmm_exclusive, bool, S_IRUGO); 86 87static bool __read_mostly fasteoi = 1; 88module_param(fasteoi, bool, S_IRUGO); 89 90static bool __read_mostly enable_apicv = 1; 91module_param(enable_apicv, bool, S_IRUGO); 92 93static bool __read_mostly enable_shadow_vmcs = 1; 94module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 95/* 96 * If nested=1, nested virtualization is supported, i.e., guests may use 97 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 98 * use VMX instructions. 99 */ 100static bool __read_mostly nested = 0; 101module_param(nested, bool, S_IRUGO); 102 103static u64 __read_mostly host_xss; 104 105static bool __read_mostly enable_pml = 1; 106module_param_named(pml, enable_pml, bool, S_IRUGO); 107 108#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) 109#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) 110#define KVM_VM_CR0_ALWAYS_ON \ 111 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 112#define KVM_CR4_GUEST_OWNED_BITS \ 113 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 114 | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) 115 116#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 117#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 118 119#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 120 121#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 122 123/* 124 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 125 * ple_gap: upper bound on the amount of time between two successive 126 * executions of PAUSE in a loop. Also indicate if ple enabled. 127 * According to test, this time is usually smaller than 128 cycles. 128 * ple_window: upper bound on the amount of time a guest is allowed to execute 129 * in a PAUSE loop. Tests indicate that most spinlocks are held for 130 * less than 2^12 cycles 131 * Time is measured based on a counter that runs at the same rate as the TSC, 132 * refer SDM volume 3b section 21.6.13 & 22.1.3. 133 */ 134#define KVM_VMX_DEFAULT_PLE_GAP 128 135#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 136#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 137#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 138#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ 139 INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW 140 141static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 142module_param(ple_gap, int, S_IRUGO); 143 144static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 145module_param(ple_window, int, S_IRUGO); 146 147/* Default doubles per-vcpu window every exit. */ 148static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; 149module_param(ple_window_grow, int, S_IRUGO); 150 151/* Default resets per-vcpu window every exit to ple_window. */ 152static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; 153module_param(ple_window_shrink, int, S_IRUGO); 154 155/* Default is to compute the maximum so we can never overflow. */ 156static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 157static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 158module_param(ple_window_max, int, S_IRUGO); 159 160extern const ulong vmx_return; 161 162#define NR_AUTOLOAD_MSRS 8 163#define VMCS02_POOL_SIZE 1 164 165struct vmcs { 166 u32 revision_id; 167 u32 abort; 168 char data[0]; 169}; 170 171/* 172 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also 173 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs 174 * loaded on this CPU (so we can clear them if the CPU goes down). 175 */ 176struct loaded_vmcs { 177 struct vmcs *vmcs; 178 int cpu; 179 int launched; 180 struct list_head loaded_vmcss_on_cpu_link; 181}; 182 183struct shared_msr_entry { 184 unsigned index; 185 u64 data; 186 u64 mask; 187}; 188 189/* 190 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a 191 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has 192 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is 193 * stored in guest memory specified by VMPTRLD, but is opaque to the guest, 194 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. 195 * More than one of these structures may exist, if L1 runs multiple L2 guests. 196 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the 197 * underlying hardware which will be used to run L2. 198 * This structure is packed to ensure that its layout is identical across 199 * machines (necessary for live migration). 200 * If there are changes in this struct, VMCS12_REVISION must be changed. 201 */ 202typedef u64 natural_width; 203struct __packed vmcs12 { 204 /* According to the Intel spec, a VMCS region must start with the 205 * following two fields. Then follow implementation-specific data. 206 */ 207 u32 revision_id; 208 u32 abort; 209 210 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ 211 u32 padding[7]; /* room for future expansion */ 212 213 u64 io_bitmap_a; 214 u64 io_bitmap_b; 215 u64 msr_bitmap; 216 u64 vm_exit_msr_store_addr; 217 u64 vm_exit_msr_load_addr; 218 u64 vm_entry_msr_load_addr; 219 u64 tsc_offset; 220 u64 virtual_apic_page_addr; 221 u64 apic_access_addr; 222 u64 posted_intr_desc_addr; 223 u64 ept_pointer; 224 u64 eoi_exit_bitmap0; 225 u64 eoi_exit_bitmap1; 226 u64 eoi_exit_bitmap2; 227 u64 eoi_exit_bitmap3; 228 u64 xss_exit_bitmap; 229 u64 guest_physical_address; 230 u64 vmcs_link_pointer; 231 u64 guest_ia32_debugctl; 232 u64 guest_ia32_pat; 233 u64 guest_ia32_efer; 234 u64 guest_ia32_perf_global_ctrl; 235 u64 guest_pdptr0; 236 u64 guest_pdptr1; 237 u64 guest_pdptr2; 238 u64 guest_pdptr3; 239 u64 guest_bndcfgs; 240 u64 host_ia32_pat; 241 u64 host_ia32_efer; 242 u64 host_ia32_perf_global_ctrl; 243 u64 padding64[8]; /* room for future expansion */ 244 /* 245 * To allow migration of L1 (complete with its L2 guests) between 246 * machines of different natural widths (32 or 64 bit), we cannot have 247 * unsigned long fields with no explict size. We use u64 (aliased 248 * natural_width) instead. Luckily, x86 is little-endian. 249 */ 250 natural_width cr0_guest_host_mask; 251 natural_width cr4_guest_host_mask; 252 natural_width cr0_read_shadow; 253 natural_width cr4_read_shadow; 254 natural_width cr3_target_value0; 255 natural_width cr3_target_value1; 256 natural_width cr3_target_value2; 257 natural_width cr3_target_value3; 258 natural_width exit_qualification; 259 natural_width guest_linear_address; 260 natural_width guest_cr0; 261 natural_width guest_cr3; 262 natural_width guest_cr4; 263 natural_width guest_es_base; 264 natural_width guest_cs_base; 265 natural_width guest_ss_base; 266 natural_width guest_ds_base; 267 natural_width guest_fs_base; 268 natural_width guest_gs_base; 269 natural_width guest_ldtr_base; 270 natural_width guest_tr_base; 271 natural_width guest_gdtr_base; 272 natural_width guest_idtr_base; 273 natural_width guest_dr7; 274 natural_width guest_rsp; 275 natural_width guest_rip; 276 natural_width guest_rflags; 277 natural_width guest_pending_dbg_exceptions; 278 natural_width guest_sysenter_esp; 279 natural_width guest_sysenter_eip; 280 natural_width host_cr0; 281 natural_width host_cr3; 282 natural_width host_cr4; 283 natural_width host_fs_base; 284 natural_width host_gs_base; 285 natural_width host_tr_base; 286 natural_width host_gdtr_base; 287 natural_width host_idtr_base; 288 natural_width host_ia32_sysenter_esp; 289 natural_width host_ia32_sysenter_eip; 290 natural_width host_rsp; 291 natural_width host_rip; 292 natural_width paddingl[8]; /* room for future expansion */ 293 u32 pin_based_vm_exec_control; 294 u32 cpu_based_vm_exec_control; 295 u32 exception_bitmap; 296 u32 page_fault_error_code_mask; 297 u32 page_fault_error_code_match; 298 u32 cr3_target_count; 299 u32 vm_exit_controls; 300 u32 vm_exit_msr_store_count; 301 u32 vm_exit_msr_load_count; 302 u32 vm_entry_controls; 303 u32 vm_entry_msr_load_count; 304 u32 vm_entry_intr_info_field; 305 u32 vm_entry_exception_error_code; 306 u32 vm_entry_instruction_len; 307 u32 tpr_threshold; 308 u32 secondary_vm_exec_control; 309 u32 vm_instruction_error; 310 u32 vm_exit_reason; 311 u32 vm_exit_intr_info; 312 u32 vm_exit_intr_error_code; 313 u32 idt_vectoring_info_field; 314 u32 idt_vectoring_error_code; 315 u32 vm_exit_instruction_len; 316 u32 vmx_instruction_info; 317 u32 guest_es_limit; 318 u32 guest_cs_limit; 319 u32 guest_ss_limit; 320 u32 guest_ds_limit; 321 u32 guest_fs_limit; 322 u32 guest_gs_limit; 323 u32 guest_ldtr_limit; 324 u32 guest_tr_limit; 325 u32 guest_gdtr_limit; 326 u32 guest_idtr_limit; 327 u32 guest_es_ar_bytes; 328 u32 guest_cs_ar_bytes; 329 u32 guest_ss_ar_bytes; 330 u32 guest_ds_ar_bytes; 331 u32 guest_fs_ar_bytes; 332 u32 guest_gs_ar_bytes; 333 u32 guest_ldtr_ar_bytes; 334 u32 guest_tr_ar_bytes; 335 u32 guest_interruptibility_info; 336 u32 guest_activity_state; 337 u32 guest_sysenter_cs; 338 u32 host_ia32_sysenter_cs; 339 u32 vmx_preemption_timer_value; 340 u32 padding32[7]; /* room for future expansion */ 341 u16 virtual_processor_id; 342 u16 posted_intr_nv; 343 u16 guest_es_selector; 344 u16 guest_cs_selector; 345 u16 guest_ss_selector; 346 u16 guest_ds_selector; 347 u16 guest_fs_selector; 348 u16 guest_gs_selector; 349 u16 guest_ldtr_selector; 350 u16 guest_tr_selector; 351 u16 guest_intr_status; 352 u16 host_es_selector; 353 u16 host_cs_selector; 354 u16 host_ss_selector; 355 u16 host_ds_selector; 356 u16 host_fs_selector; 357 u16 host_gs_selector; 358 u16 host_tr_selector; 359}; 360 361/* 362 * VMCS12_REVISION is an arbitrary id that should be changed if the content or 363 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and 364 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. 365 */ 366#define VMCS12_REVISION 0x11e57ed0 367 368/* 369 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region 370 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the 371 * current implementation, 4K are reserved to avoid future complications. 372 */ 373#define VMCS12_SIZE 0x1000 374 375/* Used to remember the last vmcs02 used for some recently used vmcs12s */ 376struct vmcs02_list { 377 struct list_head list; 378 gpa_t vmptr; 379 struct loaded_vmcs vmcs02; 380}; 381 382/* 383 * The nested_vmx structure is part of vcpu_vmx, and holds information we need 384 * for correct emulation of VMX (i.e., nested VMX) on this vcpu. 385 */ 386struct nested_vmx { 387 /* Has the level1 guest done vmxon? */ 388 bool vmxon; 389 gpa_t vmxon_ptr; 390 391 /* The guest-physical address of the current VMCS L1 keeps for L2 */ 392 gpa_t current_vmptr; 393 /* The host-usable pointer to the above */ 394 struct page *current_vmcs12_page; 395 struct vmcs12 *current_vmcs12; 396 struct vmcs *current_shadow_vmcs; 397 /* 398 * Indicates if the shadow vmcs must be updated with the 399 * data hold by vmcs12 400 */ 401 bool sync_shadow_vmcs; 402 403 /* vmcs02_list cache of VMCSs recently used to run L2 guests */ 404 struct list_head vmcs02_pool; 405 int vmcs02_num; 406 u64 vmcs01_tsc_offset; 407 /* L2 must run next, and mustn't decide to exit to L1. */ 408 bool nested_run_pending; 409 /* 410 * Guest pages referred to in vmcs02 with host-physical pointers, so 411 * we must keep them pinned while L2 runs. 412 */ 413 struct page *apic_access_page; 414 struct page *virtual_apic_page; 415 struct page *pi_desc_page; 416 struct pi_desc *pi_desc; 417 bool pi_pending; 418 u16 posted_intr_nv; 419 u64 msr_ia32_feature_control; 420 421 struct hrtimer preemption_timer; 422 bool preemption_timer_expired; 423 424 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ 425 u64 vmcs01_debugctl; 426 427 u32 nested_vmx_procbased_ctls_low; 428 u32 nested_vmx_procbased_ctls_high; 429 u32 nested_vmx_true_procbased_ctls_low; 430 u32 nested_vmx_secondary_ctls_low; 431 u32 nested_vmx_secondary_ctls_high; 432 u32 nested_vmx_pinbased_ctls_low; 433 u32 nested_vmx_pinbased_ctls_high; 434 u32 nested_vmx_exit_ctls_low; 435 u32 nested_vmx_exit_ctls_high; 436 u32 nested_vmx_true_exit_ctls_low; 437 u32 nested_vmx_entry_ctls_low; 438 u32 nested_vmx_entry_ctls_high; 439 u32 nested_vmx_true_entry_ctls_low; 440 u32 nested_vmx_misc_low; 441 u32 nested_vmx_misc_high; 442 u32 nested_vmx_ept_caps; 443}; 444 445#define POSTED_INTR_ON 0 446/* Posted-Interrupt Descriptor */ 447struct pi_desc { 448 u32 pir[8]; /* Posted interrupt requested */ 449 u32 control; /* bit 0 of control is outstanding notification bit */ 450 u32 rsvd[7]; 451} __aligned(64); 452 453static bool pi_test_and_set_on(struct pi_desc *pi_desc) 454{ 455 return test_and_set_bit(POSTED_INTR_ON, 456 (unsigned long *)&pi_desc->control); 457} 458 459static bool pi_test_and_clear_on(struct pi_desc *pi_desc) 460{ 461 return test_and_clear_bit(POSTED_INTR_ON, 462 (unsigned long *)&pi_desc->control); 463} 464 465static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) 466{ 467 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); 468} 469 470struct vcpu_vmx { 471 struct kvm_vcpu vcpu; 472 unsigned long host_rsp; 473 u8 fail; 474 bool nmi_known_unmasked; 475 u32 exit_intr_info; 476 u32 idt_vectoring_info; 477 ulong rflags; 478 struct shared_msr_entry *guest_msrs; 479 int nmsrs; 480 int save_nmsrs; 481 unsigned long host_idt_base; 482#ifdef CONFIG_X86_64 483 u64 msr_host_kernel_gs_base; 484 u64 msr_guest_kernel_gs_base; 485#endif 486 u32 vm_entry_controls_shadow; 487 u32 vm_exit_controls_shadow; 488 /* 489 * loaded_vmcs points to the VMCS currently used in this vcpu. For a 490 * non-nested (L1) guest, it always points to vmcs01. For a nested 491 * guest (L2), it points to a different VMCS. 492 */ 493 struct loaded_vmcs vmcs01; 494 struct loaded_vmcs *loaded_vmcs; 495 bool __launched; /* temporary, used in vmx_vcpu_run */ 496 struct msr_autoload { 497 unsigned nr; 498 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; 499 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; 500 } msr_autoload; 501 struct { 502 int loaded; 503 u16 fs_sel, gs_sel, ldt_sel; 504#ifdef CONFIG_X86_64 505 u16 ds_sel, es_sel; 506#endif 507 int gs_ldt_reload_needed; 508 int fs_reload_needed; 509 u64 msr_host_bndcfgs; 510 unsigned long vmcs_host_cr4; /* May not match real cr4 */ 511 } host_state; 512 struct { 513 int vm86_active; 514 ulong save_rflags; 515 struct kvm_segment segs[8]; 516 } rmode; 517 struct { 518 u32 bitmask; /* 4 bits per segment (1 bit per field) */ 519 struct kvm_save_segment { 520 u16 selector; 521 unsigned long base; 522 u32 limit; 523 u32 ar; 524 } seg[8]; 525 } segment_cache; 526 int vpid; 527 bool emulation_required; 528 529 /* Support for vnmi-less CPUs */ 530 int soft_vnmi_blocked; 531 ktime_t entry_time; 532 s64 vnmi_blocked_time; 533 u32 exit_reason; 534 535 bool rdtscp_enabled; 536 537 /* Posted interrupt descriptor */ 538 struct pi_desc pi_desc; 539 540 /* Support for a guest hypervisor (nested VMX) */ 541 struct nested_vmx nested; 542 543 /* Dynamic PLE window. */ 544 int ple_window; 545 bool ple_window_dirty; 546 547 /* Support for PML */ 548#define PML_ENTITY_NUM 512 549 struct page *pml_pg; 550}; 551 552enum segment_cache_field { 553 SEG_FIELD_SEL = 0, 554 SEG_FIELD_BASE = 1, 555 SEG_FIELD_LIMIT = 2, 556 SEG_FIELD_AR = 3, 557 558 SEG_FIELD_NR = 4 559}; 560 561static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 562{ 563 return container_of(vcpu, struct vcpu_vmx, vcpu); 564} 565 566#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) 567#define FIELD(number, name) [number] = VMCS12_OFFSET(name) 568#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 569 [number##_HIGH] = VMCS12_OFFSET(name)+4 570 571 572static unsigned long shadow_read_only_fields[] = { 573 /* 574 * We do NOT shadow fields that are modified when L0 575 * traps and emulates any vmx instruction (e.g. VMPTRLD, 576 * VMXON...) executed by L1. 577 * For example, VM_INSTRUCTION_ERROR is read 578 * by L1 if a vmx instruction fails (part of the error path). 579 * Note the code assumes this logic. If for some reason 580 * we start shadowing these fields then we need to 581 * force a shadow sync when L0 emulates vmx instructions 582 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified 583 * by nested_vmx_failValid) 584 */ 585 VM_EXIT_REASON, 586 VM_EXIT_INTR_INFO, 587 VM_EXIT_INSTRUCTION_LEN, 588 IDT_VECTORING_INFO_FIELD, 589 IDT_VECTORING_ERROR_CODE, 590 VM_EXIT_INTR_ERROR_CODE, 591 EXIT_QUALIFICATION, 592 GUEST_LINEAR_ADDRESS, 593 GUEST_PHYSICAL_ADDRESS 594}; 595static int max_shadow_read_only_fields = 596 ARRAY_SIZE(shadow_read_only_fields); 597 598static unsigned long shadow_read_write_fields[] = { 599 TPR_THRESHOLD, 600 GUEST_RIP, 601 GUEST_RSP, 602 GUEST_CR0, 603 GUEST_CR3, 604 GUEST_CR4, 605 GUEST_INTERRUPTIBILITY_INFO, 606 GUEST_RFLAGS, 607 GUEST_CS_SELECTOR, 608 GUEST_CS_AR_BYTES, 609 GUEST_CS_LIMIT, 610 GUEST_CS_BASE, 611 GUEST_ES_BASE, 612 GUEST_BNDCFGS, 613 CR0_GUEST_HOST_MASK, 614 CR0_READ_SHADOW, 615 CR4_READ_SHADOW, 616 TSC_OFFSET, 617 EXCEPTION_BITMAP, 618 CPU_BASED_VM_EXEC_CONTROL, 619 VM_ENTRY_EXCEPTION_ERROR_CODE, 620 VM_ENTRY_INTR_INFO_FIELD, 621 VM_ENTRY_INSTRUCTION_LEN, 622 VM_ENTRY_EXCEPTION_ERROR_CODE, 623 HOST_FS_BASE, 624 HOST_GS_BASE, 625 HOST_FS_SELECTOR, 626 HOST_GS_SELECTOR 627}; 628static int max_shadow_read_write_fields = 629 ARRAY_SIZE(shadow_read_write_fields); 630 631static const unsigned short vmcs_field_to_offset_table[] = { 632 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 633 FIELD(POSTED_INTR_NV, posted_intr_nv), 634 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 635 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 636 FIELD(GUEST_SS_SELECTOR, guest_ss_selector), 637 FIELD(GUEST_DS_SELECTOR, guest_ds_selector), 638 FIELD(GUEST_FS_SELECTOR, guest_fs_selector), 639 FIELD(GUEST_GS_SELECTOR, guest_gs_selector), 640 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), 641 FIELD(GUEST_TR_SELECTOR, guest_tr_selector), 642 FIELD(GUEST_INTR_STATUS, guest_intr_status), 643 FIELD(HOST_ES_SELECTOR, host_es_selector), 644 FIELD(HOST_CS_SELECTOR, host_cs_selector), 645 FIELD(HOST_SS_SELECTOR, host_ss_selector), 646 FIELD(HOST_DS_SELECTOR, host_ds_selector), 647 FIELD(HOST_FS_SELECTOR, host_fs_selector), 648 FIELD(HOST_GS_SELECTOR, host_gs_selector), 649 FIELD(HOST_TR_SELECTOR, host_tr_selector), 650 FIELD64(IO_BITMAP_A, io_bitmap_a), 651 FIELD64(IO_BITMAP_B, io_bitmap_b), 652 FIELD64(MSR_BITMAP, msr_bitmap), 653 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), 654 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), 655 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), 656 FIELD64(TSC_OFFSET, tsc_offset), 657 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), 658 FIELD64(APIC_ACCESS_ADDR, apic_access_addr), 659 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), 660 FIELD64(EPT_POINTER, ept_pointer), 661 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), 662 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), 663 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), 664 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), 665 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), 666 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), 667 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), 668 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), 669 FIELD64(GUEST_IA32_PAT, guest_ia32_pat), 670 FIELD64(GUEST_IA32_EFER, guest_ia32_efer), 671 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), 672 FIELD64(GUEST_PDPTR0, guest_pdptr0), 673 FIELD64(GUEST_PDPTR1, guest_pdptr1), 674 FIELD64(GUEST_PDPTR2, guest_pdptr2), 675 FIELD64(GUEST_PDPTR3, guest_pdptr3), 676 FIELD64(GUEST_BNDCFGS, guest_bndcfgs), 677 FIELD64(HOST_IA32_PAT, host_ia32_pat), 678 FIELD64(HOST_IA32_EFER, host_ia32_efer), 679 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), 680 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), 681 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), 682 FIELD(EXCEPTION_BITMAP, exception_bitmap), 683 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), 684 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), 685 FIELD(CR3_TARGET_COUNT, cr3_target_count), 686 FIELD(VM_EXIT_CONTROLS, vm_exit_controls), 687 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), 688 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), 689 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), 690 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), 691 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), 692 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), 693 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), 694 FIELD(TPR_THRESHOLD, tpr_threshold), 695 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), 696 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), 697 FIELD(VM_EXIT_REASON, vm_exit_reason), 698 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), 699 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), 700 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), 701 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), 702 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), 703 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), 704 FIELD(GUEST_ES_LIMIT, guest_es_limit), 705 FIELD(GUEST_CS_LIMIT, guest_cs_limit), 706 FIELD(GUEST_SS_LIMIT, guest_ss_limit), 707 FIELD(GUEST_DS_LIMIT, guest_ds_limit), 708 FIELD(GUEST_FS_LIMIT, guest_fs_limit), 709 FIELD(GUEST_GS_LIMIT, guest_gs_limit), 710 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), 711 FIELD(GUEST_TR_LIMIT, guest_tr_limit), 712 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), 713 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), 714 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), 715 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), 716 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), 717 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), 718 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), 719 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), 720 FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), 721 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), 722 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), 723 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), 724 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), 725 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), 726 FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), 727 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), 728 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), 729 FIELD(CR0_READ_SHADOW, cr0_read_shadow), 730 FIELD(CR4_READ_SHADOW, cr4_read_shadow), 731 FIELD(CR3_TARGET_VALUE0, cr3_target_value0), 732 FIELD(CR3_TARGET_VALUE1, cr3_target_value1), 733 FIELD(CR3_TARGET_VALUE2, cr3_target_value2), 734 FIELD(CR3_TARGET_VALUE3, cr3_target_value3), 735 FIELD(EXIT_QUALIFICATION, exit_qualification), 736 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), 737 FIELD(GUEST_CR0, guest_cr0), 738 FIELD(GUEST_CR3, guest_cr3), 739 FIELD(GUEST_CR4, guest_cr4), 740 FIELD(GUEST_ES_BASE, guest_es_base), 741 FIELD(GUEST_CS_BASE, guest_cs_base), 742 FIELD(GUEST_SS_BASE, guest_ss_base), 743 FIELD(GUEST_DS_BASE, guest_ds_base), 744 FIELD(GUEST_FS_BASE, guest_fs_base), 745 FIELD(GUEST_GS_BASE, guest_gs_base), 746 FIELD(GUEST_LDTR_BASE, guest_ldtr_base), 747 FIELD(GUEST_TR_BASE, guest_tr_base), 748 FIELD(GUEST_GDTR_BASE, guest_gdtr_base), 749 FIELD(GUEST_IDTR_BASE, guest_idtr_base), 750 FIELD(GUEST_DR7, guest_dr7), 751 FIELD(GUEST_RSP, guest_rsp), 752 FIELD(GUEST_RIP, guest_rip), 753 FIELD(GUEST_RFLAGS, guest_rflags), 754 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), 755 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), 756 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), 757 FIELD(HOST_CR0, host_cr0), 758 FIELD(HOST_CR3, host_cr3), 759 FIELD(HOST_CR4, host_cr4), 760 FIELD(HOST_FS_BASE, host_fs_base), 761 FIELD(HOST_GS_BASE, host_gs_base), 762 FIELD(HOST_TR_BASE, host_tr_base), 763 FIELD(HOST_GDTR_BASE, host_gdtr_base), 764 FIELD(HOST_IDTR_BASE, host_idtr_base), 765 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), 766 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), 767 FIELD(HOST_RSP, host_rsp), 768 FIELD(HOST_RIP, host_rip), 769}; 770 771static inline short vmcs_field_to_offset(unsigned long field) 772{ 773 BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); 774 775 if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || 776 vmcs_field_to_offset_table[field] == 0) 777 return -ENOENT; 778 779 return vmcs_field_to_offset_table[field]; 780} 781 782static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) 783{ 784 return to_vmx(vcpu)->nested.current_vmcs12; 785} 786 787static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) 788{ 789 struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); 790 if (is_error_page(page)) 791 return NULL; 792 793 return page; 794} 795 796static void nested_release_page(struct page *page) 797{ 798 kvm_release_page_dirty(page); 799} 800 801static void nested_release_page_clean(struct page *page) 802{ 803 kvm_release_page_clean(page); 804} 805 806static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); 807static u64 construct_eptp(unsigned long root_hpa); 808static void kvm_cpu_vmxon(u64 addr); 809static void kvm_cpu_vmxoff(void); 810static bool vmx_mpx_supported(void); 811static bool vmx_xsaves_supported(void); 812static int vmx_vm_has_apicv(struct kvm *kvm); 813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 814static void vmx_set_segment(struct kvm_vcpu *vcpu, 815 struct kvm_segment *var, int seg); 816static void vmx_get_segment(struct kvm_vcpu *vcpu, 817 struct kvm_segment *var, int seg); 818static bool guest_state_valid(struct kvm_vcpu *vcpu); 819static u32 vmx_segment_access_rights(struct kvm_segment *var); 820static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); 821static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); 822static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); 823static int alloc_identity_pagetable(struct kvm *kvm); 824 825static DEFINE_PER_CPU(struct vmcs *, vmxarea); 826static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 827/* 828 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 829 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 830 */ 831static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 832static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 833 834static unsigned long *vmx_io_bitmap_a; 835static unsigned long *vmx_io_bitmap_b; 836static unsigned long *vmx_msr_bitmap_legacy; 837static unsigned long *vmx_msr_bitmap_longmode; 838static unsigned long *vmx_msr_bitmap_legacy_x2apic; 839static unsigned long *vmx_msr_bitmap_longmode_x2apic; 840static unsigned long *vmx_msr_bitmap_nested; 841static unsigned long *vmx_vmread_bitmap; 842static unsigned long *vmx_vmwrite_bitmap; 843 844static bool cpu_has_load_ia32_efer; 845static bool cpu_has_load_perf_global_ctrl; 846 847static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 848static DEFINE_SPINLOCK(vmx_vpid_lock); 849 850static struct vmcs_config { 851 int size; 852 int order; 853 u32 revision_id; 854 u32 pin_based_exec_ctrl; 855 u32 cpu_based_exec_ctrl; 856 u32 cpu_based_2nd_exec_ctrl; 857 u32 vmexit_ctrl; 858 u32 vmentry_ctrl; 859} vmcs_config; 860 861static struct vmx_capability { 862 u32 ept; 863 u32 vpid; 864} vmx_capability; 865 866#define VMX_SEGMENT_FIELD(seg) \ 867 [VCPU_SREG_##seg] = { \ 868 .selector = GUEST_##seg##_SELECTOR, \ 869 .base = GUEST_##seg##_BASE, \ 870 .limit = GUEST_##seg##_LIMIT, \ 871 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 872 } 873 874static const struct kvm_vmx_segment_field { 875 unsigned selector; 876 unsigned base; 877 unsigned limit; 878 unsigned ar_bytes; 879} kvm_vmx_segment_fields[] = { 880 VMX_SEGMENT_FIELD(CS), 881 VMX_SEGMENT_FIELD(DS), 882 VMX_SEGMENT_FIELD(ES), 883 VMX_SEGMENT_FIELD(FS), 884 VMX_SEGMENT_FIELD(GS), 885 VMX_SEGMENT_FIELD(SS), 886 VMX_SEGMENT_FIELD(TR), 887 VMX_SEGMENT_FIELD(LDTR), 888}; 889 890static u64 host_efer; 891 892static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 893 894/* 895 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it 896 * away by decrementing the array size. 897 */ 898static const u32 vmx_msr_index[] = { 899#ifdef CONFIG_X86_64 900 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 901#endif 902 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 903}; 904 905static inline bool is_page_fault(u32 intr_info) 906{ 907 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 908 INTR_INFO_VALID_MASK)) == 909 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 910} 911 912static inline bool is_no_device(u32 intr_info) 913{ 914 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 915 INTR_INFO_VALID_MASK)) == 916 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 917} 918 919static inline bool is_invalid_opcode(u32 intr_info) 920{ 921 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 922 INTR_INFO_VALID_MASK)) == 923 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 924} 925 926static inline bool is_external_interrupt(u32 intr_info) 927{ 928 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 929 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 930} 931 932static inline bool is_machine_check(u32 intr_info) 933{ 934 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 935 INTR_INFO_VALID_MASK)) == 936 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); 937} 938 939static inline bool cpu_has_vmx_msr_bitmap(void) 940{ 941 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; 942} 943 944static inline bool cpu_has_vmx_tpr_shadow(void) 945{ 946 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 947} 948 949static inline bool vm_need_tpr_shadow(struct kvm *kvm) 950{ 951 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 952} 953 954static inline bool cpu_has_secondary_exec_ctrls(void) 955{ 956 return vmcs_config.cpu_based_exec_ctrl & 957 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 958} 959 960static inline bool cpu_has_vmx_virtualize_apic_accesses(void) 961{ 962 return vmcs_config.cpu_based_2nd_exec_ctrl & 963 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 964} 965 966static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) 967{ 968 return vmcs_config.cpu_based_2nd_exec_ctrl & 969 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 970} 971 972static inline bool cpu_has_vmx_apic_register_virt(void) 973{ 974 return vmcs_config.cpu_based_2nd_exec_ctrl & 975 SECONDARY_EXEC_APIC_REGISTER_VIRT; 976} 977 978static inline bool cpu_has_vmx_virtual_intr_delivery(void) 979{ 980 return vmcs_config.cpu_based_2nd_exec_ctrl & 981 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 982} 983 984static inline bool cpu_has_vmx_posted_intr(void) 985{ 986 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; 987} 988 989static inline bool cpu_has_vmx_apicv(void) 990{ 991 return cpu_has_vmx_apic_register_virt() && 992 cpu_has_vmx_virtual_intr_delivery() && 993 cpu_has_vmx_posted_intr(); 994} 995 996static inline bool cpu_has_vmx_flexpriority(void) 997{ 998 return cpu_has_vmx_tpr_shadow() && 999 cpu_has_vmx_virtualize_apic_accesses(); 1000} 1001 1002static inline bool cpu_has_vmx_ept_execute_only(void) 1003{ 1004 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; 1005} 1006 1007static inline bool cpu_has_vmx_ept_2m_page(void) 1008{ 1009 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; 1010} 1011 1012static inline bool cpu_has_vmx_ept_1g_page(void) 1013{ 1014 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 1015} 1016 1017static inline bool cpu_has_vmx_ept_4levels(void) 1018{ 1019 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 1020} 1021 1022static inline bool cpu_has_vmx_ept_ad_bits(void) 1023{ 1024 return vmx_capability.ept & VMX_EPT_AD_BIT; 1025} 1026 1027static inline bool cpu_has_vmx_invept_context(void) 1028{ 1029 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; 1030} 1031 1032static inline bool cpu_has_vmx_invept_global(void) 1033{ 1034 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 1035} 1036 1037static inline bool cpu_has_vmx_invvpid_single(void) 1038{ 1039 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; 1040} 1041 1042static inline bool cpu_has_vmx_invvpid_global(void) 1043{ 1044 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; 1045} 1046 1047static inline bool cpu_has_vmx_ept(void) 1048{ 1049 return vmcs_config.cpu_based_2nd_exec_ctrl & 1050 SECONDARY_EXEC_ENABLE_EPT; 1051} 1052 1053static inline bool cpu_has_vmx_unrestricted_guest(void) 1054{ 1055 return vmcs_config.cpu_based_2nd_exec_ctrl & 1056 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1057} 1058 1059static inline bool cpu_has_vmx_ple(void) 1060{ 1061 return vmcs_config.cpu_based_2nd_exec_ctrl & 1062 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 1063} 1064 1065static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) 1066{ 1067 return flexpriority_enabled && irqchip_in_kernel(kvm); 1068} 1069 1070static inline bool cpu_has_vmx_vpid(void) 1071{ 1072 return vmcs_config.cpu_based_2nd_exec_ctrl & 1073 SECONDARY_EXEC_ENABLE_VPID; 1074} 1075 1076static inline bool cpu_has_vmx_rdtscp(void) 1077{ 1078 return vmcs_config.cpu_based_2nd_exec_ctrl & 1079 SECONDARY_EXEC_RDTSCP; 1080} 1081 1082static inline bool cpu_has_vmx_invpcid(void) 1083{ 1084 return vmcs_config.cpu_based_2nd_exec_ctrl & 1085 SECONDARY_EXEC_ENABLE_INVPCID; 1086} 1087 1088static inline bool cpu_has_virtual_nmis(void) 1089{ 1090 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 1091} 1092 1093static inline bool cpu_has_vmx_wbinvd_exit(void) 1094{ 1095 return vmcs_config.cpu_based_2nd_exec_ctrl & 1096 SECONDARY_EXEC_WBINVD_EXITING; 1097} 1098 1099static inline bool cpu_has_vmx_shadow_vmcs(void) 1100{ 1101 u64 vmx_msr; 1102 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); 1103 /* check if the cpu supports writing r/o exit information fields */ 1104 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) 1105 return false; 1106 1107 return vmcs_config.cpu_based_2nd_exec_ctrl & 1108 SECONDARY_EXEC_SHADOW_VMCS; 1109} 1110 1111static inline bool cpu_has_vmx_pml(void) 1112{ 1113 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; 1114} 1115 1116static inline bool report_flexpriority(void) 1117{ 1118 return flexpriority_enabled; 1119} 1120 1121static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) 1122{ 1123 return vmcs12->cpu_based_vm_exec_control & bit; 1124} 1125 1126static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) 1127{ 1128 return (vmcs12->cpu_based_vm_exec_control & 1129 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 1130 (vmcs12->secondary_vm_exec_control & bit); 1131} 1132 1133static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) 1134{ 1135 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; 1136} 1137 1138static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) 1139{ 1140 return vmcs12->pin_based_vm_exec_control & 1141 PIN_BASED_VMX_PREEMPTION_TIMER; 1142} 1143 1144static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) 1145{ 1146 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); 1147} 1148 1149static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) 1150{ 1151 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && 1152 vmx_xsaves_supported(); 1153} 1154 1155static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) 1156{ 1157 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 1158} 1159 1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) 1161{ 1162 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); 1163} 1164 1165static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) 1166{ 1167 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 1168} 1169 1170static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) 1171{ 1172 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; 1173} 1174 1175static inline bool is_exception(u32 intr_info) 1176{ 1177 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1178 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); 1179} 1180 1181static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 1182 u32 exit_intr_info, 1183 unsigned long exit_qualification); 1184static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, 1185 struct vmcs12 *vmcs12, 1186 u32 reason, unsigned long qualification); 1187 1188static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 1189{ 1190 int i; 1191 1192 for (i = 0; i < vmx->nmsrs; ++i) 1193 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) 1194 return i; 1195 return -1; 1196} 1197 1198static inline void __invvpid(int ext, u16 vpid, gva_t gva) 1199{ 1200 struct { 1201 u64 vpid : 16; 1202 u64 rsvd : 48; 1203 u64 gva; 1204 } operand = { vpid, 0, gva }; 1205 1206 asm volatile (__ex(ASM_VMX_INVVPID) 1207 /* CF==1 or ZF==1 --> rc = -1 */ 1208 "; ja 1f ; ud2 ; 1:" 1209 : : "a"(&operand), "c"(ext) : "cc", "memory"); 1210} 1211 1212static inline void __invept(int ext, u64 eptp, gpa_t gpa) 1213{ 1214 struct { 1215 u64 eptp, gpa; 1216 } operand = {eptp, gpa}; 1217 1218 asm volatile (__ex(ASM_VMX_INVEPT) 1219 /* CF==1 or ZF==1 --> rc = -1 */ 1220 "; ja 1f ; ud2 ; 1:\n" 1221 : : "a" (&operand), "c" (ext) : "cc", "memory"); 1222} 1223 1224static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 1225{ 1226 int i; 1227 1228 i = __find_msr_index(vmx, msr); 1229 if (i >= 0) 1230 return &vmx->guest_msrs[i]; 1231 return NULL; 1232} 1233 1234static void vmcs_clear(struct vmcs *vmcs) 1235{ 1236 u64 phys_addr = __pa(vmcs); 1237 u8 error; 1238 1239 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 1240 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 1241 : "cc", "memory"); 1242 if (error) 1243 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 1244 vmcs, phys_addr); 1245} 1246 1247static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) 1248{ 1249 vmcs_clear(loaded_vmcs->vmcs); 1250 loaded_vmcs->cpu = -1; 1251 loaded_vmcs->launched = 0; 1252} 1253 1254static void vmcs_load(struct vmcs *vmcs) 1255{ 1256 u64 phys_addr = __pa(vmcs); 1257 u8 error; 1258 1259 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 1260 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 1261 : "cc", "memory"); 1262 if (error) 1263 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", 1264 vmcs, phys_addr); 1265} 1266 1267#ifdef CONFIG_KEXEC 1268/* 1269 * This bitmap is used to indicate whether the vmclear 1270 * operation is enabled on all cpus. All disabled by 1271 * default. 1272 */ 1273static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; 1274 1275static inline void crash_enable_local_vmclear(int cpu) 1276{ 1277 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); 1278} 1279 1280static inline void crash_disable_local_vmclear(int cpu) 1281{ 1282 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); 1283} 1284 1285static inline int crash_local_vmclear_enabled(int cpu) 1286{ 1287 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); 1288} 1289 1290static void crash_vmclear_local_loaded_vmcss(void) 1291{ 1292 int cpu = raw_smp_processor_id(); 1293 struct loaded_vmcs *v; 1294 1295 if (!crash_local_vmclear_enabled(cpu)) 1296 return; 1297 1298 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 1299 loaded_vmcss_on_cpu_link) 1300 vmcs_clear(v->vmcs); 1301} 1302#else 1303static inline void crash_enable_local_vmclear(int cpu) { } 1304static inline void crash_disable_local_vmclear(int cpu) { } 1305#endif /* CONFIG_KEXEC */ 1306 1307static void __loaded_vmcs_clear(void *arg) 1308{ 1309 struct loaded_vmcs *loaded_vmcs = arg; 1310 int cpu = raw_smp_processor_id(); 1311 1312 if (loaded_vmcs->cpu != cpu) 1313 return; /* vcpu migration can race with cpu offline */ 1314 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 1315 per_cpu(current_vmcs, cpu) = NULL; 1316 crash_disable_local_vmclear(cpu); 1317 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 1318 1319 /* 1320 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link 1321 * is before setting loaded_vmcs->vcpu to -1 which is done in 1322 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist 1323 * then adds the vmcs into percpu list before it is deleted. 1324 */ 1325 smp_wmb(); 1326 1327 loaded_vmcs_init(loaded_vmcs); 1328 crash_enable_local_vmclear(cpu); 1329} 1330 1331static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 1332{ 1333 int cpu = loaded_vmcs->cpu; 1334 1335 if (cpu != -1) 1336 smp_call_function_single(cpu, 1337 __loaded_vmcs_clear, loaded_vmcs, 1); 1338} 1339 1340static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 1341{ 1342 if (vmx->vpid == 0) 1343 return; 1344 1345 if (cpu_has_vmx_invvpid_single()) 1346 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 1347} 1348 1349static inline void vpid_sync_vcpu_global(void) 1350{ 1351 if (cpu_has_vmx_invvpid_global()) 1352 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); 1353} 1354 1355static inline void vpid_sync_context(struct vcpu_vmx *vmx) 1356{ 1357 if (cpu_has_vmx_invvpid_single()) 1358 vpid_sync_vcpu_single(vmx); 1359 else 1360 vpid_sync_vcpu_global(); 1361} 1362 1363static inline void ept_sync_global(void) 1364{ 1365 if (cpu_has_vmx_invept_global()) 1366 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); 1367} 1368 1369static inline void ept_sync_context(u64 eptp) 1370{ 1371 if (enable_ept) { 1372 if (cpu_has_vmx_invept_context()) 1373 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 1374 else 1375 ept_sync_global(); 1376 } 1377} 1378 1379static __always_inline unsigned long vmcs_readl(unsigned long field) 1380{ 1381 unsigned long value; 1382 1383 asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") 1384 : "=a"(value) : "d"(field) : "cc"); 1385 return value; 1386} 1387 1388static __always_inline u16 vmcs_read16(unsigned long field) 1389{ 1390 return vmcs_readl(field); 1391} 1392 1393static __always_inline u32 vmcs_read32(unsigned long field) 1394{ 1395 return vmcs_readl(field); 1396} 1397 1398static __always_inline u64 vmcs_read64(unsigned long field) 1399{ 1400#ifdef CONFIG_X86_64 1401 return vmcs_readl(field); 1402#else 1403 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); 1404#endif 1405} 1406 1407static noinline void vmwrite_error(unsigned long field, unsigned long value) 1408{ 1409 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", 1410 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 1411 dump_stack(); 1412} 1413 1414static void vmcs_writel(unsigned long field, unsigned long value) 1415{ 1416 u8 error; 1417 1418 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" 1419 : "=q"(error) : "a"(value), "d"(field) : "cc"); 1420 if (unlikely(error)) 1421 vmwrite_error(field, value); 1422} 1423 1424static void vmcs_write16(unsigned long field, u16 value) 1425{ 1426 vmcs_writel(field, value); 1427} 1428 1429static void vmcs_write32(unsigned long field, u32 value) 1430{ 1431 vmcs_writel(field, value); 1432} 1433 1434static void vmcs_write64(unsigned long field, u64 value) 1435{ 1436 vmcs_writel(field, value); 1437#ifndef CONFIG_X86_64 1438 asm volatile (""); 1439 vmcs_writel(field+1, value >> 32); 1440#endif 1441} 1442 1443static void vmcs_clear_bits(unsigned long field, u32 mask) 1444{ 1445 vmcs_writel(field, vmcs_readl(field) & ~mask); 1446} 1447 1448static void vmcs_set_bits(unsigned long field, u32 mask) 1449{ 1450 vmcs_writel(field, vmcs_readl(field) | mask); 1451} 1452 1453static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) 1454{ 1455 vmcs_write32(VM_ENTRY_CONTROLS, val); 1456 vmx->vm_entry_controls_shadow = val; 1457} 1458 1459static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) 1460{ 1461 if (vmx->vm_entry_controls_shadow != val) 1462 vm_entry_controls_init(vmx, val); 1463} 1464 1465static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) 1466{ 1467 return vmx->vm_entry_controls_shadow; 1468} 1469 1470 1471static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) 1472{ 1473 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); 1474} 1475 1476static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) 1477{ 1478 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); 1479} 1480 1481static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) 1482{ 1483 vmcs_write32(VM_EXIT_CONTROLS, val); 1484 vmx->vm_exit_controls_shadow = val; 1485} 1486 1487static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) 1488{ 1489 if (vmx->vm_exit_controls_shadow != val) 1490 vm_exit_controls_init(vmx, val); 1491} 1492 1493static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) 1494{ 1495 return vmx->vm_exit_controls_shadow; 1496} 1497 1498 1499static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) 1500{ 1501 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); 1502} 1503 1504static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) 1505{ 1506 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); 1507} 1508 1509static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 1510{ 1511 vmx->segment_cache.bitmask = 0; 1512} 1513 1514static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 1515 unsigned field) 1516{ 1517 bool ret; 1518 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 1519 1520 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { 1521 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); 1522 vmx->segment_cache.bitmask = 0; 1523 } 1524 ret = vmx->segment_cache.bitmask & mask; 1525 vmx->segment_cache.bitmask |= mask; 1526 return ret; 1527} 1528 1529static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 1530{ 1531 u16 *p = &vmx->segment_cache.seg[seg].selector; 1532 1533 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 1534 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 1535 return *p; 1536} 1537 1538static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 1539{ 1540 ulong *p = &vmx->segment_cache.seg[seg].base; 1541 1542 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 1543 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 1544 return *p; 1545} 1546 1547static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 1548{ 1549 u32 *p = &vmx->segment_cache.seg[seg].limit; 1550 1551 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 1552 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 1553 return *p; 1554} 1555 1556static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 1557{ 1558 u32 *p = &vmx->segment_cache.seg[seg].ar; 1559 1560 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 1561 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 1562 return *p; 1563} 1564 1565static void update_exception_bitmap(struct kvm_vcpu *vcpu) 1566{ 1567 u32 eb; 1568 1569 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 1570 (1u << NM_VECTOR) | (1u << DB_VECTOR); 1571 if ((vcpu->guest_debug & 1572 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 1573 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 1574 eb |= 1u << BP_VECTOR; 1575 if (to_vmx(vcpu)->rmode.vm86_active) 1576 eb = ~0; 1577 if (enable_ept) 1578 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1579 if (vcpu->fpu_active) 1580 eb &= ~(1u << NM_VECTOR); 1581 1582 /* When we are running a nested L2 guest and L1 specified for it a 1583 * certain exception bitmap, we must trap the same exceptions and pass 1584 * them to L1. When running L2, we will only handle the exceptions 1585 * specified above if L1 did not want them. 1586 */ 1587 if (is_guest_mode(vcpu)) 1588 eb |= get_vmcs12(vcpu)->exception_bitmap; 1589 1590 vmcs_write32(EXCEPTION_BITMAP, eb); 1591} 1592 1593static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1594 unsigned long entry, unsigned long exit) 1595{ 1596 vm_entry_controls_clearbit(vmx, entry); 1597 vm_exit_controls_clearbit(vmx, exit); 1598} 1599 1600static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1601{ 1602 unsigned i; 1603 struct msr_autoload *m = &vmx->msr_autoload; 1604 1605 switch (msr) { 1606 case MSR_EFER: 1607 if (cpu_has_load_ia32_efer) { 1608 clear_atomic_switch_msr_special(vmx, 1609 VM_ENTRY_LOAD_IA32_EFER, 1610 VM_EXIT_LOAD_IA32_EFER); 1611 return; 1612 } 1613 break; 1614 case MSR_CORE_PERF_GLOBAL_CTRL: 1615 if (cpu_has_load_perf_global_ctrl) { 1616 clear_atomic_switch_msr_special(vmx, 1617 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1618 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 1619 return; 1620 } 1621 break; 1622 } 1623 1624 for (i = 0; i < m->nr; ++i) 1625 if (m->guest[i].index == msr) 1626 break; 1627 1628 if (i == m->nr) 1629 return; 1630 --m->nr; 1631 m->guest[i] = m->guest[m->nr]; 1632 m->host[i] = m->host[m->nr]; 1633 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); 1634 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); 1635} 1636 1637static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1638 unsigned long entry, unsigned long exit, 1639 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1640 u64 guest_val, u64 host_val) 1641{ 1642 vmcs_write64(guest_val_vmcs, guest_val); 1643 vmcs_write64(host_val_vmcs, host_val); 1644 vm_entry_controls_setbit(vmx, entry); 1645 vm_exit_controls_setbit(vmx, exit); 1646} 1647 1648static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1649 u64 guest_val, u64 host_val) 1650{ 1651 unsigned i; 1652 struct msr_autoload *m = &vmx->msr_autoload; 1653 1654 switch (msr) { 1655 case MSR_EFER: 1656 if (cpu_has_load_ia32_efer) { 1657 add_atomic_switch_msr_special(vmx, 1658 VM_ENTRY_LOAD_IA32_EFER, 1659 VM_EXIT_LOAD_IA32_EFER, 1660 GUEST_IA32_EFER, 1661 HOST_IA32_EFER, 1662 guest_val, host_val); 1663 return; 1664 } 1665 break; 1666 case MSR_CORE_PERF_GLOBAL_CTRL: 1667 if (cpu_has_load_perf_global_ctrl) { 1668 add_atomic_switch_msr_special(vmx, 1669 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1670 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1671 GUEST_IA32_PERF_GLOBAL_CTRL, 1672 HOST_IA32_PERF_GLOBAL_CTRL, 1673 guest_val, host_val); 1674 return; 1675 } 1676 break; 1677 } 1678 1679 for (i = 0; i < m->nr; ++i) 1680 if (m->guest[i].index == msr) 1681 break; 1682 1683 if (i == NR_AUTOLOAD_MSRS) { 1684 printk_once(KERN_WARNING "Not enough msr switch entries. " 1685 "Can't add msr %x\n", msr); 1686 return; 1687 } else if (i == m->nr) { 1688 ++m->nr; 1689 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); 1690 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); 1691 } 1692 1693 m->guest[i].index = msr; 1694 m->guest[i].value = guest_val; 1695 m->host[i].index = msr; 1696 m->host[i].value = host_val; 1697} 1698 1699static void reload_tss(void) 1700{ 1701 /* 1702 * VT restores TR but not its size. Useless. 1703 */ 1704 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 1705 struct desc_struct *descs; 1706 1707 descs = (void *)gdt->address; 1708 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 1709 load_TR_desc(); 1710} 1711 1712static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) 1713{ 1714 u64 guest_efer; 1715 u64 ignore_bits; 1716 1717 guest_efer = vmx->vcpu.arch.efer; 1718 1719 /* 1720 * NX is emulated; LMA and LME handled by hardware; SCE meaningless 1721 * outside long mode 1722 */ 1723 ignore_bits = EFER_NX | EFER_SCE; 1724#ifdef CONFIG_X86_64 1725 ignore_bits |= EFER_LMA | EFER_LME; 1726 /* SCE is meaningful only in long mode on Intel */ 1727 if (guest_efer & EFER_LMA) 1728 ignore_bits &= ~(u64)EFER_SCE; 1729#endif 1730 guest_efer &= ~ignore_bits; 1731 guest_efer |= host_efer & ignore_bits; 1732 vmx->guest_msrs[efer_offset].data = guest_efer; 1733 vmx->guest_msrs[efer_offset].mask = ~ignore_bits; 1734 1735 clear_atomic_switch_msr(vmx, MSR_EFER); 1736 1737 /* 1738 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1739 * On CPUs that support "load IA32_EFER", always switch EFER 1740 * atomically, since it's faster than switching it manually. 1741 */ 1742 if (cpu_has_load_ia32_efer || 1743 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { 1744 guest_efer = vmx->vcpu.arch.efer; 1745 if (!(guest_efer & EFER_LMA)) 1746 guest_efer &= ~EFER_LME; 1747 if (guest_efer != host_efer) 1748 add_atomic_switch_msr(vmx, MSR_EFER, 1749 guest_efer, host_efer); 1750 return false; 1751 } 1752 1753 return true; 1754} 1755 1756static unsigned long segment_base(u16 selector) 1757{ 1758 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 1759 struct desc_struct *d; 1760 unsigned long table_base; 1761 unsigned long v; 1762 1763 if (!(selector & ~3)) 1764 return 0; 1765 1766 table_base = gdt->address; 1767 1768 if (selector & 4) { /* from ldt */ 1769 u16 ldt_selector = kvm_read_ldt(); 1770 1771 if (!(ldt_selector & ~3)) 1772 return 0; 1773 1774 table_base = segment_base(ldt_selector); 1775 } 1776 d = (struct desc_struct *)(table_base + (selector & ~7)); 1777 v = get_desc_base(d); 1778#ifdef CONFIG_X86_64 1779 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 1780 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 1781#endif 1782 return v; 1783} 1784 1785static inline unsigned long kvm_read_tr_base(void) 1786{ 1787 u16 tr; 1788 asm("str %0" : "=g"(tr)); 1789 return segment_base(tr); 1790} 1791 1792static void vmx_save_host_state(struct kvm_vcpu *vcpu) 1793{ 1794 struct vcpu_vmx *vmx = to_vmx(vcpu); 1795 int i; 1796 1797 if (vmx->host_state.loaded) 1798 return; 1799 1800 vmx->host_state.loaded = 1; 1801 /* 1802 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1803 * allow segment selectors with cpl > 0 or ti == 1. 1804 */ 1805 vmx->host_state.ldt_sel = kvm_read_ldt(); 1806 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 1807 savesegment(fs, vmx->host_state.fs_sel); 1808 if (!(vmx->host_state.fs_sel & 7)) { 1809 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 1810 vmx->host_state.fs_reload_needed = 0; 1811 } else { 1812 vmcs_write16(HOST_FS_SELECTOR, 0); 1813 vmx->host_state.fs_reload_needed = 1; 1814 } 1815 savesegment(gs, vmx->host_state.gs_sel); 1816 if (!(vmx->host_state.gs_sel & 7)) 1817 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 1818 else { 1819 vmcs_write16(HOST_GS_SELECTOR, 0); 1820 vmx->host_state.gs_ldt_reload_needed = 1; 1821 } 1822 1823#ifdef CONFIG_X86_64 1824 savesegment(ds, vmx->host_state.ds_sel); 1825 savesegment(es, vmx->host_state.es_sel); 1826#endif 1827 1828#ifdef CONFIG_X86_64 1829 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 1830 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); 1831#else 1832 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); 1833 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); 1834#endif 1835 1836#ifdef CONFIG_X86_64 1837 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1838 if (is_long_mode(&vmx->vcpu)) 1839 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1840#endif 1841 if (boot_cpu_has(X86_FEATURE_MPX)) 1842 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 1843 for (i = 0; i < vmx->save_nmsrs; ++i) 1844 kvm_set_shared_msr(vmx->guest_msrs[i].index, 1845 vmx->guest_msrs[i].data, 1846 vmx->guest_msrs[i].mask); 1847} 1848 1849static void __vmx_load_host_state(struct vcpu_vmx *vmx) 1850{ 1851 if (!vmx->host_state.loaded) 1852 return; 1853 1854 ++vmx->vcpu.stat.host_state_reload; 1855 vmx->host_state.loaded = 0; 1856#ifdef CONFIG_X86_64 1857 if (is_long_mode(&vmx->vcpu)) 1858 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1859#endif 1860 if (vmx->host_state.gs_ldt_reload_needed) { 1861 kvm_load_ldt(vmx->host_state.ldt_sel); 1862#ifdef CONFIG_X86_64 1863 load_gs_index(vmx->host_state.gs_sel); 1864#else 1865 loadsegment(gs, vmx->host_state.gs_sel); 1866#endif 1867 } 1868 if (vmx->host_state.fs_reload_needed) 1869 loadsegment(fs, vmx->host_state.fs_sel); 1870#ifdef CONFIG_X86_64 1871 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { 1872 loadsegment(ds, vmx->host_state.ds_sel); 1873 loadsegment(es, vmx->host_state.es_sel); 1874 } 1875#endif 1876 reload_tss(); 1877#ifdef CONFIG_X86_64 1878 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1879#endif 1880 if (vmx->host_state.msr_host_bndcfgs) 1881 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 1882 /* 1883 * If the FPU is not active (through the host task or 1884 * the guest vcpu), then restore the cr0.TS bit. 1885 */ 1886 if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) 1887 stts(); 1888 load_gdt(this_cpu_ptr(&host_gdt)); 1889} 1890 1891static void vmx_load_host_state(struct vcpu_vmx *vmx) 1892{ 1893 preempt_disable(); 1894 __vmx_load_host_state(vmx); 1895 preempt_enable(); 1896} 1897 1898/* 1899 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1900 * vcpu mutex is already taken. 1901 */ 1902static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1903{ 1904 struct vcpu_vmx *vmx = to_vmx(vcpu); 1905 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1906 1907 if (!vmm_exclusive) 1908 kvm_cpu_vmxon(phys_addr); 1909 else if (vmx->loaded_vmcs->cpu != cpu) 1910 loaded_vmcs_clear(vmx->loaded_vmcs); 1911 1912 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { 1913 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1914 vmcs_load(vmx->loaded_vmcs->vmcs); 1915 } 1916 1917 if (vmx->loaded_vmcs->cpu != cpu) { 1918 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 1919 unsigned long sysenter_esp; 1920 1921 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1922 local_irq_disable(); 1923 crash_disable_local_vmclear(cpu); 1924 1925 /* 1926 * Read loaded_vmcs->cpu should be before fetching 1927 * loaded_vmcs->loaded_vmcss_on_cpu_link. 1928 * See the comments in __loaded_vmcs_clear(). 1929 */ 1930 smp_rmb(); 1931 1932 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1933 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1934 crash_enable_local_vmclear(cpu); 1935 local_irq_enable(); 1936 1937 /* 1938 * Linux uses per-cpu TSS and GDT, so set these when switching 1939 * processors. 1940 */ 1941 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 1942 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ 1943 1944 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 1945 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1946 vmx->loaded_vmcs->cpu = cpu; 1947 } 1948} 1949 1950static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1951{ 1952 __vmx_load_host_state(to_vmx(vcpu)); 1953 if (!vmm_exclusive) { 1954 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); 1955 vcpu->cpu = -1; 1956 kvm_cpu_vmxoff(); 1957 } 1958} 1959 1960static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 1961{ 1962 ulong cr0; 1963 1964 if (vcpu->fpu_active) 1965 return; 1966 vcpu->fpu_active = 1; 1967 cr0 = vmcs_readl(GUEST_CR0); 1968 cr0 &= ~(X86_CR0_TS | X86_CR0_MP); 1969 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); 1970 vmcs_writel(GUEST_CR0, cr0); 1971 update_exception_bitmap(vcpu); 1972 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 1973 if (is_guest_mode(vcpu)) 1974 vcpu->arch.cr0_guest_owned_bits &= 1975 ~get_vmcs12(vcpu)->cr0_guest_host_mask; 1976 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1977} 1978 1979static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 1980 1981/* 1982 * Return the cr0 value that a nested guest would read. This is a combination 1983 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by 1984 * its hypervisor (cr0_read_shadow). 1985 */ 1986static inline unsigned long nested_read_cr0(struct vmcs12 *fields) 1987{ 1988 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | 1989 (fields->cr0_read_shadow & fields->cr0_guest_host_mask); 1990} 1991static inline unsigned long nested_read_cr4(struct vmcs12 *fields) 1992{ 1993 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | 1994 (fields->cr4_read_shadow & fields->cr4_guest_host_mask); 1995} 1996 1997static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 1998{ 1999 /* Note that there is no vcpu->fpu_active = 0 here. The caller must 2000 * set this *before* calling this function. 2001 */ 2002 vmx_decache_cr0_guest_bits(vcpu); 2003 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 2004 update_exception_bitmap(vcpu); 2005 vcpu->arch.cr0_guest_owned_bits = 0; 2006 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2007 if (is_guest_mode(vcpu)) { 2008 /* 2009 * L1's specified read shadow might not contain the TS bit, 2010 * so now that we turned on shadowing of this bit, we need to 2011 * set this bit of the shadow. Like in nested_vmx_run we need 2012 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet 2013 * up-to-date here because we just decached cr0.TS (and we'll 2014 * only update vmcs12->guest_cr0 on nested exit). 2015 */ 2016 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2017 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | 2018 (vcpu->arch.cr0 & X86_CR0_TS); 2019 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2020 } else 2021 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2022} 2023 2024static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 2025{ 2026 unsigned long rflags, save_rflags; 2027 2028 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { 2029 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 2030 rflags = vmcs_readl(GUEST_RFLAGS); 2031 if (to_vmx(vcpu)->rmode.vm86_active) { 2032 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2033 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 2034 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 2035 } 2036 to_vmx(vcpu)->rflags = rflags; 2037 } 2038 return to_vmx(vcpu)->rflags; 2039} 2040 2041static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 2042{ 2043 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 2044 to_vmx(vcpu)->rflags = rflags; 2045 if (to_vmx(vcpu)->rmode.vm86_active) { 2046 to_vmx(vcpu)->rmode.save_rflags = rflags; 2047 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 2048 } 2049 vmcs_writel(GUEST_RFLAGS, rflags); 2050} 2051 2052static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 2053{ 2054 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 2055 int ret = 0; 2056 2057 if (interruptibility & GUEST_INTR_STATE_STI) 2058 ret |= KVM_X86_SHADOW_INT_STI; 2059 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 2060 ret |= KVM_X86_SHADOW_INT_MOV_SS; 2061 2062 return ret; 2063} 2064 2065static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 2066{ 2067 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 2068 u32 interruptibility = interruptibility_old; 2069 2070 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 2071 2072 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 2073 interruptibility |= GUEST_INTR_STATE_MOV_SS; 2074 else if (mask & KVM_X86_SHADOW_INT_STI) 2075 interruptibility |= GUEST_INTR_STATE_STI; 2076 2077 if ((interruptibility != interruptibility_old)) 2078 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 2079} 2080 2081static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 2082{ 2083 unsigned long rip; 2084 2085 rip = kvm_rip_read(vcpu); 2086 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 2087 kvm_rip_write(vcpu, rip); 2088 2089 /* skipping an emulated instruction also counts */ 2090 vmx_set_interrupt_shadow(vcpu, 0); 2091} 2092 2093/* 2094 * KVM wants to inject page-faults which it got to the guest. This function 2095 * checks whether in a nested guest, we need to inject them to L1 or L2. 2096 */ 2097static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) 2098{ 2099 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2100 2101 if (!(vmcs12->exception_bitmap & (1u << nr))) 2102 return 0; 2103 2104 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 2105 vmcs_read32(VM_EXIT_INTR_INFO), 2106 vmcs_readl(EXIT_QUALIFICATION)); 2107 return 1; 2108} 2109 2110static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 2111 bool has_error_code, u32 error_code, 2112 bool reinject) 2113{ 2114 struct vcpu_vmx *vmx = to_vmx(vcpu); 2115 u32 intr_info = nr | INTR_INFO_VALID_MASK; 2116 2117 if (!reinject && is_guest_mode(vcpu) && 2118 nested_vmx_check_exception(vcpu, nr)) 2119 return; 2120 2121 if (has_error_code) { 2122 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 2123 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 2124 } 2125 2126 if (vmx->rmode.vm86_active) { 2127 int inc_eip = 0; 2128 if (kvm_exception_is_soft(nr)) 2129 inc_eip = vcpu->arch.event_exit_inst_len; 2130 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) 2131 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2132 return; 2133 } 2134 2135 if (kvm_exception_is_soft(nr)) { 2136 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2137 vmx->vcpu.arch.event_exit_inst_len); 2138 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 2139 } else 2140 intr_info |= INTR_TYPE_HARD_EXCEPTION; 2141 2142 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 2143} 2144 2145static bool vmx_rdtscp_supported(void) 2146{ 2147 return cpu_has_vmx_rdtscp(); 2148} 2149 2150static bool vmx_invpcid_supported(void) 2151{ 2152 return cpu_has_vmx_invpcid() && enable_ept; 2153} 2154 2155/* 2156 * Swap MSR entry in host/guest MSR entry array. 2157 */ 2158static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 2159{ 2160 struct shared_msr_entry tmp; 2161 2162 tmp = vmx->guest_msrs[to]; 2163 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 2164 vmx->guest_msrs[from] = tmp; 2165} 2166 2167static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) 2168{ 2169 unsigned long *msr_bitmap; 2170 2171 if (is_guest_mode(vcpu)) 2172 msr_bitmap = vmx_msr_bitmap_nested; 2173 else if (vcpu->arch.apic_base & X2APIC_ENABLE) { 2174 if (is_long_mode(vcpu)) 2175 msr_bitmap = vmx_msr_bitmap_longmode_x2apic; 2176 else 2177 msr_bitmap = vmx_msr_bitmap_legacy_x2apic; 2178 } else { 2179 if (is_long_mode(vcpu)) 2180 msr_bitmap = vmx_msr_bitmap_longmode; 2181 else 2182 msr_bitmap = vmx_msr_bitmap_legacy; 2183 } 2184 2185 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); 2186} 2187 2188/* 2189 * Set up the vmcs to automatically save and restore system 2190 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 2191 * mode, as fiddling with msrs is very expensive. 2192 */ 2193static void setup_msrs(struct vcpu_vmx *vmx) 2194{ 2195 int save_nmsrs, index; 2196 2197 save_nmsrs = 0; 2198#ifdef CONFIG_X86_64 2199 if (is_long_mode(&vmx->vcpu)) { 2200 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 2201 if (index >= 0) 2202 move_msr_up(vmx, index, save_nmsrs++); 2203 index = __find_msr_index(vmx, MSR_LSTAR); 2204 if (index >= 0) 2205 move_msr_up(vmx, index, save_nmsrs++); 2206 index = __find_msr_index(vmx, MSR_CSTAR); 2207 if (index >= 0) 2208 move_msr_up(vmx, index, save_nmsrs++); 2209 index = __find_msr_index(vmx, MSR_TSC_AUX); 2210 if (index >= 0 && vmx->rdtscp_enabled) 2211 move_msr_up(vmx, index, save_nmsrs++); 2212 /* 2213 * MSR_STAR is only needed on long mode guests, and only 2214 * if efer.sce is enabled. 2215 */ 2216 index = __find_msr_index(vmx, MSR_STAR); 2217 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) 2218 move_msr_up(vmx, index, save_nmsrs++); 2219 } 2220#endif 2221 index = __find_msr_index(vmx, MSR_EFER); 2222 if (index >= 0 && update_transition_efer(vmx, index)) 2223 move_msr_up(vmx, index, save_nmsrs++); 2224 2225 vmx->save_nmsrs = save_nmsrs; 2226 2227 if (cpu_has_vmx_msr_bitmap()) 2228 vmx_set_msr_bitmap(&vmx->vcpu); 2229} 2230 2231/* 2232 * reads and returns guest's timestamp counter "register" 2233 * guest_tsc = host_tsc + tsc_offset -- 21.3 2234 */ 2235static u64 guest_read_tsc(void) 2236{ 2237 u64 host_tsc, tsc_offset; 2238 2239 rdtscll(host_tsc); 2240 tsc_offset = vmcs_read64(TSC_OFFSET); 2241 return host_tsc + tsc_offset; 2242} 2243 2244/* 2245 * Like guest_read_tsc, but always returns L1's notion of the timestamp 2246 * counter, even if a nested guest (L2) is currently running. 2247 */ 2248static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2249{ 2250 u64 tsc_offset; 2251 2252 tsc_offset = is_guest_mode(vcpu) ? 2253 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 2254 vmcs_read64(TSC_OFFSET); 2255 return host_tsc + tsc_offset; 2256} 2257 2258/* 2259 * Engage any workarounds for mis-matched TSC rates. Currently limited to 2260 * software catchup for faster rates on slower CPUs. 2261 */ 2262static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) 2263{ 2264 if (!scale) 2265 return; 2266 2267 if (user_tsc_khz > tsc_khz) { 2268 vcpu->arch.tsc_catchup = 1; 2269 vcpu->arch.tsc_always_catchup = 1; 2270 } else 2271 WARN(1, "user requested TSC rate below hardware speed\n"); 2272} 2273 2274static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) 2275{ 2276 return vmcs_read64(TSC_OFFSET); 2277} 2278 2279/* 2280 * writes 'offset' into guest's timestamp counter offset register 2281 */ 2282static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 2283{ 2284 if (is_guest_mode(vcpu)) { 2285 /* 2286 * We're here if L1 chose not to trap WRMSR to TSC. According 2287 * to the spec, this should set L1's TSC; The offset that L1 2288 * set for L2 remains unchanged, and still needs to be added 2289 * to the newly set TSC to get L2's TSC. 2290 */ 2291 struct vmcs12 *vmcs12; 2292 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; 2293 /* recalculate vmcs02.TSC_OFFSET: */ 2294 vmcs12 = get_vmcs12(vcpu); 2295 vmcs_write64(TSC_OFFSET, offset + 2296 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 2297 vmcs12->tsc_offset : 0)); 2298 } else { 2299 trace_kvm_write_tsc_offset(vcpu->vcpu_id, 2300 vmcs_read64(TSC_OFFSET), offset); 2301 vmcs_write64(TSC_OFFSET, offset); 2302 } 2303} 2304 2305static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) 2306{ 2307 u64 offset = vmcs_read64(TSC_OFFSET); 2308 2309 vmcs_write64(TSC_OFFSET, offset + adjustment); 2310 if (is_guest_mode(vcpu)) { 2311 /* Even when running L2, the adjustment needs to apply to L1 */ 2312 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; 2313 } else 2314 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, 2315 offset + adjustment); 2316} 2317 2318static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2319{ 2320 return target_tsc - native_read_tsc(); 2321} 2322 2323static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) 2324{ 2325 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); 2326 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); 2327} 2328 2329/* 2330 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX 2331 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for 2332 * all guests if the "nested" module option is off, and can also be disabled 2333 * for a single guest by disabling its VMX cpuid bit. 2334 */ 2335static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) 2336{ 2337 return nested && guest_cpuid_has_vmx(vcpu); 2338} 2339 2340/* 2341 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 2342 * returned for the various VMX controls MSRs when nested VMX is enabled. 2343 * The same values should also be used to verify that vmcs12 control fields are 2344 * valid during nested entry from L1 to L2. 2345 * Each of these control msrs has a low and high 32-bit half: A low bit is on 2346 * if the corresponding bit in the (32-bit) control field *must* be on, and a 2347 * bit in the high half is on if the corresponding bit in the control field 2348 * may be on. See also vmx_control_verify(). 2349 */ 2350static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) 2351{ 2352 /* 2353 * Note that as a general rule, the high half of the MSRs (bits in 2354 * the control fields which may be 1) should be initialized by the 2355 * intersection of the underlying hardware's MSR (i.e., features which 2356 * can be supported) and the list of features we want to expose - 2357 * because they are known to be properly supported in our code. 2358 * Also, usually, the low half of the MSRs (bits which must be 1) can 2359 * be set to 0, meaning that L1 may turn off any of these bits. The 2360 * reason is that if one of these bits is necessary, it will appear 2361 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 2362 * fields of vmcs01 and vmcs02, will turn these bits off - and 2363 * nested_vmx_exit_handled() will not pass related exits to L1. 2364 * These rules have exceptions below. 2365 */ 2366 2367 /* pin-based controls */ 2368 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2369 vmx->nested.nested_vmx_pinbased_ctls_low, 2370 vmx->nested.nested_vmx_pinbased_ctls_high); 2371 vmx->nested.nested_vmx_pinbased_ctls_low |= 2372 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2373 vmx->nested.nested_vmx_pinbased_ctls_high &= 2374 PIN_BASED_EXT_INTR_MASK | 2375 PIN_BASED_NMI_EXITING | 2376 PIN_BASED_VIRTUAL_NMIS; 2377 vmx->nested.nested_vmx_pinbased_ctls_high |= 2378 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2379 PIN_BASED_VMX_PREEMPTION_TIMER; 2380 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) 2381 vmx->nested.nested_vmx_pinbased_ctls_high |= 2382 PIN_BASED_POSTED_INTR; 2383 2384 /* exit controls */ 2385 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2386 vmx->nested.nested_vmx_exit_ctls_low, 2387 vmx->nested.nested_vmx_exit_ctls_high); 2388 vmx->nested.nested_vmx_exit_ctls_low = 2389 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2390 2391 vmx->nested.nested_vmx_exit_ctls_high &= 2392#ifdef CONFIG_X86_64 2393 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2394#endif 2395 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2396 vmx->nested.nested_vmx_exit_ctls_high |= 2397 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2398 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2399 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2400 2401 if (vmx_mpx_supported()) 2402 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2403 2404 /* We support free control of debug control saving. */ 2405 vmx->nested.nested_vmx_true_exit_ctls_low = 2406 vmx->nested.nested_vmx_exit_ctls_low & 2407 ~VM_EXIT_SAVE_DEBUG_CONTROLS; 2408 2409 /* entry controls */ 2410 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2411 vmx->nested.nested_vmx_entry_ctls_low, 2412 vmx->nested.nested_vmx_entry_ctls_high); 2413 vmx->nested.nested_vmx_entry_ctls_low = 2414 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2415 vmx->nested.nested_vmx_entry_ctls_high &= 2416#ifdef CONFIG_X86_64 2417 VM_ENTRY_IA32E_MODE | 2418#endif 2419 VM_ENTRY_LOAD_IA32_PAT; 2420 vmx->nested.nested_vmx_entry_ctls_high |= 2421 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 2422 if (vmx_mpx_supported()) 2423 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 2424 2425 /* We support free control of debug control loading. */ 2426 vmx->nested.nested_vmx_true_entry_ctls_low = 2427 vmx->nested.nested_vmx_entry_ctls_low & 2428 ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 2429 2430 /* cpu-based controls */ 2431 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2432 vmx->nested.nested_vmx_procbased_ctls_low, 2433 vmx->nested.nested_vmx_procbased_ctls_high); 2434 vmx->nested.nested_vmx_procbased_ctls_low = 2435 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2436 vmx->nested.nested_vmx_procbased_ctls_high &= 2437 CPU_BASED_VIRTUAL_INTR_PENDING | 2438 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2439 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2440 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 2441 CPU_BASED_CR3_STORE_EXITING | 2442#ifdef CONFIG_X86_64 2443 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 2444#endif 2445 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 2446 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 2447 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | 2448 CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | 2449 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2450 /* 2451 * We can allow some features even when not supported by the 2452 * hardware. For example, L1 can specify an MSR bitmap - and we 2453 * can use it to avoid exits to L1 - even when L0 runs L2 2454 * without MSR bitmaps. 2455 */ 2456 vmx->nested.nested_vmx_procbased_ctls_high |= 2457 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2458 CPU_BASED_USE_MSR_BITMAPS; 2459 2460 /* We support free control of CR3 access interception. */ 2461 vmx->nested.nested_vmx_true_procbased_ctls_low = 2462 vmx->nested.nested_vmx_procbased_ctls_low & 2463 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2464 2465 /* secondary cpu-based controls */ 2466 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2467 vmx->nested.nested_vmx_secondary_ctls_low, 2468 vmx->nested.nested_vmx_secondary_ctls_high); 2469 vmx->nested.nested_vmx_secondary_ctls_low = 0; 2470 vmx->nested.nested_vmx_secondary_ctls_high &= 2471 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2472 SECONDARY_EXEC_RDTSCP | 2473 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2474 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2475 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2476 SECONDARY_EXEC_WBINVD_EXITING | 2477 SECONDARY_EXEC_XSAVES; 2478 2479 if (enable_ept) { 2480 /* nested EPT: emulate EPT also to L1 */ 2481 vmx->nested.nested_vmx_secondary_ctls_high |= 2482 SECONDARY_EXEC_ENABLE_EPT; 2483 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2484 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | 2485 VMX_EPT_INVEPT_BIT; 2486 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; 2487 /* 2488 * For nested guests, we don't do anything specific 2489 * for single context invalidation. Hence, only advertise 2490 * support for global context invalidation. 2491 */ 2492 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2493 } else 2494 vmx->nested.nested_vmx_ept_caps = 0; 2495 2496 if (enable_unrestricted_guest) 2497 vmx->nested.nested_vmx_secondary_ctls_high |= 2498 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2499 2500 /* miscellaneous data */ 2501 rdmsr(MSR_IA32_VMX_MISC, 2502 vmx->nested.nested_vmx_misc_low, 2503 vmx->nested.nested_vmx_misc_high); 2504 vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 2505 vmx->nested.nested_vmx_misc_low |= 2506 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 2507 VMX_MISC_ACTIVITY_HLT; 2508 vmx->nested.nested_vmx_misc_high = 0; 2509} 2510 2511static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2512{ 2513 /* 2514 * Bits 0 in high must be 0, and bits 1 in low must be 1. 2515 */ 2516 return ((control & high) | low) == control; 2517} 2518 2519static inline u64 vmx_control_msr(u32 low, u32 high) 2520{ 2521 return low | ((u64)high << 32); 2522} 2523 2524/* Returns 0 on success, non-0 otherwise. */ 2525static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2526{ 2527 struct vcpu_vmx *vmx = to_vmx(vcpu); 2528 2529 switch (msr_index) { 2530 case MSR_IA32_VMX_BASIC: 2531 /* 2532 * This MSR reports some information about VMX support. We 2533 * should return information about the VMX we emulate for the 2534 * guest, and the VMCS structure we give it - not about the 2535 * VMX support of the underlying hardware. 2536 */ 2537 *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | 2538 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 2539 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 2540 break; 2541 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2542 case MSR_IA32_VMX_PINBASED_CTLS: 2543 *pdata = vmx_control_msr( 2544 vmx->nested.nested_vmx_pinbased_ctls_low, 2545 vmx->nested.nested_vmx_pinbased_ctls_high); 2546 break; 2547 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2548 *pdata = vmx_control_msr( 2549 vmx->nested.nested_vmx_true_procbased_ctls_low, 2550 vmx->nested.nested_vmx_procbased_ctls_high); 2551 break; 2552 case MSR_IA32_VMX_PROCBASED_CTLS: 2553 *pdata = vmx_control_msr( 2554 vmx->nested.nested_vmx_procbased_ctls_low, 2555 vmx->nested.nested_vmx_procbased_ctls_high); 2556 break; 2557 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2558 *pdata = vmx_control_msr( 2559 vmx->nested.nested_vmx_true_exit_ctls_low, 2560 vmx->nested.nested_vmx_exit_ctls_high); 2561 break; 2562 case MSR_IA32_VMX_EXIT_CTLS: 2563 *pdata = vmx_control_msr( 2564 vmx->nested.nested_vmx_exit_ctls_low, 2565 vmx->nested.nested_vmx_exit_ctls_high); 2566 break; 2567 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2568 *pdata = vmx_control_msr( 2569 vmx->nested.nested_vmx_true_entry_ctls_low, 2570 vmx->nested.nested_vmx_entry_ctls_high); 2571 break; 2572 case MSR_IA32_VMX_ENTRY_CTLS: 2573 *pdata = vmx_control_msr( 2574 vmx->nested.nested_vmx_entry_ctls_low, 2575 vmx->nested.nested_vmx_entry_ctls_high); 2576 break; 2577 case MSR_IA32_VMX_MISC: 2578 *pdata = vmx_control_msr( 2579 vmx->nested.nested_vmx_misc_low, 2580 vmx->nested.nested_vmx_misc_high); 2581 break; 2582 /* 2583 * These MSRs specify bits which the guest must keep fixed (on or off) 2584 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 2585 * We picked the standard core2 setting. 2586 */ 2587#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 2588#define VMXON_CR4_ALWAYSON X86_CR4_VMXE 2589 case MSR_IA32_VMX_CR0_FIXED0: 2590 *pdata = VMXON_CR0_ALWAYSON; 2591 break; 2592 case MSR_IA32_VMX_CR0_FIXED1: 2593 *pdata = -1ULL; 2594 break; 2595 case MSR_IA32_VMX_CR4_FIXED0: 2596 *pdata = VMXON_CR4_ALWAYSON; 2597 break; 2598 case MSR_IA32_VMX_CR4_FIXED1: 2599 *pdata = -1ULL; 2600 break; 2601 case MSR_IA32_VMX_VMCS_ENUM: 2602 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 2603 break; 2604 case MSR_IA32_VMX_PROCBASED_CTLS2: 2605 *pdata = vmx_control_msr( 2606 vmx->nested.nested_vmx_secondary_ctls_low, 2607 vmx->nested.nested_vmx_secondary_ctls_high); 2608 break; 2609 case MSR_IA32_VMX_EPT_VPID_CAP: 2610 /* Currently, no nested vpid support */ 2611 *pdata = vmx->nested.nested_vmx_ept_caps; 2612 break; 2613 default: 2614 return 1; 2615 } 2616 2617 return 0; 2618} 2619 2620/* 2621 * Reads an msr value (of 'msr_index') into 'pdata'. 2622 * Returns 0 on success, non-0 otherwise. 2623 * Assumes vcpu_load() was already called. 2624 */ 2625static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2626{ 2627 struct shared_msr_entry *msr; 2628 2629 switch (msr_info->index) { 2630#ifdef CONFIG_X86_64 2631 case MSR_FS_BASE: 2632 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2633 break; 2634 case MSR_GS_BASE: 2635 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2636 break; 2637 case MSR_KERNEL_GS_BASE: 2638 vmx_load_host_state(to_vmx(vcpu)); 2639 msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; 2640 break; 2641#endif 2642 case MSR_EFER: 2643 return kvm_get_msr_common(vcpu, msr_info); 2644 case MSR_IA32_TSC: 2645 msr_info->data = guest_read_tsc(); 2646 break; 2647 case MSR_IA32_SYSENTER_CS: 2648 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2649 break; 2650 case MSR_IA32_SYSENTER_EIP: 2651 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2652 break; 2653 case MSR_IA32_SYSENTER_ESP: 2654 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2655 break; 2656 case MSR_IA32_BNDCFGS: 2657 if (!vmx_mpx_supported()) 2658 return 1; 2659 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2660 break; 2661 case MSR_IA32_FEATURE_CONTROL: 2662 if (!nested_vmx_allowed(vcpu)) 2663 return 1; 2664 msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; 2665 break; 2666 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2667 if (!nested_vmx_allowed(vcpu)) 2668 return 1; 2669 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); 2670 case MSR_IA32_XSS: 2671 if (!vmx_xsaves_supported()) 2672 return 1; 2673 msr_info->data = vcpu->arch.ia32_xss; 2674 break; 2675 case MSR_TSC_AUX: 2676 if (!to_vmx(vcpu)->rdtscp_enabled) 2677 return 1; 2678 /* Otherwise falls through */ 2679 default: 2680 msr = find_msr_entry(to_vmx(vcpu), msr_info->index); 2681 if (msr) { 2682 msr_info->data = msr->data; 2683 break; 2684 } 2685 return kvm_get_msr_common(vcpu, msr_info); 2686 } 2687 2688 return 0; 2689} 2690 2691static void vmx_leave_nested(struct kvm_vcpu *vcpu); 2692 2693/* 2694 * Writes msr value into into the appropriate "register". 2695 * Returns 0 on success, non-0 otherwise. 2696 * Assumes vcpu_load() was already called. 2697 */ 2698static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2699{ 2700 struct vcpu_vmx *vmx = to_vmx(vcpu); 2701 struct shared_msr_entry *msr; 2702 int ret = 0; 2703 u32 msr_index = msr_info->index; 2704 u64 data = msr_info->data; 2705 2706 switch (msr_index) { 2707 case MSR_EFER: 2708 ret = kvm_set_msr_common(vcpu, msr_info); 2709 break; 2710#ifdef CONFIG_X86_64 2711 case MSR_FS_BASE: 2712 vmx_segment_cache_clear(vmx); 2713 vmcs_writel(GUEST_FS_BASE, data); 2714 break; 2715 case MSR_GS_BASE: 2716 vmx_segment_cache_clear(vmx); 2717 vmcs_writel(GUEST_GS_BASE, data); 2718 break; 2719 case MSR_KERNEL_GS_BASE: 2720 vmx_load_host_state(vmx); 2721 vmx->msr_guest_kernel_gs_base = data; 2722 break; 2723#endif 2724 case MSR_IA32_SYSENTER_CS: 2725 vmcs_write32(GUEST_SYSENTER_CS, data); 2726 break; 2727 case MSR_IA32_SYSENTER_EIP: 2728 vmcs_writel(GUEST_SYSENTER_EIP, data); 2729 break; 2730 case MSR_IA32_SYSENTER_ESP: 2731 vmcs_writel(GUEST_SYSENTER_ESP, data); 2732 break; 2733 case MSR_IA32_BNDCFGS: 2734 if (!vmx_mpx_supported()) 2735 return 1; 2736 vmcs_write64(GUEST_BNDCFGS, data); 2737 break; 2738 case MSR_IA32_TSC: 2739 kvm_write_tsc(vcpu, msr_info); 2740 break; 2741 case MSR_IA32_CR_PAT: 2742 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2743 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 2744 return 1; 2745 vmcs_write64(GUEST_IA32_PAT, data); 2746 vcpu->arch.pat = data; 2747 break; 2748 } 2749 ret = kvm_set_msr_common(vcpu, msr_info); 2750 break; 2751 case MSR_IA32_TSC_ADJUST: 2752 ret = kvm_set_msr_common(vcpu, msr_info); 2753 break; 2754 case MSR_IA32_FEATURE_CONTROL: 2755 if (!nested_vmx_allowed(vcpu) || 2756 (to_vmx(vcpu)->nested.msr_ia32_feature_control & 2757 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) 2758 return 1; 2759 vmx->nested.msr_ia32_feature_control = data; 2760 if (msr_info->host_initiated && data == 0) 2761 vmx_leave_nested(vcpu); 2762 break; 2763 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2764 return 1; /* they are read-only */ 2765 case MSR_IA32_XSS: 2766 if (!vmx_xsaves_supported()) 2767 return 1; 2768 /* 2769 * The only supported bit as of Skylake is bit 8, but 2770 * it is not supported on KVM. 2771 */ 2772 if (data != 0) 2773 return 1; 2774 vcpu->arch.ia32_xss = data; 2775 if (vcpu->arch.ia32_xss != host_xss) 2776 add_atomic_switch_msr(vmx, MSR_IA32_XSS, 2777 vcpu->arch.ia32_xss, host_xss); 2778 else 2779 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 2780 break; 2781 case MSR_TSC_AUX: 2782 if (!vmx->rdtscp_enabled) 2783 return 1; 2784 /* Check reserved bit, higher 32 bits should be zero */ 2785 if ((data >> 32) != 0) 2786 return 1; 2787 /* Otherwise falls through */ 2788 default: 2789 msr = find_msr_entry(vmx, msr_index); 2790 if (msr) { 2791 u64 old_msr_data = msr->data; 2792 msr->data = data; 2793 if (msr - vmx->guest_msrs < vmx->save_nmsrs) { 2794 preempt_disable(); 2795 ret = kvm_set_shared_msr(msr->index, msr->data, 2796 msr->mask); 2797 preempt_enable(); 2798 if (ret) 2799 msr->data = old_msr_data; 2800 } 2801 break; 2802 } 2803 ret = kvm_set_msr_common(vcpu, msr_info); 2804 } 2805 2806 return ret; 2807} 2808 2809static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2810{ 2811 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); 2812 switch (reg) { 2813 case VCPU_REGS_RSP: 2814 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2815 break; 2816 case VCPU_REGS_RIP: 2817 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2818 break; 2819 case VCPU_EXREG_PDPTR: 2820 if (enable_ept) 2821 ept_save_pdptrs(vcpu); 2822 break; 2823 default: 2824 break; 2825 } 2826} 2827 2828static __init int cpu_has_kvm_support(void) 2829{ 2830 return cpu_has_vmx(); 2831} 2832 2833static __init int vmx_disabled_by_bios(void) 2834{ 2835 u64 msr; 2836 2837 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 2838 if (msr & FEATURE_CONTROL_LOCKED) { 2839 /* launched w/ TXT and VMX disabled */ 2840 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2841 && tboot_enabled()) 2842 return 1; 2843 /* launched w/o TXT and VMX only enabled w/ TXT */ 2844 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2845 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2846 && !tboot_enabled()) { 2847 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 2848 "activate TXT before enabling KVM\n"); 2849 return 1; 2850 } 2851 /* launched w/o TXT and VMX disabled */ 2852 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2853 && !tboot_enabled()) 2854 return 1; 2855 } 2856 2857 return 0; 2858} 2859 2860static void kvm_cpu_vmxon(u64 addr) 2861{ 2862 asm volatile (ASM_VMX_VMXON_RAX 2863 : : "a"(&addr), "m"(addr) 2864 : "memory", "cc"); 2865} 2866 2867static int hardware_enable(void) 2868{ 2869 int cpu = raw_smp_processor_id(); 2870 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2871 u64 old, test_bits; 2872 2873 if (cr4_read_shadow() & X86_CR4_VMXE) 2874 return -EBUSY; 2875 2876 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2877 2878 /* 2879 * Now we can enable the vmclear operation in kdump 2880 * since the loaded_vmcss_on_cpu list on this cpu 2881 * has been initialized. 2882 * 2883 * Though the cpu is not in VMX operation now, there 2884 * is no problem to enable the vmclear operation 2885 * for the loaded_vmcss_on_cpu list is empty! 2886 */ 2887 crash_enable_local_vmclear(cpu); 2888 2889 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2890 2891 test_bits = FEATURE_CONTROL_LOCKED; 2892 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 2893 if (tboot_enabled()) 2894 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; 2895 2896 if ((old & test_bits) != test_bits) { 2897 /* enable and lock */ 2898 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 2899 } 2900 cr4_set_bits(X86_CR4_VMXE); 2901 2902 if (vmm_exclusive) { 2903 kvm_cpu_vmxon(phys_addr); 2904 ept_sync_global(); 2905 } 2906 2907 native_store_gdt(this_cpu_ptr(&host_gdt)); 2908 2909 return 0; 2910} 2911 2912static void vmclear_local_loaded_vmcss(void) 2913{ 2914 int cpu = raw_smp_processor_id(); 2915 struct loaded_vmcs *v, *n; 2916 2917 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2918 loaded_vmcss_on_cpu_link) 2919 __loaded_vmcs_clear(v); 2920} 2921 2922 2923/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() 2924 * tricks. 2925 */ 2926static void kvm_cpu_vmxoff(void) 2927{ 2928 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 2929} 2930 2931static void hardware_disable(void) 2932{ 2933 if (vmm_exclusive) { 2934 vmclear_local_loaded_vmcss(); 2935 kvm_cpu_vmxoff(); 2936 } 2937 cr4_clear_bits(X86_CR4_VMXE); 2938} 2939 2940static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 2941 u32 msr, u32 *result) 2942{ 2943 u32 vmx_msr_low, vmx_msr_high; 2944 u32 ctl = ctl_min | ctl_opt; 2945 2946 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2947 2948 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2949 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2950 2951 /* Ensure minimum (required) set of control bits are supported. */ 2952 if (ctl_min & ~ctl) 2953 return -EIO; 2954 2955 *result = ctl; 2956 return 0; 2957} 2958 2959static __init bool allow_1_setting(u32 msr, u32 ctl) 2960{ 2961 u32 vmx_msr_low, vmx_msr_high; 2962 2963 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2964 return vmx_msr_high & ctl; 2965} 2966 2967static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 2968{ 2969 u32 vmx_msr_low, vmx_msr_high; 2970 u32 min, opt, min2, opt2; 2971 u32 _pin_based_exec_control = 0; 2972 u32 _cpu_based_exec_control = 0; 2973 u32 _cpu_based_2nd_exec_control = 0; 2974 u32 _vmexit_control = 0; 2975 u32 _vmentry_control = 0; 2976 2977 min = CPU_BASED_HLT_EXITING | 2978#ifdef CONFIG_X86_64 2979 CPU_BASED_CR8_LOAD_EXITING | 2980 CPU_BASED_CR8_STORE_EXITING | 2981#endif 2982 CPU_BASED_CR3_LOAD_EXITING | 2983 CPU_BASED_CR3_STORE_EXITING | 2984 CPU_BASED_USE_IO_BITMAPS | 2985 CPU_BASED_MOV_DR_EXITING | 2986 CPU_BASED_USE_TSC_OFFSETING | 2987 CPU_BASED_MWAIT_EXITING | 2988 CPU_BASED_MONITOR_EXITING | 2989 CPU_BASED_INVLPG_EXITING | 2990 CPU_BASED_RDPMC_EXITING; 2991 2992 opt = CPU_BASED_TPR_SHADOW | 2993 CPU_BASED_USE_MSR_BITMAPS | 2994 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2995 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 2996 &_cpu_based_exec_control) < 0) 2997 return -EIO; 2998#ifdef CONFIG_X86_64 2999 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 3000 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 3001 ~CPU_BASED_CR8_STORE_EXITING; 3002#endif 3003 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 3004 min2 = 0; 3005 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 3006 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3007 SECONDARY_EXEC_WBINVD_EXITING | 3008 SECONDARY_EXEC_ENABLE_VPID | 3009 SECONDARY_EXEC_ENABLE_EPT | 3010 SECONDARY_EXEC_UNRESTRICTED_GUEST | 3011 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 3012 SECONDARY_EXEC_RDTSCP | 3013 SECONDARY_EXEC_ENABLE_INVPCID | 3014 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3015 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3016 SECONDARY_EXEC_SHADOW_VMCS | 3017 SECONDARY_EXEC_XSAVES | 3018 SECONDARY_EXEC_ENABLE_PML; 3019 if (adjust_vmx_controls(min2, opt2, 3020 MSR_IA32_VMX_PROCBASED_CTLS2, 3021 &_cpu_based_2nd_exec_control) < 0) 3022 return -EIO; 3023 } 3024#ifndef CONFIG_X86_64 3025 if (!(_cpu_based_2nd_exec_control & 3026 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 3027 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 3028#endif 3029 3030 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 3031 _cpu_based_2nd_exec_control &= ~( 3032 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3033 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3034 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 3035 3036 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 3037 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 3038 enabled */ 3039 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 3040 CPU_BASED_CR3_STORE_EXITING | 3041 CPU_BASED_INVLPG_EXITING); 3042 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 3043 vmx_capability.ept, vmx_capability.vpid); 3044 } 3045 3046 min = VM_EXIT_SAVE_DEBUG_CONTROLS; 3047#ifdef CONFIG_X86_64 3048 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 3049#endif 3050 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | 3051 VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; 3052 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 3053 &_vmexit_control) < 0) 3054 return -EIO; 3055 3056 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 3057 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; 3058 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 3059 &_pin_based_exec_control) < 0) 3060 return -EIO; 3061 3062 if (!(_cpu_based_2nd_exec_control & 3063 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || 3064 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) 3065 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 3066 3067 min = VM_ENTRY_LOAD_DEBUG_CONTROLS; 3068 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; 3069 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 3070 &_vmentry_control) < 0) 3071 return -EIO; 3072 3073 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 3074 3075 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 3076 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 3077 return -EIO; 3078 3079#ifdef CONFIG_X86_64 3080 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 3081 if (vmx_msr_high & (1u<<16)) 3082 return -EIO; 3083#endif 3084 3085 /* Require Write-Back (WB) memory type for VMCS accesses. */ 3086 if (((vmx_msr_high >> 18) & 15) != 6) 3087 return -EIO; 3088 3089 vmcs_conf->size = vmx_msr_high & 0x1fff; 3090 vmcs_conf->order = get_order(vmcs_config.size); 3091 vmcs_conf->revision_id = vmx_msr_low; 3092 3093 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 3094 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 3095 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 3096 vmcs_conf->vmexit_ctrl = _vmexit_control; 3097 vmcs_conf->vmentry_ctrl = _vmentry_control; 3098 3099 cpu_has_load_ia32_efer = 3100 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 3101 VM_ENTRY_LOAD_IA32_EFER) 3102 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 3103 VM_EXIT_LOAD_IA32_EFER); 3104 3105 cpu_has_load_perf_global_ctrl = 3106 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 3107 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 3108 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 3109 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 3110 3111 /* 3112 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL 3113 * but due to arrata below it can't be used. Workaround is to use 3114 * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. 3115 * 3116 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] 3117 * 3118 * AAK155 (model 26) 3119 * AAP115 (model 30) 3120 * AAT100 (model 37) 3121 * BC86,AAY89,BD102 (model 44) 3122 * BA97 (model 46) 3123 * 3124 */ 3125 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { 3126 switch (boot_cpu_data.x86_model) { 3127 case 26: 3128 case 30: 3129 case 37: 3130 case 44: 3131 case 46: 3132 cpu_has_load_perf_global_ctrl = false; 3133 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 3134 "does not work properly. Using workaround\n"); 3135 break; 3136 default: 3137 break; 3138 } 3139 } 3140 3141 if (cpu_has_xsaves) 3142 rdmsrl(MSR_IA32_XSS, host_xss); 3143 3144 return 0; 3145} 3146 3147static struct vmcs *alloc_vmcs_cpu(int cpu) 3148{ 3149 int node = cpu_to_node(cpu); 3150 struct page *pages; 3151 struct vmcs *vmcs; 3152 3153 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); 3154 if (!pages) 3155 return NULL; 3156 vmcs = page_address(pages); 3157 memset(vmcs, 0, vmcs_config.size); 3158 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ 3159 return vmcs; 3160} 3161 3162static struct vmcs *alloc_vmcs(void) 3163{ 3164 return alloc_vmcs_cpu(raw_smp_processor_id()); 3165} 3166 3167static void free_vmcs(struct vmcs *vmcs) 3168{ 3169 free_pages((unsigned long)vmcs, vmcs_config.order); 3170} 3171 3172/* 3173 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 3174 */ 3175static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3176{ 3177 if (!loaded_vmcs->vmcs) 3178 return; 3179 loaded_vmcs_clear(loaded_vmcs); 3180 free_vmcs(loaded_vmcs->vmcs); 3181 loaded_vmcs->vmcs = NULL; 3182} 3183 3184static void free_kvm_area(void) 3185{ 3186 int cpu; 3187 3188 for_each_possible_cpu(cpu) { 3189 free_vmcs(per_cpu(vmxarea, cpu)); 3190 per_cpu(vmxarea, cpu) = NULL; 3191 } 3192} 3193 3194static void init_vmcs_shadow_fields(void) 3195{ 3196 int i, j; 3197 3198 /* No checks for read only fields yet */ 3199 3200 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 3201 switch (shadow_read_write_fields[i]) { 3202 case GUEST_BNDCFGS: 3203 if (!vmx_mpx_supported()) 3204 continue; 3205 break; 3206 default: 3207 break; 3208 } 3209 3210 if (j < i) 3211 shadow_read_write_fields[j] = 3212 shadow_read_write_fields[i]; 3213 j++; 3214 } 3215 max_shadow_read_write_fields = j; 3216 3217 /* shadowed fields guest access without vmexit */ 3218 for (i = 0; i < max_shadow_read_write_fields; i++) { 3219 clear_bit(shadow_read_write_fields[i], 3220 vmx_vmwrite_bitmap); 3221 clear_bit(shadow_read_write_fields[i], 3222 vmx_vmread_bitmap); 3223 } 3224 for (i = 0; i < max_shadow_read_only_fields; i++) 3225 clear_bit(shadow_read_only_fields[i], 3226 vmx_vmread_bitmap); 3227} 3228 3229static __init int alloc_kvm_area(void) 3230{ 3231 int cpu; 3232 3233 for_each_possible_cpu(cpu) { 3234 struct vmcs *vmcs; 3235 3236 vmcs = alloc_vmcs_cpu(cpu); 3237 if (!vmcs) { 3238 free_kvm_area(); 3239 return -ENOMEM; 3240 } 3241 3242 per_cpu(vmxarea, cpu) = vmcs; 3243 } 3244 return 0; 3245} 3246 3247static bool emulation_required(struct kvm_vcpu *vcpu) 3248{ 3249 return emulate_invalid_guest_state && !guest_state_valid(vcpu); 3250} 3251 3252static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3253 struct kvm_segment *save) 3254{ 3255 if (!emulate_invalid_guest_state) { 3256 /* 3257 * CS and SS RPL should be equal during guest entry according 3258 * to VMX spec, but in reality it is not always so. Since vcpu 3259 * is in the middle of the transition from real mode to 3260 * protected mode it is safe to assume that RPL 0 is a good 3261 * default value. 3262 */ 3263 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3264 save->selector &= ~SEGMENT_RPL_MASK; 3265 save->dpl = save->selector & SEGMENT_RPL_MASK; 3266 save->s = 1; 3267 } 3268 vmx_set_segment(vcpu, save, seg); 3269} 3270 3271static void enter_pmode(struct kvm_vcpu *vcpu) 3272{ 3273 unsigned long flags; 3274 struct vcpu_vmx *vmx = to_vmx(vcpu); 3275 3276 /* 3277 * Update real mode segment cache. It may be not up-to-date if sement 3278 * register was written while vcpu was in a guest mode. 3279 */ 3280 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3281 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3282 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3283 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3284 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3285 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3286 3287 vmx->rmode.vm86_active = 0; 3288 3289 vmx_segment_cache_clear(vmx); 3290 3291 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3292 3293 flags = vmcs_readl(GUEST_RFLAGS); 3294 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3295 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3296 vmcs_writel(GUEST_RFLAGS, flags); 3297 3298 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3299 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3300 3301 update_exception_bitmap(vcpu); 3302 3303 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3304 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3305 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3306 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3307 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3308 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3309} 3310 3311static void fix_rmode_seg(int seg, struct kvm_segment *save) 3312{ 3313 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3314 struct kvm_segment var = *save; 3315 3316 var.dpl = 0x3; 3317 if (seg == VCPU_SREG_CS) 3318 var.type = 0x3; 3319 3320 if (!emulate_invalid_guest_state) { 3321 var.selector = var.base >> 4; 3322 var.base = var.base & 0xffff0; 3323 var.limit = 0xffff; 3324 var.g = 0; 3325 var.db = 0; 3326 var.present = 1; 3327 var.s = 1; 3328 var.l = 0; 3329 var.unusable = 0; 3330 var.type = 0x3; 3331 var.avl = 0; 3332 if (save->base & 0xf) 3333 printk_once(KERN_WARNING "kvm: segment base is not " 3334 "paragraph aligned when entering " 3335 "protected mode (seg=%d)", seg); 3336 } 3337 3338 vmcs_write16(sf->selector, var.selector); 3339 vmcs_write32(sf->base, var.base); 3340 vmcs_write32(sf->limit, var.limit); 3341 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3342} 3343 3344static void enter_rmode(struct kvm_vcpu *vcpu) 3345{ 3346 unsigned long flags; 3347 struct vcpu_vmx *vmx = to_vmx(vcpu); 3348 3349 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3350 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3351 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3352 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3353 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3354 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3355 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3356 3357 vmx->rmode.vm86_active = 1; 3358 3359 /* 3360 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3361 * vcpu. Warn the user that an update is overdue. 3362 */ 3363 if (!vcpu->kvm->arch.tss_addr) 3364 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3365 "called before entering vcpu\n"); 3366 3367 vmx_segment_cache_clear(vmx); 3368 3369 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); 3370 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3371 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3372 3373 flags = vmcs_readl(GUEST_RFLAGS); 3374 vmx->rmode.save_rflags = flags; 3375 3376 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3377 3378 vmcs_writel(GUEST_RFLAGS, flags); 3379 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3380 update_exception_bitmap(vcpu); 3381 3382 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3383 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3384 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3385 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3386 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3387 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3388 3389 kvm_mmu_reset_context(vcpu); 3390} 3391 3392static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3393{ 3394 struct vcpu_vmx *vmx = to_vmx(vcpu); 3395 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 3396 3397 if (!msr) 3398 return; 3399 3400 /* 3401 * Force kernel_gs_base reloading before EFER changes, as control 3402 * of this msr depends on is_long_mode(). 3403 */ 3404 vmx_load_host_state(to_vmx(vcpu)); 3405 vcpu->arch.efer = efer; 3406 if (efer & EFER_LMA) { 3407 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3408 msr->data = efer; 3409 } else { 3410 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3411 3412 msr->data = efer & ~EFER_LME; 3413 } 3414 setup_msrs(vmx); 3415} 3416 3417#ifdef CONFIG_X86_64 3418 3419static void enter_lmode(struct kvm_vcpu *vcpu) 3420{ 3421 u32 guest_tr_ar; 3422 3423 vmx_segment_cache_clear(to_vmx(vcpu)); 3424 3425 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3426 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 3427 pr_debug_ratelimited("%s: tss fixup for long mode. \n", 3428 __func__); 3429 vmcs_write32(GUEST_TR_AR_BYTES, 3430 (guest_tr_ar & ~AR_TYPE_MASK) 3431 | AR_TYPE_BUSY_64_TSS); 3432 } 3433 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3434} 3435 3436static void exit_lmode(struct kvm_vcpu *vcpu) 3437{ 3438 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3439 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3440} 3441 3442#endif 3443 3444static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 3445{ 3446 vpid_sync_context(to_vmx(vcpu)); 3447 if (enable_ept) { 3448 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3449 return; 3450 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 3451 } 3452} 3453 3454static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 3455{ 3456 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 3457 3458 vcpu->arch.cr0 &= ~cr0_guest_owned_bits; 3459 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 3460} 3461 3462static void vmx_decache_cr3(struct kvm_vcpu *vcpu) 3463{ 3464 if (enable_ept && is_paging(vcpu)) 3465 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3466 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3467} 3468 3469static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 3470{ 3471 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 3472 3473 vcpu->arch.cr4 &= ~cr4_guest_owned_bits; 3474 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; 3475} 3476 3477static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 3478{ 3479 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3480 3481 if (!test_bit(VCPU_EXREG_PDPTR, 3482 (unsigned long *)&vcpu->arch.regs_dirty)) 3483 return; 3484 3485 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3486 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3487 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3488 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3489 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3490 } 3491} 3492 3493static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3494{ 3495 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3496 3497 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3498 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3499 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3500 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3501 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3502 } 3503 3504 __set_bit(VCPU_EXREG_PDPTR, 3505 (unsigned long *)&vcpu->arch.regs_avail); 3506 __set_bit(VCPU_EXREG_PDPTR, 3507 (unsigned long *)&vcpu->arch.regs_dirty); 3508} 3509 3510static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 3511 3512static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 3513 unsigned long cr0, 3514 struct kvm_vcpu *vcpu) 3515{ 3516 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) 3517 vmx_decache_cr3(vcpu); 3518 if (!(cr0 & X86_CR0_PG)) { 3519 /* From paging/starting to nonpaging */ 3520 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3521 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | 3522 (CPU_BASED_CR3_LOAD_EXITING | 3523 CPU_BASED_CR3_STORE_EXITING)); 3524 vcpu->arch.cr0 = cr0; 3525 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3526 } else if (!is_paging(vcpu)) { 3527 /* From nonpaging to paging */ 3528 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3529 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & 3530 ~(CPU_BASED_CR3_LOAD_EXITING | 3531 CPU_BASED_CR3_STORE_EXITING)); 3532 vcpu->arch.cr0 = cr0; 3533 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3534 } 3535 3536 if (!(cr0 & X86_CR0_WP)) 3537 *hw_cr0 &= ~X86_CR0_WP; 3538} 3539 3540static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3541{ 3542 struct vcpu_vmx *vmx = to_vmx(vcpu); 3543 unsigned long hw_cr0; 3544 3545 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); 3546 if (enable_unrestricted_guest) 3547 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3548 else { 3549 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3550 3551 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3552 enter_pmode(vcpu); 3553 3554 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3555 enter_rmode(vcpu); 3556 } 3557 3558#ifdef CONFIG_X86_64 3559 if (vcpu->arch.efer & EFER_LME) { 3560 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 3561 enter_lmode(vcpu); 3562 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 3563 exit_lmode(vcpu); 3564 } 3565#endif 3566 3567 if (enable_ept) 3568 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 3569 3570 if (!vcpu->fpu_active) 3571 hw_cr0 |= X86_CR0_TS | X86_CR0_MP; 3572 3573 vmcs_writel(CR0_READ_SHADOW, cr0); 3574 vmcs_writel(GUEST_CR0, hw_cr0); 3575 vcpu->arch.cr0 = cr0; 3576 3577 /* depends on vcpu->arch.cr0 to be set to a new value */ 3578 vmx->emulation_required = emulation_required(vcpu); 3579} 3580 3581static u64 construct_eptp(unsigned long root_hpa) 3582{ 3583 u64 eptp; 3584 3585 /* TODO write the value reading from MSR */ 3586 eptp = VMX_EPT_DEFAULT_MT | 3587 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3588 if (enable_ept_ad_bits) 3589 eptp |= VMX_EPT_AD_ENABLE_BIT; 3590 eptp |= (root_hpa & PAGE_MASK); 3591 3592 return eptp; 3593} 3594 3595static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 3596{ 3597 unsigned long guest_cr3; 3598 u64 eptp; 3599 3600 guest_cr3 = cr3; 3601 if (enable_ept) { 3602 eptp = construct_eptp(cr3); 3603 vmcs_write64(EPT_POINTER, eptp); 3604 if (is_paging(vcpu) || is_guest_mode(vcpu)) 3605 guest_cr3 = kvm_read_cr3(vcpu); 3606 else 3607 guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; 3608 ept_load_pdptrs(vcpu); 3609 } 3610 3611 vmx_flush_tlb(vcpu); 3612 vmcs_writel(GUEST_CR3, guest_cr3); 3613} 3614 3615static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3616{ 3617 /* 3618 * Pass through host's Machine Check Enable value to hw_cr4, which 3619 * is in force while we are in guest mode. Do not let guests control 3620 * this bit, even if host CR4.MCE == 0. 3621 */ 3622 unsigned long hw_cr4 = 3623 (cr4_read_shadow() & X86_CR4_MCE) | 3624 (cr4 & ~X86_CR4_MCE) | 3625 (to_vmx(vcpu)->rmode.vm86_active ? 3626 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3627 3628 if (cr4 & X86_CR4_VMXE) { 3629 /* 3630 * To use VMXON (and later other VMX instructions), a guest 3631 * must first be able to turn on cr4.VMXE (see handle_vmon()). 3632 * So basically the check on whether to allow nested VMX 3633 * is here. 3634 */ 3635 if (!nested_vmx_allowed(vcpu)) 3636 return 1; 3637 } 3638 if (to_vmx(vcpu)->nested.vmxon && 3639 ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) 3640 return 1; 3641 3642 vcpu->arch.cr4 = cr4; 3643 if (enable_ept) { 3644 if (!is_paging(vcpu)) { 3645 hw_cr4 &= ~X86_CR4_PAE; 3646 hw_cr4 |= X86_CR4_PSE; 3647 /* 3648 * SMEP/SMAP is disabled if CPU is in non-paging mode 3649 * in hardware. However KVM always uses paging mode to 3650 * emulate guest non-paging mode with TDP. 3651 * To emulate this behavior, SMEP/SMAP needs to be 3652 * manually disabled when guest switches to non-paging 3653 * mode. 3654 */ 3655 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); 3656 } else if (!(cr4 & X86_CR4_PAE)) { 3657 hw_cr4 &= ~X86_CR4_PAE; 3658 } 3659 } 3660 3661 vmcs_writel(CR4_READ_SHADOW, cr4); 3662 vmcs_writel(GUEST_CR4, hw_cr4); 3663 return 0; 3664} 3665 3666static void vmx_get_segment(struct kvm_vcpu *vcpu, 3667 struct kvm_segment *var, int seg) 3668{ 3669 struct vcpu_vmx *vmx = to_vmx(vcpu); 3670 u32 ar; 3671 3672 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3673 *var = vmx->rmode.segs[seg]; 3674 if (seg == VCPU_SREG_TR 3675 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3676 return; 3677 var->base = vmx_read_guest_seg_base(vmx, seg); 3678 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3679 return; 3680 } 3681 var->base = vmx_read_guest_seg_base(vmx, seg); 3682 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3683 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3684 ar = vmx_read_guest_seg_ar(vmx, seg); 3685 var->unusable = (ar >> 16) & 1; 3686 var->type = ar & 15; 3687 var->s = (ar >> 4) & 1; 3688 var->dpl = (ar >> 5) & 3; 3689 /* 3690 * Some userspaces do not preserve unusable property. Since usable 3691 * segment has to be present according to VMX spec we can use present 3692 * property to amend userspace bug by making unusable segment always 3693 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3694 * segment as unusable. 3695 */ 3696 var->present = !var->unusable; 3697 var->avl = (ar >> 12) & 1; 3698 var->l = (ar >> 13) & 1; 3699 var->db = (ar >> 14) & 1; 3700 var->g = (ar >> 15) & 1; 3701} 3702 3703static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3704{ 3705 struct kvm_segment s; 3706 3707 if (to_vmx(vcpu)->rmode.vm86_active) { 3708 vmx_get_segment(vcpu, &s, seg); 3709 return s.base; 3710 } 3711 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3712} 3713 3714static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3715{ 3716 struct vcpu_vmx *vmx = to_vmx(vcpu); 3717 3718 if (unlikely(vmx->rmode.vm86_active)) 3719 return 0; 3720 else { 3721 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3722 return AR_DPL(ar); 3723 } 3724} 3725 3726static u32 vmx_segment_access_rights(struct kvm_segment *var) 3727{ 3728 u32 ar; 3729 3730 if (var->unusable || !var->present) 3731 ar = 1 << 16; 3732 else { 3733 ar = var->type & 15; 3734 ar |= (var->s & 1) << 4; 3735 ar |= (var->dpl & 3) << 5; 3736 ar |= (var->present & 1) << 7; 3737 ar |= (var->avl & 1) << 12; 3738 ar |= (var->l & 1) << 13; 3739 ar |= (var->db & 1) << 14; 3740 ar |= (var->g & 1) << 15; 3741 } 3742 3743 return ar; 3744} 3745 3746static void vmx_set_segment(struct kvm_vcpu *vcpu, 3747 struct kvm_segment *var, int seg) 3748{ 3749 struct vcpu_vmx *vmx = to_vmx(vcpu); 3750 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3751 3752 vmx_segment_cache_clear(vmx); 3753 3754 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3755 vmx->rmode.segs[seg] = *var; 3756 if (seg == VCPU_SREG_TR) 3757 vmcs_write16(sf->selector, var->selector); 3758 else if (var->s) 3759 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3760 goto out; 3761 } 3762 3763 vmcs_writel(sf->base, var->base); 3764 vmcs_write32(sf->limit, var->limit); 3765 vmcs_write16(sf->selector, var->selector); 3766 3767 /* 3768 * Fix the "Accessed" bit in AR field of segment registers for older 3769 * qemu binaries. 3770 * IA32 arch specifies that at the time of processor reset the 3771 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3772 * is setting it to 0 in the userland code. This causes invalid guest 3773 * state vmexit when "unrestricted guest" mode is turned on. 3774 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3775 * tree. Newer qemu binaries with that qemu fix would not need this 3776 * kvm hack. 3777 */ 3778 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3779 var->type |= 0x1; /* Accessed */ 3780 3781 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3782 3783out: 3784 vmx->emulation_required = emulation_required(vcpu); 3785} 3786 3787static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3788{ 3789 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3790 3791 *db = (ar >> 14) & 1; 3792 *l = (ar >> 13) & 1; 3793} 3794 3795static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3796{ 3797 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3798 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3799} 3800 3801static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3802{ 3803 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3804 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3805} 3806 3807static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3808{ 3809 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3810 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3811} 3812 3813static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3814{ 3815 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3816 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3817} 3818 3819static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3820{ 3821 struct kvm_segment var; 3822 u32 ar; 3823 3824 vmx_get_segment(vcpu, &var, seg); 3825 var.dpl = 0x3; 3826 if (seg == VCPU_SREG_CS) 3827 var.type = 0x3; 3828 ar = vmx_segment_access_rights(&var); 3829 3830 if (var.base != (var.selector << 4)) 3831 return false; 3832 if (var.limit != 0xffff) 3833 return false; 3834 if (ar != 0xf3) 3835 return false; 3836 3837 return true; 3838} 3839 3840static bool code_segment_valid(struct kvm_vcpu *vcpu) 3841{ 3842 struct kvm_segment cs; 3843 unsigned int cs_rpl; 3844 3845 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3846 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3847 3848 if (cs.unusable) 3849 return false; 3850 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) 3851 return false; 3852 if (!cs.s) 3853 return false; 3854 if (cs.type & AR_TYPE_WRITEABLE_MASK) { 3855 if (cs.dpl > cs_rpl) 3856 return false; 3857 } else { 3858 if (cs.dpl != cs_rpl) 3859 return false; 3860 } 3861 if (!cs.present) 3862 return false; 3863 3864 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3865 return true; 3866} 3867 3868static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3869{ 3870 struct kvm_segment ss; 3871 unsigned int ss_rpl; 3872 3873 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3874 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3875 3876 if (ss.unusable) 3877 return true; 3878 if (ss.type != 3 && ss.type != 7) 3879 return false; 3880 if (!ss.s) 3881 return false; 3882 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3883 return false; 3884 if (!ss.present) 3885 return false; 3886 3887 return true; 3888} 3889 3890static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3891{ 3892 struct kvm_segment var; 3893 unsigned int rpl; 3894 3895 vmx_get_segment(vcpu, &var, seg); 3896 rpl = var.selector & SEGMENT_RPL_MASK; 3897 3898 if (var.unusable) 3899 return true; 3900 if (!var.s) 3901 return false; 3902 if (!var.present) 3903 return false; 3904 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { 3905 if (var.dpl < rpl) /* DPL < RPL */ 3906 return false; 3907 } 3908 3909 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3910 * rights flags 3911 */ 3912 return true; 3913} 3914 3915static bool tr_valid(struct kvm_vcpu *vcpu) 3916{ 3917 struct kvm_segment tr; 3918 3919 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3920 3921 if (tr.unusable) 3922 return false; 3923 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3924 return false; 3925 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3926 return false; 3927 if (!tr.present) 3928 return false; 3929 3930 return true; 3931} 3932 3933static bool ldtr_valid(struct kvm_vcpu *vcpu) 3934{ 3935 struct kvm_segment ldtr; 3936 3937 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3938 3939 if (ldtr.unusable) 3940 return true; 3941 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3942 return false; 3943 if (ldtr.type != 2) 3944 return false; 3945 if (!ldtr.present) 3946 return false; 3947 3948 return true; 3949} 3950 3951static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3952{ 3953 struct kvm_segment cs, ss; 3954 3955 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3956 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3957 3958 return ((cs.selector & SEGMENT_RPL_MASK) == 3959 (ss.selector & SEGMENT_RPL_MASK)); 3960} 3961 3962/* 3963 * Check if guest state is valid. Returns true if valid, false if 3964 * not. 3965 * We assume that registers are always usable 3966 */ 3967static bool guest_state_valid(struct kvm_vcpu *vcpu) 3968{ 3969 if (enable_unrestricted_guest) 3970 return true; 3971 3972 /* real mode guest state checks */ 3973 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3974 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3975 return false; 3976 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3977 return false; 3978 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3979 return false; 3980 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3981 return false; 3982 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3983 return false; 3984 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3985 return false; 3986 } else { 3987 /* protected mode guest state checks */ 3988 if (!cs_ss_rpl_check(vcpu)) 3989 return false; 3990 if (!code_segment_valid(vcpu)) 3991 return false; 3992 if (!stack_segment_valid(vcpu)) 3993 return false; 3994 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3995 return false; 3996 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3997 return false; 3998 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3999 return false; 4000 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 4001 return false; 4002 if (!tr_valid(vcpu)) 4003 return false; 4004 if (!ldtr_valid(vcpu)) 4005 return false; 4006 } 4007 /* TODO: 4008 * - Add checks on RIP 4009 * - Add checks on RFLAGS 4010 */ 4011 4012 return true; 4013} 4014 4015static int init_rmode_tss(struct kvm *kvm) 4016{ 4017 gfn_t fn; 4018 u16 data = 0; 4019 int idx, r; 4020 4021 idx = srcu_read_lock(&kvm->srcu); 4022 fn = kvm->arch.tss_addr >> PAGE_SHIFT; 4023 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 4024 if (r < 0) 4025 goto out; 4026 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 4027 r = kvm_write_guest_page(kvm, fn++, &data, 4028 TSS_IOPB_BASE_OFFSET, sizeof(u16)); 4029 if (r < 0) 4030 goto out; 4031 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 4032 if (r < 0) 4033 goto out; 4034 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 4035 if (r < 0) 4036 goto out; 4037 data = ~0; 4038 r = kvm_write_guest_page(kvm, fn, &data, 4039 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, 4040 sizeof(u8)); 4041out: 4042 srcu_read_unlock(&kvm->srcu, idx); 4043 return r; 4044} 4045 4046static int init_rmode_identity_map(struct kvm *kvm) 4047{ 4048 int i, idx, r = 0; 4049 pfn_t identity_map_pfn; 4050 u32 tmp; 4051 4052 if (!enable_ept) 4053 return 0; 4054 4055 /* Protect kvm->arch.ept_identity_pagetable_done. */ 4056 mutex_lock(&kvm->slots_lock); 4057 4058 if (likely(kvm->arch.ept_identity_pagetable_done)) 4059 goto out2; 4060 4061 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 4062 4063 r = alloc_identity_pagetable(kvm); 4064 if (r < 0) 4065 goto out2; 4066 4067 idx = srcu_read_lock(&kvm->srcu); 4068 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 4069 if (r < 0) 4070 goto out; 4071 /* Set up identity-mapping pagetable for EPT in real mode */ 4072 for (i = 0; i < PT32_ENT_PER_PAGE; i++) { 4073 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 4074 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 4075 r = kvm_write_guest_page(kvm, identity_map_pfn, 4076 &tmp, i * sizeof(tmp), sizeof(tmp)); 4077 if (r < 0) 4078 goto out; 4079 } 4080 kvm->arch.ept_identity_pagetable_done = true; 4081 4082out: 4083 srcu_read_unlock(&kvm->srcu, idx); 4084 4085out2: 4086 mutex_unlock(&kvm->slots_lock); 4087 return r; 4088} 4089 4090static void seg_setup(int seg) 4091{ 4092 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 4093 unsigned int ar; 4094 4095 vmcs_write16(sf->selector, 0); 4096 vmcs_writel(sf->base, 0); 4097 vmcs_write32(sf->limit, 0xffff); 4098 ar = 0x93; 4099 if (seg == VCPU_SREG_CS) 4100 ar |= 0x08; /* code segment */ 4101 4102 vmcs_write32(sf->ar_bytes, ar); 4103} 4104 4105static int alloc_apic_access_page(struct kvm *kvm) 4106{ 4107 struct page *page; 4108 struct kvm_userspace_memory_region kvm_userspace_mem; 4109 int r = 0; 4110 4111 mutex_lock(&kvm->slots_lock); 4112 if (kvm->arch.apic_access_page_done) 4113 goto out; 4114 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 4115 kvm_userspace_mem.flags = 0; 4116 kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; 4117 kvm_userspace_mem.memory_size = PAGE_SIZE; 4118 r = __x86_set_memory_region(kvm, &kvm_userspace_mem); 4119 if (r) 4120 goto out; 4121 4122 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 4123 if (is_error_page(page)) { 4124 r = -EFAULT; 4125 goto out; 4126 } 4127 4128 /* 4129 * Do not pin the page in memory, so that memory hot-unplug 4130 * is able to migrate it. 4131 */ 4132 put_page(page); 4133 kvm->arch.apic_access_page_done = true; 4134out: 4135 mutex_unlock(&kvm->slots_lock); 4136 return r; 4137} 4138 4139static int alloc_identity_pagetable(struct kvm *kvm) 4140{ 4141 /* Called with kvm->slots_lock held. */ 4142 4143 struct kvm_userspace_memory_region kvm_userspace_mem; 4144 int r = 0; 4145 4146 BUG_ON(kvm->arch.ept_identity_pagetable_done); 4147 4148 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 4149 kvm_userspace_mem.flags = 0; 4150 kvm_userspace_mem.guest_phys_addr = 4151 kvm->arch.ept_identity_map_addr; 4152 kvm_userspace_mem.memory_size = PAGE_SIZE; 4153 r = __x86_set_memory_region(kvm, &kvm_userspace_mem); 4154 4155 return r; 4156} 4157 4158static void allocate_vpid(struct vcpu_vmx *vmx) 4159{ 4160 int vpid; 4161 4162 vmx->vpid = 0; 4163 if (!enable_vpid) 4164 return; 4165 spin_lock(&vmx_vpid_lock); 4166 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4167 if (vpid < VMX_NR_VPIDS) { 4168 vmx->vpid = vpid; 4169 __set_bit(vpid, vmx_vpid_bitmap); 4170 } 4171 spin_unlock(&vmx_vpid_lock); 4172} 4173 4174static void free_vpid(struct vcpu_vmx *vmx) 4175{ 4176 if (!enable_vpid) 4177 return; 4178 spin_lock(&vmx_vpid_lock); 4179 if (vmx->vpid != 0) 4180 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 4181 spin_unlock(&vmx_vpid_lock); 4182} 4183 4184#define MSR_TYPE_R 1 4185#define MSR_TYPE_W 2 4186static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, 4187 u32 msr, int type) 4188{ 4189 int f = sizeof(unsigned long); 4190 4191 if (!cpu_has_vmx_msr_bitmap()) 4192 return; 4193 4194 /* 4195 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4196 * have the write-low and read-high bitmap offsets the wrong way round. 4197 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4198 */ 4199 if (msr <= 0x1fff) { 4200 if (type & MSR_TYPE_R) 4201 /* read-low */ 4202 __clear_bit(msr, msr_bitmap + 0x000 / f); 4203 4204 if (type & MSR_TYPE_W) 4205 /* write-low */ 4206 __clear_bit(msr, msr_bitmap + 0x800 / f); 4207 4208 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4209 msr &= 0x1fff; 4210 if (type & MSR_TYPE_R) 4211 /* read-high */ 4212 __clear_bit(msr, msr_bitmap + 0x400 / f); 4213 4214 if (type & MSR_TYPE_W) 4215 /* write-high */ 4216 __clear_bit(msr, msr_bitmap + 0xc00 / f); 4217 4218 } 4219} 4220 4221static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, 4222 u32 msr, int type) 4223{ 4224 int f = sizeof(unsigned long); 4225 4226 if (!cpu_has_vmx_msr_bitmap()) 4227 return; 4228 4229 /* 4230 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4231 * have the write-low and read-high bitmap offsets the wrong way round. 4232 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4233 */ 4234 if (msr <= 0x1fff) { 4235 if (type & MSR_TYPE_R) 4236 /* read-low */ 4237 __set_bit(msr, msr_bitmap + 0x000 / f); 4238 4239 if (type & MSR_TYPE_W) 4240 /* write-low */ 4241 __set_bit(msr, msr_bitmap + 0x800 / f); 4242 4243 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4244 msr &= 0x1fff; 4245 if (type & MSR_TYPE_R) 4246 /* read-high */ 4247 __set_bit(msr, msr_bitmap + 0x400 / f); 4248 4249 if (type & MSR_TYPE_W) 4250 /* write-high */ 4251 __set_bit(msr, msr_bitmap + 0xc00 / f); 4252 4253 } 4254} 4255 4256/* 4257 * If a msr is allowed by L0, we should check whether it is allowed by L1. 4258 * The corresponding bit will be cleared unless both of L0 and L1 allow it. 4259 */ 4260static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, 4261 unsigned long *msr_bitmap_nested, 4262 u32 msr, int type) 4263{ 4264 int f = sizeof(unsigned long); 4265 4266 if (!cpu_has_vmx_msr_bitmap()) { 4267 WARN_ON(1); 4268 return; 4269 } 4270 4271 /* 4272 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4273 * have the write-low and read-high bitmap offsets the wrong way round. 4274 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4275 */ 4276 if (msr <= 0x1fff) { 4277 if (type & MSR_TYPE_R && 4278 !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) 4279 /* read-low */ 4280 __clear_bit(msr, msr_bitmap_nested + 0x000 / f); 4281 4282 if (type & MSR_TYPE_W && 4283 !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) 4284 /* write-low */ 4285 __clear_bit(msr, msr_bitmap_nested + 0x800 / f); 4286 4287 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4288 msr &= 0x1fff; 4289 if (type & MSR_TYPE_R && 4290 !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) 4291 /* read-high */ 4292 __clear_bit(msr, msr_bitmap_nested + 0x400 / f); 4293 4294 if (type & MSR_TYPE_W && 4295 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) 4296 /* write-high */ 4297 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); 4298 4299 } 4300} 4301 4302static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 4303{ 4304 if (!longmode_only) 4305 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, 4306 msr, MSR_TYPE_R | MSR_TYPE_W); 4307 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, 4308 msr, MSR_TYPE_R | MSR_TYPE_W); 4309} 4310 4311static void vmx_enable_intercept_msr_read_x2apic(u32 msr) 4312{ 4313 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4314 msr, MSR_TYPE_R); 4315 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4316 msr, MSR_TYPE_R); 4317} 4318 4319static void vmx_disable_intercept_msr_read_x2apic(u32 msr) 4320{ 4321 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4322 msr, MSR_TYPE_R); 4323 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4324 msr, MSR_TYPE_R); 4325} 4326 4327static void vmx_disable_intercept_msr_write_x2apic(u32 msr) 4328{ 4329 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4330 msr, MSR_TYPE_W); 4331 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4332 msr, MSR_TYPE_W); 4333} 4334 4335static int vmx_vm_has_apicv(struct kvm *kvm) 4336{ 4337 return enable_apicv && irqchip_in_kernel(kvm); 4338} 4339 4340static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4341{ 4342 struct vcpu_vmx *vmx = to_vmx(vcpu); 4343 int max_irr; 4344 void *vapic_page; 4345 u16 status; 4346 4347 if (vmx->nested.pi_desc && 4348 vmx->nested.pi_pending) { 4349 vmx->nested.pi_pending = false; 4350 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4351 return 0; 4352 4353 max_irr = find_last_bit( 4354 (unsigned long *)vmx->nested.pi_desc->pir, 256); 4355 4356 if (max_irr == 256) 4357 return 0; 4358 4359 vapic_page = kmap(vmx->nested.virtual_apic_page); 4360 if (!vapic_page) { 4361 WARN_ON(1); 4362 return -ENOMEM; 4363 } 4364 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); 4365 kunmap(vmx->nested.virtual_apic_page); 4366 4367 status = vmcs_read16(GUEST_INTR_STATUS); 4368 if ((u8)max_irr > ((u8)status & 0xff)) { 4369 status &= ~0xff; 4370 status |= (u8)max_irr; 4371 vmcs_write16(GUEST_INTR_STATUS, status); 4372 } 4373 } 4374 return 0; 4375} 4376 4377static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) 4378{ 4379#ifdef CONFIG_SMP 4380 if (vcpu->mode == IN_GUEST_MODE) { 4381 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), 4382 POSTED_INTR_VECTOR); 4383 return true; 4384 } 4385#endif 4386 return false; 4387} 4388 4389static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4390 int vector) 4391{ 4392 struct vcpu_vmx *vmx = to_vmx(vcpu); 4393 4394 if (is_guest_mode(vcpu) && 4395 vector == vmx->nested.posted_intr_nv) { 4396 /* the PIR and ON have been set by L1. */ 4397 kvm_vcpu_trigger_posted_interrupt(vcpu); 4398 /* 4399 * If a posted intr is not recognized by hardware, 4400 * we will accomplish it in the next vmentry. 4401 */ 4402 vmx->nested.pi_pending = true; 4403 kvm_make_request(KVM_REQ_EVENT, vcpu); 4404 return 0; 4405 } 4406 return -1; 4407} 4408/* 4409 * Send interrupt to vcpu via posted interrupt way. 4410 * 1. If target vcpu is running(non-root mode), send posted interrupt 4411 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4412 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4413 * interrupt from PIR in next vmentry. 4414 */ 4415static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4416{ 4417 struct vcpu_vmx *vmx = to_vmx(vcpu); 4418 int r; 4419 4420 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4421 if (!r) 4422 return; 4423 4424 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4425 return; 4426 4427 r = pi_test_and_set_on(&vmx->pi_desc); 4428 kvm_make_request(KVM_REQ_EVENT, vcpu); 4429 if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) 4430 kvm_vcpu_kick(vcpu); 4431} 4432 4433static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 4434{ 4435 struct vcpu_vmx *vmx = to_vmx(vcpu); 4436 4437 if (!pi_test_and_clear_on(&vmx->pi_desc)) 4438 return; 4439 4440 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); 4441} 4442 4443static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) 4444{ 4445 return; 4446} 4447 4448/* 4449 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4450 * will not change in the lifetime of the guest. 4451 * Note that host-state that does change is set elsewhere. E.g., host-state 4452 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4453 */ 4454static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4455{ 4456 u32 low32, high32; 4457 unsigned long tmpl; 4458 struct desc_ptr dt; 4459 unsigned long cr4; 4460 4461 vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ 4462 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 4463 4464 /* Save the most likely value for this task's CR4 in the VMCS. */ 4465 cr4 = cr4_read_shadow(); 4466 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4467 vmx->host_state.vmcs_host_cr4 = cr4; 4468 4469 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4470#ifdef CONFIG_X86_64 4471 /* 4472 * Load null selectors, so we can avoid reloading them in 4473 * __vmx_load_host_state(), in case userspace uses the null selectors 4474 * too (the expected case). 4475 */ 4476 vmcs_write16(HOST_DS_SELECTOR, 0); 4477 vmcs_write16(HOST_ES_SELECTOR, 0); 4478#else 4479 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4480 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4481#endif 4482 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4483 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4484 4485 native_store_idt(&dt); 4486 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 4487 vmx->host_idt_base = dt.address; 4488 4489 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ 4490 4491 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4492 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4493 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4494 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4495 4496 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4497 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4498 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4499 } 4500} 4501 4502static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4503{ 4504 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 4505 if (enable_ept) 4506 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 4507 if (is_guest_mode(&vmx->vcpu)) 4508 vmx->vcpu.arch.cr4_guest_owned_bits &= 4509 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; 4510 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 4511} 4512 4513static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4514{ 4515 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4516 4517 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4518 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4519 return pin_based_exec_ctrl; 4520} 4521 4522static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4523{ 4524 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4525 4526 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4527 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4528 4529 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { 4530 exec_control &= ~CPU_BASED_TPR_SHADOW; 4531#ifdef CONFIG_X86_64 4532 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4533 CPU_BASED_CR8_LOAD_EXITING; 4534#endif 4535 } 4536 if (!enable_ept) 4537 exec_control |= CPU_BASED_CR3_STORE_EXITING | 4538 CPU_BASED_CR3_LOAD_EXITING | 4539 CPU_BASED_INVLPG_EXITING; 4540 return exec_control; 4541} 4542 4543static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4544{ 4545 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4546 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 4547 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4548 if (vmx->vpid == 0) 4549 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4550 if (!enable_ept) { 4551 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4552 enable_unrestricted_guest = 0; 4553 /* Enable INVPCID for non-ept guests may cause performance regression. */ 4554 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 4555 } 4556 if (!enable_unrestricted_guest) 4557 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4558 if (!ple_gap) 4559 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4560 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4561 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4562 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4563 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4564 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4565 (handle_vmptrld). 4566 We can NOT enable shadow_vmcs here because we don't have yet 4567 a current VMCS12 4568 */ 4569 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4570 /* PML is enabled/disabled in creating/destorying vcpu */ 4571 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4572 4573 return exec_control; 4574} 4575 4576static void ept_set_mmio_spte_mask(void) 4577{ 4578 /* 4579 * EPT Misconfigurations can be generated if the value of bits 2:0 4580 * of an EPT paging-structure entry is 110b (write/execute). 4581 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio 4582 * spte. 4583 */ 4584 kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); 4585} 4586 4587#define VMX_XSS_EXIT_BITMAP 0 4588/* 4589 * Sets up the vmcs for emulated real mode. 4590 */ 4591static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 4592{ 4593#ifdef CONFIG_X86_64 4594 unsigned long a; 4595#endif 4596 int i; 4597 4598 /* I/O */ 4599 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 4600 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); 4601 4602 if (enable_shadow_vmcs) { 4603 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 4604 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 4605 } 4606 if (cpu_has_vmx_msr_bitmap()) 4607 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); 4608 4609 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 4610 4611 /* Control */ 4612 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); 4613 4614 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4615 4616 if (cpu_has_secondary_exec_ctrls()) { 4617 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 4618 vmx_secondary_exec_control(vmx)); 4619 } 4620 4621 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { 4622 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4623 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4624 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4625 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4626 4627 vmcs_write16(GUEST_INTR_STATUS, 0); 4628 4629 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4630 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4631 } 4632 4633 if (ple_gap) { 4634 vmcs_write32(PLE_GAP, ple_gap); 4635 vmx->ple_window = ple_window; 4636 vmx->ple_window_dirty = true; 4637 } 4638 4639 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4640 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4641 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4642 4643 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4644 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4645 vmx_set_constant_host_state(vmx); 4646#ifdef CONFIG_X86_64 4647 rdmsrl(MSR_FS_BASE, a); 4648 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 4649 rdmsrl(MSR_GS_BASE, a); 4650 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ 4651#else 4652 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4653 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4654#endif 4655 4656 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4657 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4658 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 4659 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4660 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 4661 4662 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4663 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4664 4665 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { 4666 u32 index = vmx_msr_index[i]; 4667 u32 data_low, data_high; 4668 int j = vmx->nmsrs; 4669 4670 if (rdmsr_safe(index, &data_low, &data_high) < 0) 4671 continue; 4672 if (wrmsr_safe(index, data_low, data_high) < 0) 4673 continue; 4674 vmx->guest_msrs[j].index = i; 4675 vmx->guest_msrs[j].data = 0; 4676 vmx->guest_msrs[j].mask = -1ull; 4677 ++vmx->nmsrs; 4678 } 4679 4680 4681 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); 4682 4683 /* 22.2.1, 20.8.1 */ 4684 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); 4685 4686 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 4687 set_cr4_guest_host_mask(vmx); 4688 4689 if (vmx_xsaves_supported()) 4690 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4691 4692 return 0; 4693} 4694 4695static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4696{ 4697 struct vcpu_vmx *vmx = to_vmx(vcpu); 4698 struct msr_data apic_base_msr; 4699 u64 cr0; 4700 4701 vmx->rmode.vm86_active = 0; 4702 4703 vmx->soft_vnmi_blocked = 0; 4704 4705 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4706 kvm_set_cr8(vcpu, 0); 4707 4708 if (!init_event) { 4709 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | 4710 MSR_IA32_APICBASE_ENABLE; 4711 if (kvm_vcpu_is_reset_bsp(vcpu)) 4712 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4713 apic_base_msr.host_initiated = true; 4714 kvm_set_apic_base(vcpu, &apic_base_msr); 4715 } 4716 4717 vmx_segment_cache_clear(vmx); 4718 4719 seg_setup(VCPU_SREG_CS); 4720 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4721 vmcs_write32(GUEST_CS_BASE, 0xffff0000); 4722 4723 seg_setup(VCPU_SREG_DS); 4724 seg_setup(VCPU_SREG_ES); 4725 seg_setup(VCPU_SREG_FS); 4726 seg_setup(VCPU_SREG_GS); 4727 seg_setup(VCPU_SREG_SS); 4728 4729 vmcs_write16(GUEST_TR_SELECTOR, 0); 4730 vmcs_writel(GUEST_TR_BASE, 0); 4731 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4732 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4733 4734 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4735 vmcs_writel(GUEST_LDTR_BASE, 0); 4736 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4737 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4738 4739 if (!init_event) { 4740 vmcs_write32(GUEST_SYSENTER_CS, 0); 4741 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4742 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4743 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4744 } 4745 4746 vmcs_writel(GUEST_RFLAGS, 0x02); 4747 kvm_rip_write(vcpu, 0xfff0); 4748 4749 vmcs_writel(GUEST_GDTR_BASE, 0); 4750 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4751 4752 vmcs_writel(GUEST_IDTR_BASE, 0); 4753 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4754 4755 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4756 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4757 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4758 4759 setup_msrs(vmx); 4760 4761 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4762 4763 if (cpu_has_vmx_tpr_shadow() && !init_event) { 4764 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4765 if (vm_need_tpr_shadow(vcpu->kvm)) 4766 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4767 __pa(vcpu->arch.apic->regs)); 4768 vmcs_write32(TPR_THRESHOLD, 0); 4769 } 4770 4771 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4772 4773 if (vmx_vm_has_apicv(vcpu->kvm)) 4774 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); 4775 4776 if (vmx->vpid != 0) 4777 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4778 4779 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 4780 vmx_set_cr0(vcpu, cr0); /* enter rmode */ 4781 vmx->vcpu.arch.cr0 = cr0; 4782 vmx_set_cr4(vcpu, 0); 4783 if (!init_event) 4784 vmx_set_efer(vcpu, 0); 4785 vmx_fpu_activate(vcpu); 4786 update_exception_bitmap(vcpu); 4787 4788 vpid_sync_context(vmx); 4789} 4790 4791/* 4792 * In nested virtualization, check if L1 asked to exit on external interrupts. 4793 * For most existing hypervisors, this will always return true. 4794 */ 4795static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) 4796{ 4797 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4798 PIN_BASED_EXT_INTR_MASK; 4799} 4800 4801/* 4802 * In nested virtualization, check if L1 has set 4803 * VM_EXIT_ACK_INTR_ON_EXIT 4804 */ 4805static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 4806{ 4807 return get_vmcs12(vcpu)->vm_exit_controls & 4808 VM_EXIT_ACK_INTR_ON_EXIT; 4809} 4810 4811static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 4812{ 4813 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4814 PIN_BASED_NMI_EXITING; 4815} 4816 4817static void enable_irq_window(struct kvm_vcpu *vcpu) 4818{ 4819 u32 cpu_based_vm_exec_control; 4820 4821 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4822 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 4823 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4824} 4825 4826static void enable_nmi_window(struct kvm_vcpu *vcpu) 4827{ 4828 u32 cpu_based_vm_exec_control; 4829 4830 if (!cpu_has_virtual_nmis() || 4831 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4832 enable_irq_window(vcpu); 4833 return; 4834 } 4835 4836 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4837 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 4838 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4839} 4840 4841static void vmx_inject_irq(struct kvm_vcpu *vcpu) 4842{ 4843 struct vcpu_vmx *vmx = to_vmx(vcpu); 4844 uint32_t intr; 4845 int irq = vcpu->arch.interrupt.nr; 4846 4847 trace_kvm_inj_virq(irq); 4848 4849 ++vcpu->stat.irq_injections; 4850 if (vmx->rmode.vm86_active) { 4851 int inc_eip = 0; 4852 if (vcpu->arch.interrupt.soft) 4853 inc_eip = vcpu->arch.event_exit_inst_len; 4854 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) 4855 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4856 return; 4857 } 4858 intr = irq | INTR_INFO_VALID_MASK; 4859 if (vcpu->arch.interrupt.soft) { 4860 intr |= INTR_TYPE_SOFT_INTR; 4861 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4862 vmx->vcpu.arch.event_exit_inst_len); 4863 } else 4864 intr |= INTR_TYPE_EXT_INTR; 4865 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4866} 4867 4868static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4869{ 4870 struct vcpu_vmx *vmx = to_vmx(vcpu); 4871 4872 if (is_guest_mode(vcpu)) 4873 return; 4874 4875 if (!cpu_has_virtual_nmis()) { 4876 /* 4877 * Tracking the NMI-blocked state in software is built upon 4878 * finding the next open IRQ window. This, in turn, depends on 4879 * well-behaving guests: They have to keep IRQs disabled at 4880 * least as long as the NMI handler runs. Otherwise we may 4881 * cause NMI nesting, maybe breaking the guest. But as this is 4882 * highly unlikely, we can live with the residual risk. 4883 */ 4884 vmx->soft_vnmi_blocked = 1; 4885 vmx->vnmi_blocked_time = 0; 4886 } 4887 4888 ++vcpu->stat.nmi_injections; 4889 vmx->nmi_known_unmasked = false; 4890 if (vmx->rmode.vm86_active) { 4891 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) 4892 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4893 return; 4894 } 4895 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4896 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4897} 4898 4899static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4900{ 4901 if (!cpu_has_virtual_nmis()) 4902 return to_vmx(vcpu)->soft_vnmi_blocked; 4903 if (to_vmx(vcpu)->nmi_known_unmasked) 4904 return false; 4905 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4906} 4907 4908static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4909{ 4910 struct vcpu_vmx *vmx = to_vmx(vcpu); 4911 4912 if (!cpu_has_virtual_nmis()) { 4913 if (vmx->soft_vnmi_blocked != masked) { 4914 vmx->soft_vnmi_blocked = masked; 4915 vmx->vnmi_blocked_time = 0; 4916 } 4917 } else { 4918 vmx->nmi_known_unmasked = !masked; 4919 if (masked) 4920 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4921 GUEST_INTR_STATE_NMI); 4922 else 4923 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4924 GUEST_INTR_STATE_NMI); 4925 } 4926} 4927 4928static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4929{ 4930 if (to_vmx(vcpu)->nested.nested_run_pending) 4931 return 0; 4932 4933 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 4934 return 0; 4935 4936 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4937 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI 4938 | GUEST_INTR_STATE_NMI)); 4939} 4940 4941static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4942{ 4943 return (!to_vmx(vcpu)->nested.nested_run_pending && 4944 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 4945 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4946 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 4947} 4948 4949static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 4950{ 4951 int ret; 4952 struct kvm_userspace_memory_region tss_mem = { 4953 .slot = TSS_PRIVATE_MEMSLOT, 4954 .guest_phys_addr = addr, 4955 .memory_size = PAGE_SIZE * 3, 4956 .flags = 0, 4957 }; 4958 4959 ret = x86_set_memory_region(kvm, &tss_mem); 4960 if (ret) 4961 return ret; 4962 kvm->arch.tss_addr = addr; 4963 return init_rmode_tss(kvm); 4964} 4965 4966static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 4967{ 4968 switch (vec) { 4969 case BP_VECTOR: 4970 /* 4971 * Update instruction length as we may reinject the exception 4972 * from user space while in guest debugging mode. 4973 */ 4974 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 4975 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4976 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 4977 return false; 4978 /* fall through */ 4979 case DB_VECTOR: 4980 if (vcpu->guest_debug & 4981 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 4982 return false; 4983 /* fall through */ 4984 case DE_VECTOR: 4985 case OF_VECTOR: 4986 case BR_VECTOR: 4987 case UD_VECTOR: 4988 case DF_VECTOR: 4989 case SS_VECTOR: 4990 case GP_VECTOR: 4991 case MF_VECTOR: 4992 return true; 4993 break; 4994 } 4995 return false; 4996} 4997 4998static int handle_rmode_exception(struct kvm_vcpu *vcpu, 4999 int vec, u32 err_code) 5000{ 5001 /* 5002 * Instruction with address size override prefix opcode 0x67 5003 * Cause the #SS fault with 0 error code in VM86 mode. 5004 */ 5005 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5006 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { 5007 if (vcpu->arch.halt_request) { 5008 vcpu->arch.halt_request = 0; 5009 return kvm_vcpu_halt(vcpu); 5010 } 5011 return 1; 5012 } 5013 return 0; 5014 } 5015 5016 /* 5017 * Forward all other exceptions that are valid in real mode. 5018 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5019 * the required debugging infrastructure rework. 5020 */ 5021 kvm_queue_exception(vcpu, vec); 5022 return 1; 5023} 5024 5025/* 5026 * Trigger machine check on the host. We assume all the MSRs are already set up 5027 * by the CPU and that we still run on the same CPU as the MCE occurred on. 5028 * We pass a fake environment to the machine check handler because we want 5029 * the guest to be always treated like user space, no matter what context 5030 * it used internally. 5031 */ 5032static void kvm_machine_check(void) 5033{ 5034#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) 5035 struct pt_regs regs = { 5036 .cs = 3, /* Fake ring 3 no matter what the guest ran on */ 5037 .flags = X86_EFLAGS_IF, 5038 }; 5039 5040 do_machine_check(&regs, 0); 5041#endif 5042} 5043 5044static int handle_machine_check(struct kvm_vcpu *vcpu) 5045{ 5046 /* already handled by vcpu_run */ 5047 return 1; 5048} 5049 5050static int handle_exception(struct kvm_vcpu *vcpu) 5051{ 5052 struct vcpu_vmx *vmx = to_vmx(vcpu); 5053 struct kvm_run *kvm_run = vcpu->run; 5054 u32 intr_info, ex_no, error_code; 5055 unsigned long cr2, rip, dr6; 5056 u32 vect_info; 5057 enum emulation_result er; 5058 5059 vect_info = vmx->idt_vectoring_info; 5060 intr_info = vmx->exit_intr_info; 5061 5062 if (is_machine_check(intr_info)) 5063 return handle_machine_check(vcpu); 5064 5065 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 5066 return 1; /* already handled by vmx_vcpu_run() */ 5067 5068 if (is_no_device(intr_info)) { 5069 vmx_fpu_activate(vcpu); 5070 return 1; 5071 } 5072 5073 if (is_invalid_opcode(intr_info)) { 5074 if (is_guest_mode(vcpu)) { 5075 kvm_queue_exception(vcpu, UD_VECTOR); 5076 return 1; 5077 } 5078 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); 5079 if (er != EMULATE_DONE) 5080 kvm_queue_exception(vcpu, UD_VECTOR); 5081 return 1; 5082 } 5083 5084 error_code = 0; 5085 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5086 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5087 5088 /* 5089 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5090 * MMIO, it is better to report an internal error. 5091 * See the comments in vmx_handle_exit. 5092 */ 5093 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5094 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5095 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5096 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5097 vcpu->run->internal.ndata = 3; 5098 vcpu->run->internal.data[0] = vect_info; 5099 vcpu->run->internal.data[1] = intr_info; 5100 vcpu->run->internal.data[2] = error_code; 5101 return 0; 5102 } 5103 5104 if (is_page_fault(intr_info)) { 5105 /* EPT won't cause page fault directly */ 5106 BUG_ON(enable_ept); 5107 cr2 = vmcs_readl(EXIT_QUALIFICATION); 5108 trace_kvm_page_fault(cr2, error_code); 5109 5110 if (kvm_event_needs_reinjection(vcpu)) 5111 kvm_mmu_unprotect_page_virt(vcpu, cr2); 5112 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 5113 } 5114 5115 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5116 5117 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5118 return handle_rmode_exception(vcpu, ex_no, error_code); 5119 5120 switch (ex_no) { 5121 case DB_VECTOR: 5122 dr6 = vmcs_readl(EXIT_QUALIFICATION); 5123 if (!(vcpu->guest_debug & 5124 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5125 vcpu->arch.dr6 &= ~15; 5126 vcpu->arch.dr6 |= dr6 | DR6_RTM; 5127 if (!(dr6 & ~DR6_RESERVED)) /* icebp */ 5128 skip_emulated_instruction(vcpu); 5129 5130 kvm_queue_exception(vcpu, DB_VECTOR); 5131 return 1; 5132 } 5133 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; 5134 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5135 /* fall through */ 5136 case BP_VECTOR: 5137 /* 5138 * Update instruction length as we may reinject #BP from 5139 * user space while in guest debugging mode. Reading it for 5140 * #DB as well causes no harm, it is not used in that case. 5141 */ 5142 vmx->vcpu.arch.event_exit_inst_len = 5143 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5144 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5145 rip = kvm_rip_read(vcpu); 5146 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 5147 kvm_run->debug.arch.exception = ex_no; 5148 break; 5149 default: 5150 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5151 kvm_run->ex.exception = ex_no; 5152 kvm_run->ex.error_code = error_code; 5153 break; 5154 } 5155 return 0; 5156} 5157 5158static int handle_external_interrupt(struct kvm_vcpu *vcpu) 5159{ 5160 ++vcpu->stat.irq_exits; 5161 return 1; 5162} 5163 5164static int handle_triple_fault(struct kvm_vcpu *vcpu) 5165{ 5166 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5167 return 0; 5168} 5169 5170static int handle_io(struct kvm_vcpu *vcpu) 5171{ 5172 unsigned long exit_qualification; 5173 int size, in, string; 5174 unsigned port; 5175 5176 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5177 string = (exit_qualification & 16) != 0; 5178 in = (exit_qualification & 8) != 0; 5179 5180 ++vcpu->stat.io_exits; 5181 5182 if (string || in) 5183 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5184 5185 port = exit_qualification >> 16; 5186 size = (exit_qualification & 7) + 1; 5187 skip_emulated_instruction(vcpu); 5188 5189 return kvm_fast_pio_out(vcpu, size, port); 5190} 5191 5192static void 5193vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5194{ 5195 /* 5196 * Patch in the VMCALL instruction: 5197 */ 5198 hypercall[0] = 0x0f; 5199 hypercall[1] = 0x01; 5200 hypercall[2] = 0xc1; 5201} 5202 5203static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) 5204{ 5205 unsigned long always_on = VMXON_CR0_ALWAYSON; 5206 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5207 5208 if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & 5209 SECONDARY_EXEC_UNRESTRICTED_GUEST && 5210 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 5211 always_on &= ~(X86_CR0_PE | X86_CR0_PG); 5212 return (val & always_on) == always_on; 5213} 5214 5215/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5216static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5217{ 5218 if (is_guest_mode(vcpu)) { 5219 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5220 unsigned long orig_val = val; 5221 5222 /* 5223 * We get here when L2 changed cr0 in a way that did not change 5224 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5225 * but did change L0 shadowed bits. So we first calculate the 5226 * effective cr0 value that L1 would like to write into the 5227 * hardware. It consists of the L2-owned bits from the new 5228 * value combined with the L1-owned bits from L1's guest_cr0. 5229 */ 5230 val = (val & ~vmcs12->cr0_guest_host_mask) | 5231 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5232 5233 if (!nested_cr0_valid(vcpu, val)) 5234 return 1; 5235 5236 if (kvm_set_cr0(vcpu, val)) 5237 return 1; 5238 vmcs_writel(CR0_READ_SHADOW, orig_val); 5239 return 0; 5240 } else { 5241 if (to_vmx(vcpu)->nested.vmxon && 5242 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) 5243 return 1; 5244 return kvm_set_cr0(vcpu, val); 5245 } 5246} 5247 5248static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5249{ 5250 if (is_guest_mode(vcpu)) { 5251 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5252 unsigned long orig_val = val; 5253 5254 /* analogously to handle_set_cr0 */ 5255 val = (val & ~vmcs12->cr4_guest_host_mask) | 5256 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5257 if (kvm_set_cr4(vcpu, val)) 5258 return 1; 5259 vmcs_writel(CR4_READ_SHADOW, orig_val); 5260 return 0; 5261 } else 5262 return kvm_set_cr4(vcpu, val); 5263} 5264 5265/* called to set cr0 as approriate for clts instruction exit. */ 5266static void handle_clts(struct kvm_vcpu *vcpu) 5267{ 5268 if (is_guest_mode(vcpu)) { 5269 /* 5270 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS 5271 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, 5272 * just pretend it's off (also in arch.cr0 for fpu_activate). 5273 */ 5274 vmcs_writel(CR0_READ_SHADOW, 5275 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); 5276 vcpu->arch.cr0 &= ~X86_CR0_TS; 5277 } else 5278 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 5279} 5280 5281static int handle_cr(struct kvm_vcpu *vcpu) 5282{ 5283 unsigned long exit_qualification, val; 5284 int cr; 5285 int reg; 5286 int err; 5287 5288 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5289 cr = exit_qualification & 15; 5290 reg = (exit_qualification >> 8) & 15; 5291 switch ((exit_qualification >> 4) & 3) { 5292 case 0: /* mov to cr */ 5293 val = kvm_register_readl(vcpu, reg); 5294 trace_kvm_cr_write(cr, val); 5295 switch (cr) { 5296 case 0: 5297 err = handle_set_cr0(vcpu, val); 5298 kvm_complete_insn_gp(vcpu, err); 5299 return 1; 5300 case 3: 5301 err = kvm_set_cr3(vcpu, val); 5302 kvm_complete_insn_gp(vcpu, err); 5303 return 1; 5304 case 4: 5305 err = handle_set_cr4(vcpu, val); 5306 kvm_complete_insn_gp(vcpu, err); 5307 return 1; 5308 case 8: { 5309 u8 cr8_prev = kvm_get_cr8(vcpu); 5310 u8 cr8 = (u8)val; 5311 err = kvm_set_cr8(vcpu, cr8); 5312 kvm_complete_insn_gp(vcpu, err); 5313 if (irqchip_in_kernel(vcpu->kvm)) 5314 return 1; 5315 if (cr8_prev <= cr8) 5316 return 1; 5317 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5318 return 0; 5319 } 5320 } 5321 break; 5322 case 2: /* clts */ 5323 handle_clts(vcpu); 5324 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 5325 skip_emulated_instruction(vcpu); 5326 vmx_fpu_activate(vcpu); 5327 return 1; 5328 case 1: /*mov from cr*/ 5329 switch (cr) { 5330 case 3: 5331 val = kvm_read_cr3(vcpu); 5332 kvm_register_write(vcpu, reg, val); 5333 trace_kvm_cr_read(cr, val); 5334 skip_emulated_instruction(vcpu); 5335 return 1; 5336 case 8: 5337 val = kvm_get_cr8(vcpu); 5338 kvm_register_write(vcpu, reg, val); 5339 trace_kvm_cr_read(cr, val); 5340 skip_emulated_instruction(vcpu); 5341 return 1; 5342 } 5343 break; 5344 case 3: /* lmsw */ 5345 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5346 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); 5347 kvm_lmsw(vcpu, val); 5348 5349 skip_emulated_instruction(vcpu); 5350 return 1; 5351 default: 5352 break; 5353 } 5354 vcpu->run->exit_reason = 0; 5355 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5356 (int)(exit_qualification >> 4) & 3, cr); 5357 return 0; 5358} 5359 5360static int handle_dr(struct kvm_vcpu *vcpu) 5361{ 5362 unsigned long exit_qualification; 5363 int dr, dr7, reg; 5364 5365 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5366 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5367 5368 /* First, if DR does not exist, trigger UD */ 5369 if (!kvm_require_dr(vcpu, dr)) 5370 return 1; 5371 5372 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 5373 if (!kvm_require_cpl(vcpu, 0)) 5374 return 1; 5375 dr7 = vmcs_readl(GUEST_DR7); 5376 if (dr7 & DR7_GD) { 5377 /* 5378 * As the vm-exit takes precedence over the debug trap, we 5379 * need to emulate the latter, either for the host or the 5380 * guest debugging itself. 5381 */ 5382 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5383 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; 5384 vcpu->run->debug.arch.dr7 = dr7; 5385 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5386 vcpu->run->debug.arch.exception = DB_VECTOR; 5387 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5388 return 0; 5389 } else { 5390 vcpu->arch.dr6 &= ~15; 5391 vcpu->arch.dr6 |= DR6_BD | DR6_RTM; 5392 kvm_queue_exception(vcpu, DB_VECTOR); 5393 return 1; 5394 } 5395 } 5396 5397 if (vcpu->guest_debug == 0) { 5398 u32 cpu_based_vm_exec_control; 5399 5400 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5401 cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; 5402 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5403 5404 /* 5405 * No more DR vmexits; force a reload of the debug registers 5406 * and reenter on this instruction. The next vmexit will 5407 * retrieve the full state of the debug registers. 5408 */ 5409 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5410 return 1; 5411 } 5412 5413 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5414 if (exit_qualification & TYPE_MOV_FROM_DR) { 5415 unsigned long val; 5416 5417 if (kvm_get_dr(vcpu, dr, &val)) 5418 return 1; 5419 kvm_register_write(vcpu, reg, val); 5420 } else 5421 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) 5422 return 1; 5423 5424 skip_emulated_instruction(vcpu); 5425 return 1; 5426} 5427 5428static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) 5429{ 5430 return vcpu->arch.dr6; 5431} 5432 5433static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5434{ 5435} 5436 5437static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5438{ 5439 u32 cpu_based_vm_exec_control; 5440 5441 get_debugreg(vcpu->arch.db[0], 0); 5442 get_debugreg(vcpu->arch.db[1], 1); 5443 get_debugreg(vcpu->arch.db[2], 2); 5444 get_debugreg(vcpu->arch.db[3], 3); 5445 get_debugreg(vcpu->arch.dr6, 6); 5446 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5447 5448 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5449 5450 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5451 cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; 5452 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5453} 5454 5455static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5456{ 5457 vmcs_writel(GUEST_DR7, val); 5458} 5459 5460static int handle_cpuid(struct kvm_vcpu *vcpu) 5461{ 5462 kvm_emulate_cpuid(vcpu); 5463 return 1; 5464} 5465 5466static int handle_rdmsr(struct kvm_vcpu *vcpu) 5467{ 5468 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5469 struct msr_data msr_info; 5470 5471 msr_info.index = ecx; 5472 msr_info.host_initiated = false; 5473 if (vmx_get_msr(vcpu, &msr_info)) { 5474 trace_kvm_msr_read_ex(ecx); 5475 kvm_inject_gp(vcpu, 0); 5476 return 1; 5477 } 5478 5479 trace_kvm_msr_read(ecx, msr_info.data); 5480 5481 /* FIXME: handling of bits 32:63 of rax, rdx */ 5482 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; 5483 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; 5484 skip_emulated_instruction(vcpu); 5485 return 1; 5486} 5487 5488static int handle_wrmsr(struct kvm_vcpu *vcpu) 5489{ 5490 struct msr_data msr; 5491 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5492 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 5493 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 5494 5495 msr.data = data; 5496 msr.index = ecx; 5497 msr.host_initiated = false; 5498 if (kvm_set_msr(vcpu, &msr) != 0) { 5499 trace_kvm_msr_write_ex(ecx, data); 5500 kvm_inject_gp(vcpu, 0); 5501 return 1; 5502 } 5503 5504 trace_kvm_msr_write(ecx, data); 5505 skip_emulated_instruction(vcpu); 5506 return 1; 5507} 5508 5509static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5510{ 5511 kvm_make_request(KVM_REQ_EVENT, vcpu); 5512 return 1; 5513} 5514 5515static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5516{ 5517 u32 cpu_based_vm_exec_control; 5518 5519 /* clear pending irq */ 5520 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5521 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 5522 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5523 5524 kvm_make_request(KVM_REQ_EVENT, vcpu); 5525 5526 ++vcpu->stat.irq_window_exits; 5527 5528 /* 5529 * If the user space waits to inject interrupts, exit as soon as 5530 * possible 5531 */ 5532 if (!irqchip_in_kernel(vcpu->kvm) && 5533 vcpu->run->request_interrupt_window && 5534 !kvm_cpu_has_interrupt(vcpu)) { 5535 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 5536 return 0; 5537 } 5538 return 1; 5539} 5540 5541static int handle_halt(struct kvm_vcpu *vcpu) 5542{ 5543 return kvm_emulate_halt(vcpu); 5544} 5545 5546static int handle_vmcall(struct kvm_vcpu *vcpu) 5547{ 5548 kvm_emulate_hypercall(vcpu); 5549 return 1; 5550} 5551 5552static int handle_invd(struct kvm_vcpu *vcpu) 5553{ 5554 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5555} 5556 5557static int handle_invlpg(struct kvm_vcpu *vcpu) 5558{ 5559 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5560 5561 kvm_mmu_invlpg(vcpu, exit_qualification); 5562 skip_emulated_instruction(vcpu); 5563 return 1; 5564} 5565 5566static int handle_rdpmc(struct kvm_vcpu *vcpu) 5567{ 5568 int err; 5569 5570 err = kvm_rdpmc(vcpu); 5571 kvm_complete_insn_gp(vcpu, err); 5572 5573 return 1; 5574} 5575 5576static int handle_wbinvd(struct kvm_vcpu *vcpu) 5577{ 5578 kvm_emulate_wbinvd(vcpu); 5579 return 1; 5580} 5581 5582static int handle_xsetbv(struct kvm_vcpu *vcpu) 5583{ 5584 u64 new_bv = kvm_read_edx_eax(vcpu); 5585 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 5586 5587 if (kvm_set_xcr(vcpu, index, new_bv) == 0) 5588 skip_emulated_instruction(vcpu); 5589 return 1; 5590} 5591 5592static int handle_xsaves(struct kvm_vcpu *vcpu) 5593{ 5594 skip_emulated_instruction(vcpu); 5595 WARN(1, "this should never happen\n"); 5596 return 1; 5597} 5598 5599static int handle_xrstors(struct kvm_vcpu *vcpu) 5600{ 5601 skip_emulated_instruction(vcpu); 5602 WARN(1, "this should never happen\n"); 5603 return 1; 5604} 5605 5606static int handle_apic_access(struct kvm_vcpu *vcpu) 5607{ 5608 if (likely(fasteoi)) { 5609 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5610 int access_type, offset; 5611 5612 access_type = exit_qualification & APIC_ACCESS_TYPE; 5613 offset = exit_qualification & APIC_ACCESS_OFFSET; 5614 /* 5615 * Sane guest uses MOV to write EOI, with written value 5616 * not cared. So make a short-circuit here by avoiding 5617 * heavy instruction emulation. 5618 */ 5619 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5620 (offset == APIC_EOI)) { 5621 kvm_lapic_set_eoi(vcpu); 5622 skip_emulated_instruction(vcpu); 5623 return 1; 5624 } 5625 } 5626 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5627} 5628 5629static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5630{ 5631 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5632 int vector = exit_qualification & 0xff; 5633 5634 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5635 kvm_apic_set_eoi_accelerated(vcpu, vector); 5636 return 1; 5637} 5638 5639static int handle_apic_write(struct kvm_vcpu *vcpu) 5640{ 5641 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5642 u32 offset = exit_qualification & 0xfff; 5643 5644 /* APIC-write VM exit is trap-like and thus no need to adjust IP */ 5645 kvm_apic_write_nodecode(vcpu, offset); 5646 return 1; 5647} 5648 5649static int handle_task_switch(struct kvm_vcpu *vcpu) 5650{ 5651 struct vcpu_vmx *vmx = to_vmx(vcpu); 5652 unsigned long exit_qualification; 5653 bool has_error_code = false; 5654 u32 error_code = 0; 5655 u16 tss_selector; 5656 int reason, type, idt_v, idt_index; 5657 5658 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5659 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5660 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5661 5662 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5663 5664 reason = (u32)exit_qualification >> 30; 5665 if (reason == TASK_SWITCH_GATE && idt_v) { 5666 switch (type) { 5667 case INTR_TYPE_NMI_INTR: 5668 vcpu->arch.nmi_injected = false; 5669 vmx_set_nmi_mask(vcpu, true); 5670 break; 5671 case INTR_TYPE_EXT_INTR: 5672 case INTR_TYPE_SOFT_INTR: 5673 kvm_clear_interrupt_queue(vcpu); 5674 break; 5675 case INTR_TYPE_HARD_EXCEPTION: 5676 if (vmx->idt_vectoring_info & 5677 VECTORING_INFO_DELIVER_CODE_MASK) { 5678 has_error_code = true; 5679 error_code = 5680 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5681 } 5682 /* fall through */ 5683 case INTR_TYPE_SOFT_EXCEPTION: 5684 kvm_clear_exception_queue(vcpu); 5685 break; 5686 default: 5687 break; 5688 } 5689 } 5690 tss_selector = exit_qualification; 5691 5692 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5693 type != INTR_TYPE_EXT_INTR && 5694 type != INTR_TYPE_NMI_INTR)) 5695 skip_emulated_instruction(vcpu); 5696 5697 if (kvm_task_switch(vcpu, tss_selector, 5698 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, 5699 has_error_code, error_code) == EMULATE_FAIL) { 5700 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5701 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5702 vcpu->run->internal.ndata = 0; 5703 return 0; 5704 } 5705 5706 /* 5707 * TODO: What about debug traps on tss switch? 5708 * Are we supposed to inject them and update dr6? 5709 */ 5710 5711 return 1; 5712} 5713 5714static int handle_ept_violation(struct kvm_vcpu *vcpu) 5715{ 5716 unsigned long exit_qualification; 5717 gpa_t gpa; 5718 u32 error_code; 5719 int gla_validity; 5720 5721 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5722 5723 gla_validity = (exit_qualification >> 7) & 0x3; 5724 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 5725 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 5726 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 5727 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 5728 vmcs_readl(GUEST_LINEAR_ADDRESS)); 5729 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 5730 (long unsigned int)exit_qualification); 5731 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 5732 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 5733 return 0; 5734 } 5735 5736 /* 5737 * EPT violation happened while executing iret from NMI, 5738 * "blocked by NMI" bit has to be set before next VM entry. 5739 * There are errata that may cause this bit to not be set: 5740 * AAK134, BY25. 5741 */ 5742 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5743 cpu_has_virtual_nmis() && 5744 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5745 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5746 5747 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5748 trace_kvm_page_fault(gpa, exit_qualification); 5749 5750 /* It is a write fault? */ 5751 error_code = exit_qualification & PFERR_WRITE_MASK; 5752 /* It is a fetch fault? */ 5753 error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; 5754 /* ept page table is present? */ 5755 error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; 5756 5757 vcpu->arch.exit_qualification = exit_qualification; 5758 5759 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5760} 5761 5762static u64 ept_rsvd_mask(u64 spte, int level) 5763{ 5764 int i; 5765 u64 mask = 0; 5766 5767 for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) 5768 mask |= (1ULL << i); 5769 5770 if (level == 4) 5771 /* bits 7:3 reserved */ 5772 mask |= 0xf8; 5773 else if (spte & (1ULL << 7)) 5774 /* 5775 * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, 5776 * level == 1 if the hypervisor is using the ignored bit 7. 5777 */ 5778 mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; 5779 else if (level > 1) 5780 /* bits 6:3 reserved */ 5781 mask |= 0x78; 5782 5783 return mask; 5784} 5785 5786static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, 5787 int level) 5788{ 5789 printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); 5790 5791 /* 010b (write-only) */ 5792 WARN_ON((spte & 0x7) == 0x2); 5793 5794 /* 110b (write/execute) */ 5795 WARN_ON((spte & 0x7) == 0x6); 5796 5797 /* 100b (execute-only) and value not supported by logical processor */ 5798 if (!cpu_has_vmx_ept_execute_only()) 5799 WARN_ON((spte & 0x7) == 0x4); 5800 5801 /* not 000b */ 5802 if ((spte & 0x7)) { 5803 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); 5804 5805 if (rsvd_bits != 0) { 5806 printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", 5807 __func__, rsvd_bits); 5808 WARN_ON(1); 5809 } 5810 5811 /* bits 5:3 are _not_ reserved for large page or leaf page */ 5812 if ((rsvd_bits & 0x38) == 0) { 5813 u64 ept_mem_type = (spte & 0x38) >> 3; 5814 5815 if (ept_mem_type == 2 || ept_mem_type == 3 || 5816 ept_mem_type == 7) { 5817 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", 5818 __func__, ept_mem_type); 5819 WARN_ON(1); 5820 } 5821 } 5822 } 5823} 5824 5825static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5826{ 5827 u64 sptes[4]; 5828 int nr_sptes, i, ret; 5829 gpa_t gpa; 5830 5831 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5832 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5833 skip_emulated_instruction(vcpu); 5834 return 1; 5835 } 5836 5837 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5838 if (likely(ret == RET_MMIO_PF_EMULATE)) 5839 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 5840 EMULATE_DONE; 5841 5842 if (unlikely(ret == RET_MMIO_PF_INVALID)) 5843 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); 5844 5845 if (unlikely(ret == RET_MMIO_PF_RETRY)) 5846 return 1; 5847 5848 /* It is the real ept misconfig */ 5849 printk(KERN_ERR "EPT: Misconfiguration.\n"); 5850 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); 5851 5852 nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); 5853 5854 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 5855 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 5856 5857 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 5858 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 5859 5860 return 0; 5861} 5862 5863static int handle_nmi_window(struct kvm_vcpu *vcpu) 5864{ 5865 u32 cpu_based_vm_exec_control; 5866 5867 /* clear pending NMI */ 5868 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5869 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 5870 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5871 ++vcpu->stat.nmi_window_exits; 5872 kvm_make_request(KVM_REQ_EVENT, vcpu); 5873 5874 return 1; 5875} 5876 5877static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5878{ 5879 struct vcpu_vmx *vmx = to_vmx(vcpu); 5880 enum emulation_result err = EMULATE_DONE; 5881 int ret = 1; 5882 u32 cpu_exec_ctrl; 5883 bool intr_window_requested; 5884 unsigned count = 130; 5885 5886 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5887 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5888 5889 while (vmx->emulation_required && count-- != 0) { 5890 if (intr_window_requested && vmx_interrupt_allowed(vcpu)) 5891 return handle_interrupt_window(&vmx->vcpu); 5892 5893 if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 5894 return 1; 5895 5896 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5897 5898 if (err == EMULATE_USER_EXIT) { 5899 ++vcpu->stat.mmio_exits; 5900 ret = 0; 5901 goto out; 5902 } 5903 5904 if (err != EMULATE_DONE) { 5905 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5906 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5907 vcpu->run->internal.ndata = 0; 5908 return 0; 5909 } 5910 5911 if (vcpu->arch.halt_request) { 5912 vcpu->arch.halt_request = 0; 5913 ret = kvm_vcpu_halt(vcpu); 5914 goto out; 5915 } 5916 5917 if (signal_pending(current)) 5918 goto out; 5919 if (need_resched()) 5920 schedule(); 5921 } 5922 5923out: 5924 return ret; 5925} 5926 5927static int __grow_ple_window(int val) 5928{ 5929 if (ple_window_grow < 1) 5930 return ple_window; 5931 5932 val = min(val, ple_window_actual_max); 5933 5934 if (ple_window_grow < ple_window) 5935 val *= ple_window_grow; 5936 else 5937 val += ple_window_grow; 5938 5939 return val; 5940} 5941 5942static int __shrink_ple_window(int val, int modifier, int minimum) 5943{ 5944 if (modifier < 1) 5945 return ple_window; 5946 5947 if (modifier < ple_window) 5948 val /= modifier; 5949 else 5950 val -= modifier; 5951 5952 return max(val, minimum); 5953} 5954 5955static void grow_ple_window(struct kvm_vcpu *vcpu) 5956{ 5957 struct vcpu_vmx *vmx = to_vmx(vcpu); 5958 int old = vmx->ple_window; 5959 5960 vmx->ple_window = __grow_ple_window(old); 5961 5962 if (vmx->ple_window != old) 5963 vmx->ple_window_dirty = true; 5964 5965 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); 5966} 5967 5968static void shrink_ple_window(struct kvm_vcpu *vcpu) 5969{ 5970 struct vcpu_vmx *vmx = to_vmx(vcpu); 5971 int old = vmx->ple_window; 5972 5973 vmx->ple_window = __shrink_ple_window(old, 5974 ple_window_shrink, ple_window); 5975 5976 if (vmx->ple_window != old) 5977 vmx->ple_window_dirty = true; 5978 5979 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); 5980} 5981 5982/* 5983 * ple_window_actual_max is computed to be one grow_ple_window() below 5984 * ple_window_max. (See __grow_ple_window for the reason.) 5985 * This prevents overflows, because ple_window_max is int. 5986 * ple_window_max effectively rounded down to a multiple of ple_window_grow in 5987 * this process. 5988 * ple_window_max is also prevented from setting vmx->ple_window < ple_window. 5989 */ 5990static void update_ple_window_actual_max(void) 5991{ 5992 ple_window_actual_max = 5993 __shrink_ple_window(max(ple_window_max, ple_window), 5994 ple_window_grow, INT_MIN); 5995} 5996 5997static __init int hardware_setup(void) 5998{ 5999 int r = -ENOMEM, i, msr; 6000 6001 rdmsrl_safe(MSR_EFER, &host_efer); 6002 6003 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) 6004 kvm_define_shared_msr(i, vmx_msr_index[i]); 6005 6006 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 6007 if (!vmx_io_bitmap_a) 6008 return r; 6009 6010 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 6011 if (!vmx_io_bitmap_b) 6012 goto out; 6013 6014 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 6015 if (!vmx_msr_bitmap_legacy) 6016 goto out1; 6017 6018 vmx_msr_bitmap_legacy_x2apic = 6019 (unsigned long *)__get_free_page(GFP_KERNEL); 6020 if (!vmx_msr_bitmap_legacy_x2apic) 6021 goto out2; 6022 6023 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 6024 if (!vmx_msr_bitmap_longmode) 6025 goto out3; 6026 6027 vmx_msr_bitmap_longmode_x2apic = 6028 (unsigned long *)__get_free_page(GFP_KERNEL); 6029 if (!vmx_msr_bitmap_longmode_x2apic) 6030 goto out4; 6031 6032 if (nested) { 6033 vmx_msr_bitmap_nested = 6034 (unsigned long *)__get_free_page(GFP_KERNEL); 6035 if (!vmx_msr_bitmap_nested) 6036 goto out5; 6037 } 6038 6039 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6040 if (!vmx_vmread_bitmap) 6041 goto out6; 6042 6043 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6044 if (!vmx_vmwrite_bitmap) 6045 goto out7; 6046 6047 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 6048 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 6049 6050 /* 6051 * Allow direct access to the PC debug port (it is often used for I/O 6052 * delays, but the vmexits simply slow things down). 6053 */ 6054 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); 6055 clear_bit(0x80, vmx_io_bitmap_a); 6056 6057 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); 6058 6059 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); 6060 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); 6061 if (nested) 6062 memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); 6063 6064 if (setup_vmcs_config(&vmcs_config) < 0) { 6065 r = -EIO; 6066 goto out8; 6067 } 6068 6069 if (boot_cpu_has(X86_FEATURE_NX)) 6070 kvm_enable_efer_bits(EFER_NX); 6071 6072 if (!cpu_has_vmx_vpid()) 6073 enable_vpid = 0; 6074 if (!cpu_has_vmx_shadow_vmcs()) 6075 enable_shadow_vmcs = 0; 6076 if (enable_shadow_vmcs) 6077 init_vmcs_shadow_fields(); 6078 6079 if (!cpu_has_vmx_ept() || 6080 !cpu_has_vmx_ept_4levels()) { 6081 enable_ept = 0; 6082 enable_unrestricted_guest = 0; 6083 enable_ept_ad_bits = 0; 6084 } 6085 6086 if (!cpu_has_vmx_ept_ad_bits()) 6087 enable_ept_ad_bits = 0; 6088 6089 if (!cpu_has_vmx_unrestricted_guest()) 6090 enable_unrestricted_guest = 0; 6091 6092 if (!cpu_has_vmx_flexpriority()) 6093 flexpriority_enabled = 0; 6094 6095 /* 6096 * set_apic_access_page_addr() is used to reload apic access 6097 * page upon invalidation. No need to do anything if not 6098 * using the APIC_ACCESS_ADDR VMCS field. 6099 */ 6100 if (!flexpriority_enabled) 6101 kvm_x86_ops->set_apic_access_page_addr = NULL; 6102 6103 if (!cpu_has_vmx_tpr_shadow()) 6104 kvm_x86_ops->update_cr8_intercept = NULL; 6105 6106 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 6107 kvm_disable_largepages(); 6108 6109 if (!cpu_has_vmx_ple()) 6110 ple_gap = 0; 6111 6112 if (!cpu_has_vmx_apicv()) 6113 enable_apicv = 0; 6114 6115 if (enable_apicv) 6116 kvm_x86_ops->update_cr8_intercept = NULL; 6117 else { 6118 kvm_x86_ops->hwapic_irr_update = NULL; 6119 kvm_x86_ops->hwapic_isr_update = NULL; 6120 kvm_x86_ops->deliver_posted_interrupt = NULL; 6121 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; 6122 } 6123 6124 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 6125 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 6126 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 6127 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 6128 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 6129 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 6130 vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); 6131 6132 memcpy(vmx_msr_bitmap_legacy_x2apic, 6133 vmx_msr_bitmap_legacy, PAGE_SIZE); 6134 memcpy(vmx_msr_bitmap_longmode_x2apic, 6135 vmx_msr_bitmap_longmode, PAGE_SIZE); 6136 6137 if (enable_apicv) { 6138 for (msr = 0x800; msr <= 0x8ff; msr++) 6139 vmx_disable_intercept_msr_read_x2apic(msr); 6140 6141 /* According SDM, in x2apic mode, the whole id reg is used. 6142 * But in KVM, it only use the highest eight bits. Need to 6143 * intercept it */ 6144 vmx_enable_intercept_msr_read_x2apic(0x802); 6145 /* TMCCT */ 6146 vmx_enable_intercept_msr_read_x2apic(0x839); 6147 /* TPR */ 6148 vmx_disable_intercept_msr_write_x2apic(0x808); 6149 /* EOI */ 6150 vmx_disable_intercept_msr_write_x2apic(0x80b); 6151 /* SELF-IPI */ 6152 vmx_disable_intercept_msr_write_x2apic(0x83f); 6153 } 6154 6155 if (enable_ept) { 6156 kvm_mmu_set_mask_ptes(0ull, 6157 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, 6158 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, 6159 0ull, VMX_EPT_EXECUTABLE_MASK); 6160 ept_set_mmio_spte_mask(); 6161 kvm_enable_tdp(); 6162 } else 6163 kvm_disable_tdp(); 6164 6165 update_ple_window_actual_max(); 6166 6167 /* 6168 * Only enable PML when hardware supports PML feature, and both EPT 6169 * and EPT A/D bit features are enabled -- PML depends on them to work. 6170 */ 6171 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 6172 enable_pml = 0; 6173 6174 if (!enable_pml) { 6175 kvm_x86_ops->slot_enable_log_dirty = NULL; 6176 kvm_x86_ops->slot_disable_log_dirty = NULL; 6177 kvm_x86_ops->flush_log_dirty = NULL; 6178 kvm_x86_ops->enable_log_dirty_pt_masked = NULL; 6179 } 6180 6181 return alloc_kvm_area(); 6182 6183out8: 6184 free_page((unsigned long)vmx_vmwrite_bitmap); 6185out7: 6186 free_page((unsigned long)vmx_vmread_bitmap); 6187out6: 6188 if (nested) 6189 free_page((unsigned long)vmx_msr_bitmap_nested); 6190out5: 6191 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6192out4: 6193 free_page((unsigned long)vmx_msr_bitmap_longmode); 6194out3: 6195 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 6196out2: 6197 free_page((unsigned long)vmx_msr_bitmap_legacy); 6198out1: 6199 free_page((unsigned long)vmx_io_bitmap_b); 6200out: 6201 free_page((unsigned long)vmx_io_bitmap_a); 6202 6203 return r; 6204} 6205 6206static __exit void hardware_unsetup(void) 6207{ 6208 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 6209 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6210 free_page((unsigned long)vmx_msr_bitmap_legacy); 6211 free_page((unsigned long)vmx_msr_bitmap_longmode); 6212 free_page((unsigned long)vmx_io_bitmap_b); 6213 free_page((unsigned long)vmx_io_bitmap_a); 6214 free_page((unsigned long)vmx_vmwrite_bitmap); 6215 free_page((unsigned long)vmx_vmread_bitmap); 6216 if (nested) 6217 free_page((unsigned long)vmx_msr_bitmap_nested); 6218 6219 free_kvm_area(); 6220} 6221 6222/* 6223 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 6224 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 6225 */ 6226static int handle_pause(struct kvm_vcpu *vcpu) 6227{ 6228 if (ple_gap) 6229 grow_ple_window(vcpu); 6230 6231 skip_emulated_instruction(vcpu); 6232 kvm_vcpu_on_spin(vcpu); 6233 6234 return 1; 6235} 6236 6237static int handle_nop(struct kvm_vcpu *vcpu) 6238{ 6239 skip_emulated_instruction(vcpu); 6240 return 1; 6241} 6242 6243static int handle_mwait(struct kvm_vcpu *vcpu) 6244{ 6245 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 6246 return handle_nop(vcpu); 6247} 6248 6249static int handle_monitor(struct kvm_vcpu *vcpu) 6250{ 6251 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 6252 return handle_nop(vcpu); 6253} 6254 6255/* 6256 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. 6257 * We could reuse a single VMCS for all the L2 guests, but we also want the 6258 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this 6259 * allows keeping them loaded on the processor, and in the future will allow 6260 * optimizations where prepare_vmcs02 doesn't need to set all the fields on 6261 * every entry if they never change. 6262 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE 6263 * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. 6264 * 6265 * The following functions allocate and free a vmcs02 in this pool. 6266 */ 6267 6268/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ 6269static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) 6270{ 6271 struct vmcs02_list *item; 6272 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 6273 if (item->vmptr == vmx->nested.current_vmptr) { 6274 list_move(&item->list, &vmx->nested.vmcs02_pool); 6275 return &item->vmcs02; 6276 } 6277 6278 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { 6279 /* Recycle the least recently used VMCS. */ 6280 item = list_entry(vmx->nested.vmcs02_pool.prev, 6281 struct vmcs02_list, list); 6282 item->vmptr = vmx->nested.current_vmptr; 6283 list_move(&item->list, &vmx->nested.vmcs02_pool); 6284 return &item->vmcs02; 6285 } 6286 6287 /* Create a new VMCS */ 6288 item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); 6289 if (!item) 6290 return NULL; 6291 item->vmcs02.vmcs = alloc_vmcs(); 6292 if (!item->vmcs02.vmcs) { 6293 kfree(item); 6294 return NULL; 6295 } 6296 loaded_vmcs_init(&item->vmcs02); 6297 item->vmptr = vmx->nested.current_vmptr; 6298 list_add(&(item->list), &(vmx->nested.vmcs02_pool)); 6299 vmx->nested.vmcs02_num++; 6300 return &item->vmcs02; 6301} 6302 6303/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ 6304static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) 6305{ 6306 struct vmcs02_list *item; 6307 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 6308 if (item->vmptr == vmptr) { 6309 free_loaded_vmcs(&item->vmcs02); 6310 list_del(&item->list); 6311 kfree(item); 6312 vmx->nested.vmcs02_num--; 6313 return; 6314 } 6315} 6316 6317/* 6318 * Free all VMCSs saved for this vcpu, except the one pointed by 6319 * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs 6320 * must be &vmx->vmcs01. 6321 */ 6322static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) 6323{ 6324 struct vmcs02_list *item, *n; 6325 6326 WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); 6327 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { 6328 /* 6329 * Something will leak if the above WARN triggers. Better than 6330 * a use-after-free. 6331 */ 6332 if (vmx->loaded_vmcs == &item->vmcs02) 6333 continue; 6334 6335 free_loaded_vmcs(&item->vmcs02); 6336 list_del(&item->list); 6337 kfree(item); 6338 vmx->nested.vmcs02_num--; 6339 } 6340} 6341 6342/* 6343 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 6344 * set the success or error code of an emulated VMX instruction, as specified 6345 * by Vol 2B, VMX Instruction Reference, "Conventions". 6346 */ 6347static void nested_vmx_succeed(struct kvm_vcpu *vcpu) 6348{ 6349 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 6350 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 6351 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 6352} 6353 6354static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 6355{ 6356 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 6357 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 6358 X86_EFLAGS_SF | X86_EFLAGS_OF)) 6359 | X86_EFLAGS_CF); 6360} 6361 6362static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 6363 u32 vm_instruction_error) 6364{ 6365 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { 6366 /* 6367 * failValid writes the error number to the current VMCS, which 6368 * can't be done there isn't a current VMCS. 6369 */ 6370 nested_vmx_failInvalid(vcpu); 6371 return; 6372 } 6373 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 6374 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 6375 X86_EFLAGS_SF | X86_EFLAGS_OF)) 6376 | X86_EFLAGS_ZF); 6377 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 6378 /* 6379 * We don't need to force a shadow sync because 6380 * VM_INSTRUCTION_ERROR is not shadowed 6381 */ 6382} 6383 6384static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 6385{ 6386 /* TODO: not to reset guest simply here. */ 6387 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 6388 pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); 6389} 6390 6391static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 6392{ 6393 struct vcpu_vmx *vmx = 6394 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 6395 6396 vmx->nested.preemption_timer_expired = true; 6397 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 6398 kvm_vcpu_kick(&vmx->vcpu); 6399 6400 return HRTIMER_NORESTART; 6401} 6402 6403/* 6404 * Decode the memory-address operand of a vmx instruction, as recorded on an 6405 * exit caused by such an instruction (run by a guest hypervisor). 6406 * On success, returns 0. When the operand is invalid, returns 1 and throws 6407 * #UD or #GP. 6408 */ 6409static int get_vmx_mem_address(struct kvm_vcpu *vcpu, 6410 unsigned long exit_qualification, 6411 u32 vmx_instruction_info, gva_t *ret) 6412{ 6413 /* 6414 * According to Vol. 3B, "Information for VM Exits Due to Instruction 6415 * Execution", on an exit, vmx_instruction_info holds most of the 6416 * addressing components of the operand. Only the displacement part 6417 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 6418 * For how an actual address is calculated from all these components, 6419 * refer to Vol. 1, "Operand Addressing". 6420 */ 6421 int scaling = vmx_instruction_info & 3; 6422 int addr_size = (vmx_instruction_info >> 7) & 7; 6423 bool is_reg = vmx_instruction_info & (1u << 10); 6424 int seg_reg = (vmx_instruction_info >> 15) & 7; 6425 int index_reg = (vmx_instruction_info >> 18) & 0xf; 6426 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 6427 int base_reg = (vmx_instruction_info >> 23) & 0xf; 6428 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 6429 6430 if (is_reg) { 6431 kvm_queue_exception(vcpu, UD_VECTOR); 6432 return 1; 6433 } 6434 6435 /* Addr = segment_base + offset */ 6436 /* offset = base + [index * scale] + displacement */ 6437 *ret = vmx_get_segment_base(vcpu, seg_reg); 6438 if (base_is_valid) 6439 *ret += kvm_register_read(vcpu, base_reg); 6440 if (index_is_valid) 6441 *ret += kvm_register_read(vcpu, index_reg)<<scaling; 6442 *ret += exit_qualification; /* holds the displacement */ 6443 6444 if (addr_size == 1) /* 32 bit */ 6445 *ret &= 0xffffffff; 6446 6447 /* 6448 * TODO: throw #GP (and return 1) in various cases that the VM* 6449 * instructions require it - e.g., offset beyond segment limit, 6450 * unusable or unreadable/unwritable segment, non-canonical 64-bit 6451 * address, and so on. Currently these are not checked. 6452 */ 6453 return 0; 6454} 6455 6456/* 6457 * This function performs the various checks including 6458 * - if it's 4KB aligned 6459 * - No bits beyond the physical address width are set 6460 * - Returns 0 on success or else 1 6461 * (Intel SDM Section 30.3) 6462 */ 6463static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, 6464 gpa_t *vmpointer) 6465{ 6466 gva_t gva; 6467 gpa_t vmptr; 6468 struct x86_exception e; 6469 struct page *page; 6470 struct vcpu_vmx *vmx = to_vmx(vcpu); 6471 int maxphyaddr = cpuid_maxphyaddr(vcpu); 6472 6473 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6474 vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) 6475 return 1; 6476 6477 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, 6478 sizeof(vmptr), &e)) { 6479 kvm_inject_page_fault(vcpu, &e); 6480 return 1; 6481 } 6482 6483 switch (exit_reason) { 6484 case EXIT_REASON_VMON: 6485 /* 6486 * SDM 3: 24.11.5 6487 * The first 4 bytes of VMXON region contain the supported 6488 * VMCS revision identifier 6489 * 6490 * Note - IA32_VMX_BASIC[48] will never be 1 6491 * for the nested case; 6492 * which replaces physical address width with 32 6493 * 6494 */ 6495 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6496 nested_vmx_failInvalid(vcpu); 6497 skip_emulated_instruction(vcpu); 6498 return 1; 6499 } 6500 6501 page = nested_get_page(vcpu, vmptr); 6502 if (page == NULL || 6503 *(u32 *)kmap(page) != VMCS12_REVISION) { 6504 nested_vmx_failInvalid(vcpu); 6505 kunmap(page); 6506 skip_emulated_instruction(vcpu); 6507 return 1; 6508 } 6509 kunmap(page); 6510 vmx->nested.vmxon_ptr = vmptr; 6511 break; 6512 case EXIT_REASON_VMCLEAR: 6513 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6514 nested_vmx_failValid(vcpu, 6515 VMXERR_VMCLEAR_INVALID_ADDRESS); 6516 skip_emulated_instruction(vcpu); 6517 return 1; 6518 } 6519 6520 if (vmptr == vmx->nested.vmxon_ptr) { 6521 nested_vmx_failValid(vcpu, 6522 VMXERR_VMCLEAR_VMXON_POINTER); 6523 skip_emulated_instruction(vcpu); 6524 return 1; 6525 } 6526 break; 6527 case EXIT_REASON_VMPTRLD: 6528 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6529 nested_vmx_failValid(vcpu, 6530 VMXERR_VMPTRLD_INVALID_ADDRESS); 6531 skip_emulated_instruction(vcpu); 6532 return 1; 6533 } 6534 6535 if (vmptr == vmx->nested.vmxon_ptr) { 6536 nested_vmx_failValid(vcpu, 6537 VMXERR_VMCLEAR_VMXON_POINTER); 6538 skip_emulated_instruction(vcpu); 6539 return 1; 6540 } 6541 break; 6542 default: 6543 return 1; /* shouldn't happen */ 6544 } 6545 6546 if (vmpointer) 6547 *vmpointer = vmptr; 6548 return 0; 6549} 6550 6551/* 6552 * Emulate the VMXON instruction. 6553 * Currently, we just remember that VMX is active, and do not save or even 6554 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 6555 * do not currently need to store anything in that guest-allocated memory 6556 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 6557 * argument is different from the VMXON pointer (which the spec says they do). 6558 */ 6559static int handle_vmon(struct kvm_vcpu *vcpu) 6560{ 6561 struct kvm_segment cs; 6562 struct vcpu_vmx *vmx = to_vmx(vcpu); 6563 struct vmcs *shadow_vmcs; 6564 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 6565 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 6566 6567 /* The Intel VMX Instruction Reference lists a bunch of bits that 6568 * are prerequisite to running VMXON, most notably cr4.VMXE must be 6569 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). 6570 * Otherwise, we should fail with #UD. We test these now: 6571 */ 6572 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || 6573 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || 6574 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 6575 kvm_queue_exception(vcpu, UD_VECTOR); 6576 return 1; 6577 } 6578 6579 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6580 if (is_long_mode(vcpu) && !cs.l) { 6581 kvm_queue_exception(vcpu, UD_VECTOR); 6582 return 1; 6583 } 6584 6585 if (vmx_get_cpl(vcpu)) { 6586 kvm_inject_gp(vcpu, 0); 6587 return 1; 6588 } 6589 6590 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) 6591 return 1; 6592 6593 if (vmx->nested.vmxon) { 6594 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 6595 skip_emulated_instruction(vcpu); 6596 return 1; 6597 } 6598 6599 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 6600 != VMXON_NEEDED_FEATURES) { 6601 kvm_inject_gp(vcpu, 0); 6602 return 1; 6603 } 6604 6605 if (enable_shadow_vmcs) { 6606 shadow_vmcs = alloc_vmcs(); 6607 if (!shadow_vmcs) 6608 return -ENOMEM; 6609 /* mark vmcs as shadow */ 6610 shadow_vmcs->revision_id |= (1u << 31); 6611 /* init shadow vmcs */ 6612 vmcs_clear(shadow_vmcs); 6613 vmx->nested.current_shadow_vmcs = shadow_vmcs; 6614 } 6615 6616 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); 6617 vmx->nested.vmcs02_num = 0; 6618 6619 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 6620 HRTIMER_MODE_REL); 6621 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 6622 6623 vmx->nested.vmxon = true; 6624 6625 skip_emulated_instruction(vcpu); 6626 nested_vmx_succeed(vcpu); 6627 return 1; 6628} 6629 6630/* 6631 * Intel's VMX Instruction Reference specifies a common set of prerequisites 6632 * for running VMX instructions (except VMXON, whose prerequisites are 6633 * slightly different). It also specifies what exception to inject otherwise. 6634 */ 6635static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 6636{ 6637 struct kvm_segment cs; 6638 struct vcpu_vmx *vmx = to_vmx(vcpu); 6639 6640 if (!vmx->nested.vmxon) { 6641 kvm_queue_exception(vcpu, UD_VECTOR); 6642 return 0; 6643 } 6644 6645 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6646 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || 6647 (is_long_mode(vcpu) && !cs.l)) { 6648 kvm_queue_exception(vcpu, UD_VECTOR); 6649 return 0; 6650 } 6651 6652 if (vmx_get_cpl(vcpu)) { 6653 kvm_inject_gp(vcpu, 0); 6654 return 0; 6655 } 6656 6657 return 1; 6658} 6659 6660static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) 6661{ 6662 u32 exec_control; 6663 if (vmx->nested.current_vmptr == -1ull) 6664 return; 6665 6666 /* current_vmptr and current_vmcs12 are always set/reset together */ 6667 if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) 6668 return; 6669 6670 if (enable_shadow_vmcs) { 6671 /* copy to memory all shadowed fields in case 6672 they were modified */ 6673 copy_shadow_to_vmcs12(vmx); 6674 vmx->nested.sync_shadow_vmcs = false; 6675 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6676 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 6677 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 6678 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6679 } 6680 vmx->nested.posted_intr_nv = -1; 6681 kunmap(vmx->nested.current_vmcs12_page); 6682 nested_release_page(vmx->nested.current_vmcs12_page); 6683 vmx->nested.current_vmptr = -1ull; 6684 vmx->nested.current_vmcs12 = NULL; 6685} 6686 6687/* 6688 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 6689 * just stops using VMX. 6690 */ 6691static void free_nested(struct vcpu_vmx *vmx) 6692{ 6693 if (!vmx->nested.vmxon) 6694 return; 6695 6696 vmx->nested.vmxon = false; 6697 nested_release_vmcs12(vmx); 6698 if (enable_shadow_vmcs) 6699 free_vmcs(vmx->nested.current_shadow_vmcs); 6700 /* Unpin physical memory we referred to in current vmcs02 */ 6701 if (vmx->nested.apic_access_page) { 6702 nested_release_page(vmx->nested.apic_access_page); 6703 vmx->nested.apic_access_page = NULL; 6704 } 6705 if (vmx->nested.virtual_apic_page) { 6706 nested_release_page(vmx->nested.virtual_apic_page); 6707 vmx->nested.virtual_apic_page = NULL; 6708 } 6709 if (vmx->nested.pi_desc_page) { 6710 kunmap(vmx->nested.pi_desc_page); 6711 nested_release_page(vmx->nested.pi_desc_page); 6712 vmx->nested.pi_desc_page = NULL; 6713 vmx->nested.pi_desc = NULL; 6714 } 6715 6716 nested_free_all_saved_vmcss(vmx); 6717} 6718 6719/* Emulate the VMXOFF instruction */ 6720static int handle_vmoff(struct kvm_vcpu *vcpu) 6721{ 6722 if (!nested_vmx_check_permission(vcpu)) 6723 return 1; 6724 free_nested(to_vmx(vcpu)); 6725 skip_emulated_instruction(vcpu); 6726 nested_vmx_succeed(vcpu); 6727 return 1; 6728} 6729 6730/* Emulate the VMCLEAR instruction */ 6731static int handle_vmclear(struct kvm_vcpu *vcpu) 6732{ 6733 struct vcpu_vmx *vmx = to_vmx(vcpu); 6734 gpa_t vmptr; 6735 struct vmcs12 *vmcs12; 6736 struct page *page; 6737 6738 if (!nested_vmx_check_permission(vcpu)) 6739 return 1; 6740 6741 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) 6742 return 1; 6743 6744 if (vmptr == vmx->nested.current_vmptr) 6745 nested_release_vmcs12(vmx); 6746 6747 page = nested_get_page(vcpu, vmptr); 6748 if (page == NULL) { 6749 /* 6750 * For accurate processor emulation, VMCLEAR beyond available 6751 * physical memory should do nothing at all. However, it is 6752 * possible that a nested vmx bug, not a guest hypervisor bug, 6753 * resulted in this case, so let's shut down before doing any 6754 * more damage: 6755 */ 6756 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 6757 return 1; 6758 } 6759 vmcs12 = kmap(page); 6760 vmcs12->launch_state = 0; 6761 kunmap(page); 6762 nested_release_page(page); 6763 6764 nested_free_vmcs02(vmx, vmptr); 6765 6766 skip_emulated_instruction(vcpu); 6767 nested_vmx_succeed(vcpu); 6768 return 1; 6769} 6770 6771static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 6772 6773/* Emulate the VMLAUNCH instruction */ 6774static int handle_vmlaunch(struct kvm_vcpu *vcpu) 6775{ 6776 return nested_vmx_run(vcpu, true); 6777} 6778 6779/* Emulate the VMRESUME instruction */ 6780static int handle_vmresume(struct kvm_vcpu *vcpu) 6781{ 6782 6783 return nested_vmx_run(vcpu, false); 6784} 6785 6786enum vmcs_field_type { 6787 VMCS_FIELD_TYPE_U16 = 0, 6788 VMCS_FIELD_TYPE_U64 = 1, 6789 VMCS_FIELD_TYPE_U32 = 2, 6790 VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 6791}; 6792 6793static inline int vmcs_field_type(unsigned long field) 6794{ 6795 if (0x1 & field) /* the *_HIGH fields are all 32 bit */ 6796 return VMCS_FIELD_TYPE_U32; 6797 return (field >> 13) & 0x3 ; 6798} 6799 6800static inline int vmcs_field_readonly(unsigned long field) 6801{ 6802 return (((field >> 10) & 0x3) == 1); 6803} 6804 6805/* 6806 * Read a vmcs12 field. Since these can have varying lengths and we return 6807 * one type, we chose the biggest type (u64) and zero-extend the return value 6808 * to that size. Note that the caller, handle_vmread, might need to use only 6809 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of 6810 * 64-bit fields are to be returned). 6811 */ 6812static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, 6813 unsigned long field, u64 *ret) 6814{ 6815 short offset = vmcs_field_to_offset(field); 6816 char *p; 6817 6818 if (offset < 0) 6819 return offset; 6820 6821 p = ((char *)(get_vmcs12(vcpu))) + offset; 6822 6823 switch (vmcs_field_type(field)) { 6824 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6825 *ret = *((natural_width *)p); 6826 return 0; 6827 case VMCS_FIELD_TYPE_U16: 6828 *ret = *((u16 *)p); 6829 return 0; 6830 case VMCS_FIELD_TYPE_U32: 6831 *ret = *((u32 *)p); 6832 return 0; 6833 case VMCS_FIELD_TYPE_U64: 6834 *ret = *((u64 *)p); 6835 return 0; 6836 default: 6837 WARN_ON(1); 6838 return -ENOENT; 6839 } 6840} 6841 6842 6843static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, 6844 unsigned long field, u64 field_value){ 6845 short offset = vmcs_field_to_offset(field); 6846 char *p = ((char *) get_vmcs12(vcpu)) + offset; 6847 if (offset < 0) 6848 return offset; 6849 6850 switch (vmcs_field_type(field)) { 6851 case VMCS_FIELD_TYPE_U16: 6852 *(u16 *)p = field_value; 6853 return 0; 6854 case VMCS_FIELD_TYPE_U32: 6855 *(u32 *)p = field_value; 6856 return 0; 6857 case VMCS_FIELD_TYPE_U64: 6858 *(u64 *)p = field_value; 6859 return 0; 6860 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6861 *(natural_width *)p = field_value; 6862 return 0; 6863 default: 6864 WARN_ON(1); 6865 return -ENOENT; 6866 } 6867 6868} 6869 6870static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 6871{ 6872 int i; 6873 unsigned long field; 6874 u64 field_value; 6875 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6876 const unsigned long *fields = shadow_read_write_fields; 6877 const int num_fields = max_shadow_read_write_fields; 6878 6879 preempt_disable(); 6880 6881 vmcs_load(shadow_vmcs); 6882 6883 for (i = 0; i < num_fields; i++) { 6884 field = fields[i]; 6885 switch (vmcs_field_type(field)) { 6886 case VMCS_FIELD_TYPE_U16: 6887 field_value = vmcs_read16(field); 6888 break; 6889 case VMCS_FIELD_TYPE_U32: 6890 field_value = vmcs_read32(field); 6891 break; 6892 case VMCS_FIELD_TYPE_U64: 6893 field_value = vmcs_read64(field); 6894 break; 6895 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6896 field_value = vmcs_readl(field); 6897 break; 6898 default: 6899 WARN_ON(1); 6900 continue; 6901 } 6902 vmcs12_write_any(&vmx->vcpu, field, field_value); 6903 } 6904 6905 vmcs_clear(shadow_vmcs); 6906 vmcs_load(vmx->loaded_vmcs->vmcs); 6907 6908 preempt_enable(); 6909} 6910 6911static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6912{ 6913 const unsigned long *fields[] = { 6914 shadow_read_write_fields, 6915 shadow_read_only_fields 6916 }; 6917 const int max_fields[] = { 6918 max_shadow_read_write_fields, 6919 max_shadow_read_only_fields 6920 }; 6921 int i, q; 6922 unsigned long field; 6923 u64 field_value = 0; 6924 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6925 6926 vmcs_load(shadow_vmcs); 6927 6928 for (q = 0; q < ARRAY_SIZE(fields); q++) { 6929 for (i = 0; i < max_fields[q]; i++) { 6930 field = fields[q][i]; 6931 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6932 6933 switch (vmcs_field_type(field)) { 6934 case VMCS_FIELD_TYPE_U16: 6935 vmcs_write16(field, (u16)field_value); 6936 break; 6937 case VMCS_FIELD_TYPE_U32: 6938 vmcs_write32(field, (u32)field_value); 6939 break; 6940 case VMCS_FIELD_TYPE_U64: 6941 vmcs_write64(field, (u64)field_value); 6942 break; 6943 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6944 vmcs_writel(field, (long)field_value); 6945 break; 6946 default: 6947 WARN_ON(1); 6948 break; 6949 } 6950 } 6951 } 6952 6953 vmcs_clear(shadow_vmcs); 6954 vmcs_load(vmx->loaded_vmcs->vmcs); 6955} 6956 6957/* 6958 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was 6959 * used before) all generate the same failure when it is missing. 6960 */ 6961static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) 6962{ 6963 struct vcpu_vmx *vmx = to_vmx(vcpu); 6964 if (vmx->nested.current_vmptr == -1ull) { 6965 nested_vmx_failInvalid(vcpu); 6966 skip_emulated_instruction(vcpu); 6967 return 0; 6968 } 6969 return 1; 6970} 6971 6972static int handle_vmread(struct kvm_vcpu *vcpu) 6973{ 6974 unsigned long field; 6975 u64 field_value; 6976 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6977 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6978 gva_t gva = 0; 6979 6980 if (!nested_vmx_check_permission(vcpu) || 6981 !nested_vmx_check_vmcs12(vcpu)) 6982 return 1; 6983 6984 /* Decode instruction info and find the field to read */ 6985 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6986 /* Read the field, zero-extended to a u64 field_value */ 6987 if (vmcs12_read_any(vcpu, field, &field_value) < 0) { 6988 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 6989 skip_emulated_instruction(vcpu); 6990 return 1; 6991 } 6992 /* 6993 * Now copy part of this value to register or memory, as requested. 6994 * Note that the number of bits actually copied is 32 or 64 depending 6995 * on the guest's mode (32 or 64 bit), not on the given field's length. 6996 */ 6997 if (vmx_instruction_info & (1u << 10)) { 6998 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 6999 field_value); 7000 } else { 7001 if (get_vmx_mem_address(vcpu, exit_qualification, 7002 vmx_instruction_info, &gva)) 7003 return 1; 7004 /* _system ok, as nested_vmx_check_permission verified cpl=0 */ 7005 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, 7006 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); 7007 } 7008 7009 nested_vmx_succeed(vcpu); 7010 skip_emulated_instruction(vcpu); 7011 return 1; 7012} 7013 7014 7015static int handle_vmwrite(struct kvm_vcpu *vcpu) 7016{ 7017 unsigned long field; 7018 gva_t gva; 7019 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7020 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7021 /* The value to write might be 32 or 64 bits, depending on L1's long 7022 * mode, and eventually we need to write that into a field of several 7023 * possible lengths. The code below first zero-extends the value to 64 7024 * bit (field_value), and then copies only the approriate number of 7025 * bits into the vmcs12 field. 7026 */ 7027 u64 field_value = 0; 7028 struct x86_exception e; 7029 7030 if (!nested_vmx_check_permission(vcpu) || 7031 !nested_vmx_check_vmcs12(vcpu)) 7032 return 1; 7033 7034 if (vmx_instruction_info & (1u << 10)) 7035 field_value = kvm_register_readl(vcpu, 7036 (((vmx_instruction_info) >> 3) & 0xf)); 7037 else { 7038 if (get_vmx_mem_address(vcpu, exit_qualification, 7039 vmx_instruction_info, &gva)) 7040 return 1; 7041 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, 7042 &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { 7043 kvm_inject_page_fault(vcpu, &e); 7044 return 1; 7045 } 7046 } 7047 7048 7049 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 7050 if (vmcs_field_readonly(field)) { 7051 nested_vmx_failValid(vcpu, 7052 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 7053 skip_emulated_instruction(vcpu); 7054 return 1; 7055 } 7056 7057 if (vmcs12_write_any(vcpu, field, field_value) < 0) { 7058 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 7059 skip_emulated_instruction(vcpu); 7060 return 1; 7061 } 7062 7063 nested_vmx_succeed(vcpu); 7064 skip_emulated_instruction(vcpu); 7065 return 1; 7066} 7067 7068/* Emulate the VMPTRLD instruction */ 7069static int handle_vmptrld(struct kvm_vcpu *vcpu) 7070{ 7071 struct vcpu_vmx *vmx = to_vmx(vcpu); 7072 gpa_t vmptr; 7073 u32 exec_control; 7074 7075 if (!nested_vmx_check_permission(vcpu)) 7076 return 1; 7077 7078 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) 7079 return 1; 7080 7081 if (vmx->nested.current_vmptr != vmptr) { 7082 struct vmcs12 *new_vmcs12; 7083 struct page *page; 7084 page = nested_get_page(vcpu, vmptr); 7085 if (page == NULL) { 7086 nested_vmx_failInvalid(vcpu); 7087 skip_emulated_instruction(vcpu); 7088 return 1; 7089 } 7090 new_vmcs12 = kmap(page); 7091 if (new_vmcs12->revision_id != VMCS12_REVISION) { 7092 kunmap(page); 7093 nested_release_page_clean(page); 7094 nested_vmx_failValid(vcpu, 7095 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 7096 skip_emulated_instruction(vcpu); 7097 return 1; 7098 } 7099 7100 nested_release_vmcs12(vmx); 7101 vmx->nested.current_vmptr = vmptr; 7102 vmx->nested.current_vmcs12 = new_vmcs12; 7103 vmx->nested.current_vmcs12_page = page; 7104 if (enable_shadow_vmcs) { 7105 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7106 exec_control |= SECONDARY_EXEC_SHADOW_VMCS; 7107 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 7108 vmcs_write64(VMCS_LINK_POINTER, 7109 __pa(vmx->nested.current_shadow_vmcs)); 7110 vmx->nested.sync_shadow_vmcs = true; 7111 } 7112 } 7113 7114 nested_vmx_succeed(vcpu); 7115 skip_emulated_instruction(vcpu); 7116 return 1; 7117} 7118 7119/* Emulate the VMPTRST instruction */ 7120static int handle_vmptrst(struct kvm_vcpu *vcpu) 7121{ 7122 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7123 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7124 gva_t vmcs_gva; 7125 struct x86_exception e; 7126 7127 if (!nested_vmx_check_permission(vcpu)) 7128 return 1; 7129 7130 if (get_vmx_mem_address(vcpu, exit_qualification, 7131 vmx_instruction_info, &vmcs_gva)) 7132 return 1; 7133 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ 7134 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, 7135 (void *)&to_vmx(vcpu)->nested.current_vmptr, 7136 sizeof(u64), &e)) { 7137 kvm_inject_page_fault(vcpu, &e); 7138 return 1; 7139 } 7140 nested_vmx_succeed(vcpu); 7141 skip_emulated_instruction(vcpu); 7142 return 1; 7143} 7144 7145/* Emulate the INVEPT instruction */ 7146static int handle_invept(struct kvm_vcpu *vcpu) 7147{ 7148 struct vcpu_vmx *vmx = to_vmx(vcpu); 7149 u32 vmx_instruction_info, types; 7150 unsigned long type; 7151 gva_t gva; 7152 struct x86_exception e; 7153 struct { 7154 u64 eptp, gpa; 7155 } operand; 7156 7157 if (!(vmx->nested.nested_vmx_secondary_ctls_high & 7158 SECONDARY_EXEC_ENABLE_EPT) || 7159 !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 7160 kvm_queue_exception(vcpu, UD_VECTOR); 7161 return 1; 7162 } 7163 7164 if (!nested_vmx_check_permission(vcpu)) 7165 return 1; 7166 7167 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { 7168 kvm_queue_exception(vcpu, UD_VECTOR); 7169 return 1; 7170 } 7171 7172 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7173 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 7174 7175 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 7176 7177 if (!(types & (1UL << type))) { 7178 nested_vmx_failValid(vcpu, 7179 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7180 return 1; 7181 } 7182 7183 /* According to the Intel VMX instruction reference, the memory 7184 * operand is read even if it isn't needed (e.g., for type==global) 7185 */ 7186 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 7187 vmx_instruction_info, &gva)) 7188 return 1; 7189 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, 7190 sizeof(operand), &e)) { 7191 kvm_inject_page_fault(vcpu, &e); 7192 return 1; 7193 } 7194 7195 switch (type) { 7196 case VMX_EPT_EXTENT_GLOBAL: 7197 kvm_mmu_sync_roots(vcpu); 7198 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 7199 nested_vmx_succeed(vcpu); 7200 break; 7201 default: 7202 /* Trap single context invalidation invept calls */ 7203 BUG_ON(1); 7204 break; 7205 } 7206 7207 skip_emulated_instruction(vcpu); 7208 return 1; 7209} 7210 7211static int handle_invvpid(struct kvm_vcpu *vcpu) 7212{ 7213 kvm_queue_exception(vcpu, UD_VECTOR); 7214 return 1; 7215} 7216 7217static int handle_pml_full(struct kvm_vcpu *vcpu) 7218{ 7219 unsigned long exit_qualification; 7220 7221 trace_kvm_pml_full(vcpu->vcpu_id); 7222 7223 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7224 7225 /* 7226 * PML buffer FULL happened while executing iret from NMI, 7227 * "blocked by NMI" bit has to be set before next VM entry. 7228 */ 7229 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 7230 cpu_has_virtual_nmis() && 7231 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 7232 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7233 GUEST_INTR_STATE_NMI); 7234 7235 /* 7236 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 7237 * here.., and there's no userspace involvement needed for PML. 7238 */ 7239 return 1; 7240} 7241 7242/* 7243 * The exit handlers return 1 if the exit was handled fully and guest execution 7244 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7245 * to be done to userspace and return 0. 7246 */ 7247static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 7248 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 7249 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 7250 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 7251 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 7252 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 7253 [EXIT_REASON_CR_ACCESS] = handle_cr, 7254 [EXIT_REASON_DR_ACCESS] = handle_dr, 7255 [EXIT_REASON_CPUID] = handle_cpuid, 7256 [EXIT_REASON_MSR_READ] = handle_rdmsr, 7257 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 7258 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 7259 [EXIT_REASON_HLT] = handle_halt, 7260 [EXIT_REASON_INVD] = handle_invd, 7261 [EXIT_REASON_INVLPG] = handle_invlpg, 7262 [EXIT_REASON_RDPMC] = handle_rdpmc, 7263 [EXIT_REASON_VMCALL] = handle_vmcall, 7264 [EXIT_REASON_VMCLEAR] = handle_vmclear, 7265 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, 7266 [EXIT_REASON_VMPTRLD] = handle_vmptrld, 7267 [EXIT_REASON_VMPTRST] = handle_vmptrst, 7268 [EXIT_REASON_VMREAD] = handle_vmread, 7269 [EXIT_REASON_VMRESUME] = handle_vmresume, 7270 [EXIT_REASON_VMWRITE] = handle_vmwrite, 7271 [EXIT_REASON_VMOFF] = handle_vmoff, 7272 [EXIT_REASON_VMON] = handle_vmon, 7273 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 7274 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 7275 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 7276 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 7277 [EXIT_REASON_WBINVD] = handle_wbinvd, 7278 [EXIT_REASON_XSETBV] = handle_xsetbv, 7279 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 7280 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 7281 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 7282 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 7283 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 7284 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 7285 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 7286 [EXIT_REASON_INVEPT] = handle_invept, 7287 [EXIT_REASON_INVVPID] = handle_invvpid, 7288 [EXIT_REASON_XSAVES] = handle_xsaves, 7289 [EXIT_REASON_XRSTORS] = handle_xrstors, 7290 [EXIT_REASON_PML_FULL] = handle_pml_full, 7291}; 7292 7293static const int kvm_vmx_max_exit_handlers = 7294 ARRAY_SIZE(kvm_vmx_exit_handlers); 7295 7296static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 7297 struct vmcs12 *vmcs12) 7298{ 7299 unsigned long exit_qualification; 7300 gpa_t bitmap, last_bitmap; 7301 unsigned int port; 7302 int size; 7303 u8 b; 7304 7305 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 7306 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 7307 7308 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7309 7310 port = exit_qualification >> 16; 7311 size = (exit_qualification & 7) + 1; 7312 7313 last_bitmap = (gpa_t)-1; 7314 b = -1; 7315 7316 while (size > 0) { 7317 if (port < 0x8000) 7318 bitmap = vmcs12->io_bitmap_a; 7319 else if (port < 0x10000) 7320 bitmap = vmcs12->io_bitmap_b; 7321 else 7322 return true; 7323 bitmap += (port & 0x7fff) / 8; 7324 7325 if (last_bitmap != bitmap) 7326 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 7327 return true; 7328 if (b & (1 << (port & 7))) 7329 return true; 7330 7331 port++; 7332 size--; 7333 last_bitmap = bitmap; 7334 } 7335 7336 return false; 7337} 7338 7339/* 7340 * Return 1 if we should exit from L2 to L1 to handle an MSR access access, 7341 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 7342 * disinterest in the current event (read or write a specific MSR) by using an 7343 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 7344 */ 7345static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 7346 struct vmcs12 *vmcs12, u32 exit_reason) 7347{ 7348 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 7349 gpa_t bitmap; 7350 7351 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 7352 return true; 7353 7354 /* 7355 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 7356 * for the four combinations of read/write and low/high MSR numbers. 7357 * First we need to figure out which of the four to use: 7358 */ 7359 bitmap = vmcs12->msr_bitmap; 7360 if (exit_reason == EXIT_REASON_MSR_WRITE) 7361 bitmap += 2048; 7362 if (msr_index >= 0xc0000000) { 7363 msr_index -= 0xc0000000; 7364 bitmap += 1024; 7365 } 7366 7367 /* Then read the msr_index'th bit from this bitmap: */ 7368 if (msr_index < 1024*8) { 7369 unsigned char b; 7370 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 7371 return true; 7372 return 1 & (b >> (msr_index & 7)); 7373 } else 7374 return true; /* let L1 handle the wrong parameter */ 7375} 7376 7377/* 7378 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 7379 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 7380 * intercept (via guest_host_mask etc.) the current event. 7381 */ 7382static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 7383 struct vmcs12 *vmcs12) 7384{ 7385 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7386 int cr = exit_qualification & 15; 7387 int reg = (exit_qualification >> 8) & 15; 7388 unsigned long val = kvm_register_readl(vcpu, reg); 7389 7390 switch ((exit_qualification >> 4) & 3) { 7391 case 0: /* mov to cr */ 7392 switch (cr) { 7393 case 0: 7394 if (vmcs12->cr0_guest_host_mask & 7395 (val ^ vmcs12->cr0_read_shadow)) 7396 return true; 7397 break; 7398 case 3: 7399 if ((vmcs12->cr3_target_count >= 1 && 7400 vmcs12->cr3_target_value0 == val) || 7401 (vmcs12->cr3_target_count >= 2 && 7402 vmcs12->cr3_target_value1 == val) || 7403 (vmcs12->cr3_target_count >= 3 && 7404 vmcs12->cr3_target_value2 == val) || 7405 (vmcs12->cr3_target_count >= 4 && 7406 vmcs12->cr3_target_value3 == val)) 7407 return false; 7408 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 7409 return true; 7410 break; 7411 case 4: 7412 if (vmcs12->cr4_guest_host_mask & 7413 (vmcs12->cr4_read_shadow ^ val)) 7414 return true; 7415 break; 7416 case 8: 7417 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 7418 return true; 7419 break; 7420 } 7421 break; 7422 case 2: /* clts */ 7423 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 7424 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 7425 return true; 7426 break; 7427 case 1: /* mov from cr */ 7428 switch (cr) { 7429 case 3: 7430 if (vmcs12->cpu_based_vm_exec_control & 7431 CPU_BASED_CR3_STORE_EXITING) 7432 return true; 7433 break; 7434 case 8: 7435 if (vmcs12->cpu_based_vm_exec_control & 7436 CPU_BASED_CR8_STORE_EXITING) 7437 return true; 7438 break; 7439 } 7440 break; 7441 case 3: /* lmsw */ 7442 /* 7443 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 7444 * cr0. Other attempted changes are ignored, with no exit. 7445 */ 7446 if (vmcs12->cr0_guest_host_mask & 0xe & 7447 (val ^ vmcs12->cr0_read_shadow)) 7448 return true; 7449 if ((vmcs12->cr0_guest_host_mask & 0x1) && 7450 !(vmcs12->cr0_read_shadow & 0x1) && 7451 (val & 0x1)) 7452 return true; 7453 break; 7454 } 7455 return false; 7456} 7457 7458/* 7459 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 7460 * should handle it ourselves in L0 (and then continue L2). Only call this 7461 * when in is_guest_mode (L2). 7462 */ 7463static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) 7464{ 7465 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7466 struct vcpu_vmx *vmx = to_vmx(vcpu); 7467 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7468 u32 exit_reason = vmx->exit_reason; 7469 7470 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, 7471 vmcs_readl(EXIT_QUALIFICATION), 7472 vmx->idt_vectoring_info, 7473 intr_info, 7474 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 7475 KVM_ISA_VMX); 7476 7477 if (vmx->nested.nested_run_pending) 7478 return false; 7479 7480 if (unlikely(vmx->fail)) { 7481 pr_info_ratelimited("%s failed vm entry %x\n", __func__, 7482 vmcs_read32(VM_INSTRUCTION_ERROR)); 7483 return true; 7484 } 7485 7486 switch (exit_reason) { 7487 case EXIT_REASON_EXCEPTION_NMI: 7488 if (!is_exception(intr_info)) 7489 return false; 7490 else if (is_page_fault(intr_info)) 7491 return enable_ept; 7492 else if (is_no_device(intr_info) && 7493 !(vmcs12->guest_cr0 & X86_CR0_TS)) 7494 return false; 7495 return vmcs12->exception_bitmap & 7496 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 7497 case EXIT_REASON_EXTERNAL_INTERRUPT: 7498 return false; 7499 case EXIT_REASON_TRIPLE_FAULT: 7500 return true; 7501 case EXIT_REASON_PENDING_INTERRUPT: 7502 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 7503 case EXIT_REASON_NMI_WINDOW: 7504 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 7505 case EXIT_REASON_TASK_SWITCH: 7506 return true; 7507 case EXIT_REASON_CPUID: 7508 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) 7509 return false; 7510 return true; 7511 case EXIT_REASON_HLT: 7512 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 7513 case EXIT_REASON_INVD: 7514 return true; 7515 case EXIT_REASON_INVLPG: 7516 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 7517 case EXIT_REASON_RDPMC: 7518 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 7519 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 7520 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 7521 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 7522 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 7523 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 7524 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 7525 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 7526 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 7527 /* 7528 * VMX instructions trap unconditionally. This allows L1 to 7529 * emulate them for its L2 guest, i.e., allows 3-level nesting! 7530 */ 7531 return true; 7532 case EXIT_REASON_CR_ACCESS: 7533 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 7534 case EXIT_REASON_DR_ACCESS: 7535 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 7536 case EXIT_REASON_IO_INSTRUCTION: 7537 return nested_vmx_exit_handled_io(vcpu, vmcs12); 7538 case EXIT_REASON_MSR_READ: 7539 case EXIT_REASON_MSR_WRITE: 7540 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 7541 case EXIT_REASON_INVALID_STATE: 7542 return true; 7543 case EXIT_REASON_MWAIT_INSTRUCTION: 7544 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 7545 case EXIT_REASON_MONITOR_INSTRUCTION: 7546 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 7547 case EXIT_REASON_PAUSE_INSTRUCTION: 7548 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 7549 nested_cpu_has2(vmcs12, 7550 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 7551 case EXIT_REASON_MCE_DURING_VMENTRY: 7552 return false; 7553 case EXIT_REASON_TPR_BELOW_THRESHOLD: 7554 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 7555 case EXIT_REASON_APIC_ACCESS: 7556 return nested_cpu_has2(vmcs12, 7557 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 7558 case EXIT_REASON_APIC_WRITE: 7559 case EXIT_REASON_EOI_INDUCED: 7560 /* apic_write and eoi_induced should exit unconditionally. */ 7561 return true; 7562 case EXIT_REASON_EPT_VIOLATION: 7563 /* 7564 * L0 always deals with the EPT violation. If nested EPT is 7565 * used, and the nested mmu code discovers that the address is 7566 * missing in the guest EPT table (EPT12), the EPT violation 7567 * will be injected with nested_ept_inject_page_fault() 7568 */ 7569 return false; 7570 case EXIT_REASON_EPT_MISCONFIG: 7571 /* 7572 * L2 never uses directly L1's EPT, but rather L0's own EPT 7573 * table (shadow on EPT) or a merged EPT table that L0 built 7574 * (EPT on EPT). So any problems with the structure of the 7575 * table is L0's fault. 7576 */ 7577 return false; 7578 case EXIT_REASON_WBINVD: 7579 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 7580 case EXIT_REASON_XSETBV: 7581 return true; 7582 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 7583 /* 7584 * This should never happen, since it is not possible to 7585 * set XSS to a non-zero value---neither in L1 nor in L2. 7586 * If if it were, XSS would have to be checked against 7587 * the XSS exit bitmap in vmcs12. 7588 */ 7589 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 7590 default: 7591 return true; 7592 } 7593} 7594 7595static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 7596{ 7597 *info1 = vmcs_readl(EXIT_QUALIFICATION); 7598 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7599} 7600 7601static int vmx_enable_pml(struct vcpu_vmx *vmx) 7602{ 7603 struct page *pml_pg; 7604 u32 exec_control; 7605 7606 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); 7607 if (!pml_pg) 7608 return -ENOMEM; 7609 7610 vmx->pml_pg = pml_pg; 7611 7612 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 7613 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 7614 7615 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7616 exec_control |= SECONDARY_EXEC_ENABLE_PML; 7617 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 7618 7619 return 0; 7620} 7621 7622static void vmx_disable_pml(struct vcpu_vmx *vmx) 7623{ 7624 u32 exec_control; 7625 7626 ASSERT(vmx->pml_pg); 7627 __free_page(vmx->pml_pg); 7628 vmx->pml_pg = NULL; 7629 7630 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7631 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 7632 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 7633} 7634 7635static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 7636{ 7637 struct vcpu_vmx *vmx = to_vmx(vcpu); 7638 u64 *pml_buf; 7639 u16 pml_idx; 7640 7641 pml_idx = vmcs_read16(GUEST_PML_INDEX); 7642 7643 /* Do nothing if PML buffer is empty */ 7644 if (pml_idx == (PML_ENTITY_NUM - 1)) 7645 return; 7646 7647 /* PML index always points to next available PML buffer entity */ 7648 if (pml_idx >= PML_ENTITY_NUM) 7649 pml_idx = 0; 7650 else 7651 pml_idx++; 7652 7653 pml_buf = page_address(vmx->pml_pg); 7654 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 7655 u64 gpa; 7656 7657 gpa = pml_buf[pml_idx]; 7658 WARN_ON(gpa & (PAGE_SIZE - 1)); 7659 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 7660 } 7661 7662 /* reset PML index */ 7663 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 7664} 7665 7666/* 7667 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. 7668 * Called before reporting dirty_bitmap to userspace. 7669 */ 7670static void kvm_flush_pml_buffers(struct kvm *kvm) 7671{ 7672 int i; 7673 struct kvm_vcpu *vcpu; 7674 /* 7675 * We only need to kick vcpu out of guest mode here, as PML buffer 7676 * is flushed at beginning of all VMEXITs, and it's obvious that only 7677 * vcpus running in guest are possible to have unflushed GPAs in PML 7678 * buffer. 7679 */ 7680 kvm_for_each_vcpu(i, vcpu, kvm) 7681 kvm_vcpu_kick(vcpu); 7682} 7683 7684static void vmx_dump_sel(char *name, uint32_t sel) 7685{ 7686 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 7687 name, vmcs_read32(sel), 7688 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 7689 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 7690 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 7691} 7692 7693static void vmx_dump_dtsel(char *name, uint32_t limit) 7694{ 7695 pr_err("%s limit=0x%08x, base=0x%016lx\n", 7696 name, vmcs_read32(limit), 7697 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 7698} 7699 7700static void dump_vmcs(void) 7701{ 7702 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 7703 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 7704 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 7705 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 7706 u32 secondary_exec_control = 0; 7707 unsigned long cr4 = vmcs_readl(GUEST_CR4); 7708 u64 efer = vmcs_readl(GUEST_IA32_EFER); 7709 int i, n; 7710 7711 if (cpu_has_secondary_exec_ctrls()) 7712 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7713 7714 pr_err("*** Guest State ***\n"); 7715 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 7716 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 7717 vmcs_readl(CR0_GUEST_HOST_MASK)); 7718 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 7719 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 7720 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 7721 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 7722 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) 7723 { 7724 pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n", 7725 vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1)); 7726 pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n", 7727 vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3)); 7728 } 7729 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 7730 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 7731 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 7732 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 7733 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 7734 vmcs_readl(GUEST_SYSENTER_ESP), 7735 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 7736 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 7737 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 7738 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 7739 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 7740 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 7741 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 7742 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 7743 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 7744 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 7745 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 7746 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || 7747 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) 7748 pr_err("EFER = 0x%016llx PAT = 0x%016lx\n", 7749 efer, vmcs_readl(GUEST_IA32_PAT)); 7750 pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n", 7751 vmcs_readl(GUEST_IA32_DEBUGCTL), 7752 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 7753 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 7754 pr_err("PerfGlobCtl = 0x%016lx\n", 7755 vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL)); 7756 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 7757 pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS)); 7758 pr_err("Interruptibility = %08x ActivityState = %08x\n", 7759 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 7760 vmcs_read32(GUEST_ACTIVITY_STATE)); 7761 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 7762 pr_err("InterruptStatus = %04x\n", 7763 vmcs_read16(GUEST_INTR_STATUS)); 7764 7765 pr_err("*** Host State ***\n"); 7766 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 7767 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 7768 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 7769 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 7770 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 7771 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 7772 vmcs_read16(HOST_TR_SELECTOR)); 7773 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 7774 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 7775 vmcs_readl(HOST_TR_BASE)); 7776 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 7777 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 7778 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 7779 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 7780 vmcs_readl(HOST_CR4)); 7781 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 7782 vmcs_readl(HOST_IA32_SYSENTER_ESP), 7783 vmcs_read32(HOST_IA32_SYSENTER_CS), 7784 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 7785 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) 7786 pr_err("EFER = 0x%016lx PAT = 0x%016lx\n", 7787 vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT)); 7788 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 7789 pr_err("PerfGlobCtl = 0x%016lx\n", 7790 vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL)); 7791 7792 pr_err("*** Control State ***\n"); 7793 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", 7794 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); 7795 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); 7796 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 7797 vmcs_read32(EXCEPTION_BITMAP), 7798 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 7799 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 7800 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 7801 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7802 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 7803 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 7804 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 7805 vmcs_read32(VM_EXIT_INTR_INFO), 7806 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 7807 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 7808 pr_err(" reason=%08x qualification=%016lx\n", 7809 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 7810 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 7811 vmcs_read32(IDT_VECTORING_INFO_FIELD), 7812 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 7813 pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); 7814 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) 7815 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 7816 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 7817 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 7818 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 7819 pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER)); 7820 n = vmcs_read32(CR3_TARGET_COUNT); 7821 for (i = 0; i + 1 < n; i += 4) 7822 pr_err("CR3 target%u=%016lx target%u=%016lx\n", 7823 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), 7824 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); 7825 if (i < n) 7826 pr_err("CR3 target%u=%016lx\n", 7827 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); 7828 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 7829 pr_err("PLE Gap=%08x Window=%08x\n", 7830 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 7831 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 7832 pr_err("Virtual processor ID = 0x%04x\n", 7833 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 7834} 7835 7836/* 7837 * The guest has exited. See if we can fix it or if we need userspace 7838 * assistance. 7839 */ 7840static int vmx_handle_exit(struct kvm_vcpu *vcpu) 7841{ 7842 struct vcpu_vmx *vmx = to_vmx(vcpu); 7843 u32 exit_reason = vmx->exit_reason; 7844 u32 vectoring_info = vmx->idt_vectoring_info; 7845 7846 /* 7847 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 7848 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 7849 * querying dirty_bitmap, we only need to kick all vcpus out of guest 7850 * mode as if vcpus is in root mode, the PML buffer must has been 7851 * flushed already. 7852 */ 7853 if (enable_pml) 7854 vmx_flush_pml_buffer(vcpu); 7855 7856 /* If guest state is invalid, start emulating */ 7857 if (vmx->emulation_required) 7858 return handle_invalid_guest_state(vcpu); 7859 7860 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { 7861 nested_vmx_vmexit(vcpu, exit_reason, 7862 vmcs_read32(VM_EXIT_INTR_INFO), 7863 vmcs_readl(EXIT_QUALIFICATION)); 7864 return 1; 7865 } 7866 7867 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 7868 dump_vmcs(); 7869 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 7870 vcpu->run->fail_entry.hardware_entry_failure_reason 7871 = exit_reason; 7872 return 0; 7873 } 7874 7875 if (unlikely(vmx->fail)) { 7876 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 7877 vcpu->run->fail_entry.hardware_entry_failure_reason 7878 = vmcs_read32(VM_INSTRUCTION_ERROR); 7879 return 0; 7880 } 7881 7882 /* 7883 * Note: 7884 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by 7885 * delivery event since it indicates guest is accessing MMIO. 7886 * The vm-exit can be triggered again after return to guest that 7887 * will cause infinite loop. 7888 */ 7889 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 7890 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 7891 exit_reason != EXIT_REASON_EPT_VIOLATION && 7892 exit_reason != EXIT_REASON_TASK_SWITCH)) { 7893 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 7894 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 7895 vcpu->run->internal.ndata = 2; 7896 vcpu->run->internal.data[0] = vectoring_info; 7897 vcpu->run->internal.data[1] = exit_reason; 7898 return 0; 7899 } 7900 7901 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 7902 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 7903 get_vmcs12(vcpu))))) { 7904 if (vmx_interrupt_allowed(vcpu)) { 7905 vmx->soft_vnmi_blocked = 0; 7906 } else if (vmx->vnmi_blocked_time > 1000000000LL && 7907 vcpu->arch.nmi_pending) { 7908 /* 7909 * This CPU don't support us in finding the end of an 7910 * NMI-blocked window if the guest runs with IRQs 7911 * disabled. So we pull the trigger after 1 s of 7912 * futile waiting, but inform the user about this. 7913 */ 7914 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 7915 "state on VCPU %d after 1 s timeout\n", 7916 __func__, vcpu->vcpu_id); 7917 vmx->soft_vnmi_blocked = 0; 7918 } 7919 } 7920 7921 if (exit_reason < kvm_vmx_max_exit_handlers 7922 && kvm_vmx_exit_handlers[exit_reason]) 7923 return kvm_vmx_exit_handlers[exit_reason](vcpu); 7924 else { 7925 WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); 7926 kvm_queue_exception(vcpu, UD_VECTOR); 7927 return 1; 7928 } 7929} 7930 7931static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 7932{ 7933 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7934 7935 if (is_guest_mode(vcpu) && 7936 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 7937 return; 7938 7939 if (irr == -1 || tpr < irr) { 7940 vmcs_write32(TPR_THRESHOLD, 0); 7941 return; 7942 } 7943 7944 vmcs_write32(TPR_THRESHOLD, irr); 7945} 7946 7947static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) 7948{ 7949 u32 sec_exec_control; 7950 7951 /* 7952 * There is not point to enable virtualize x2apic without enable 7953 * apicv 7954 */ 7955 if (!cpu_has_vmx_virtualize_x2apic_mode() || 7956 !vmx_vm_has_apicv(vcpu->kvm)) 7957 return; 7958 7959 if (!vm_need_tpr_shadow(vcpu->kvm)) 7960 return; 7961 7962 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7963 7964 if (set) { 7965 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7966 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 7967 } else { 7968 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 7969 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7970 } 7971 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); 7972 7973 vmx_set_msr_bitmap(vcpu); 7974} 7975 7976static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) 7977{ 7978 struct vcpu_vmx *vmx = to_vmx(vcpu); 7979 7980 /* 7981 * Currently we do not handle the nested case where L2 has an 7982 * APIC access page of its own; that page is still pinned. 7983 * Hence, we skip the case where the VCPU is in guest mode _and_ 7984 * L1 prepared an APIC access page for L2. 7985 * 7986 * For the case where L1 and L2 share the same APIC access page 7987 * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear 7988 * in the vmcs12), this function will only update either the vmcs01 7989 * or the vmcs02. If the former, the vmcs02 will be updated by 7990 * prepare_vmcs02. If the latter, the vmcs01 will be updated in 7991 * the next L2->L1 exit. 7992 */ 7993 if (!is_guest_mode(vcpu) || 7994 !nested_cpu_has2(vmx->nested.current_vmcs12, 7995 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 7996 vmcs_write64(APIC_ACCESS_ADDR, hpa); 7997} 7998 7999static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) 8000{ 8001 u16 status; 8002 u8 old; 8003 8004 if (isr == -1) 8005 isr = 0; 8006 8007 status = vmcs_read16(GUEST_INTR_STATUS); 8008 old = status >> 8; 8009 if (isr != old) { 8010 status &= 0xff; 8011 status |= isr << 8; 8012 vmcs_write16(GUEST_INTR_STATUS, status); 8013 } 8014} 8015 8016static void vmx_set_rvi(int vector) 8017{ 8018 u16 status; 8019 u8 old; 8020 8021 if (vector == -1) 8022 vector = 0; 8023 8024 status = vmcs_read16(GUEST_INTR_STATUS); 8025 old = (u8)status & 0xff; 8026 if ((u8)vector != old) { 8027 status &= ~0xff; 8028 status |= (u8)vector; 8029 vmcs_write16(GUEST_INTR_STATUS, status); 8030 } 8031} 8032 8033static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 8034{ 8035 if (!is_guest_mode(vcpu)) { 8036 vmx_set_rvi(max_irr); 8037 return; 8038 } 8039 8040 if (max_irr == -1) 8041 return; 8042 8043 /* 8044 * In guest mode. If a vmexit is needed, vmx_check_nested_events 8045 * handles it. 8046 */ 8047 if (nested_exit_on_intr(vcpu)) 8048 return; 8049 8050 /* 8051 * Else, fall back to pre-APICv interrupt injection since L2 8052 * is run without virtual interrupt delivery. 8053 */ 8054 if (!kvm_event_needs_reinjection(vcpu) && 8055 vmx_interrupt_allowed(vcpu)) { 8056 kvm_queue_interrupt(vcpu, max_irr, false); 8057 vmx_inject_irq(vcpu); 8058 } 8059} 8060 8061static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 8062{ 8063 if (!vmx_vm_has_apicv(vcpu->kvm)) 8064 return; 8065 8066 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 8067 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 8068 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 8069 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 8070} 8071 8072static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 8073{ 8074 u32 exit_intr_info; 8075 8076 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY 8077 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) 8078 return; 8079 8080 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8081 exit_intr_info = vmx->exit_intr_info; 8082 8083 /* Handle machine checks before interrupts are enabled */ 8084 if (is_machine_check(exit_intr_info)) 8085 kvm_machine_check(); 8086 8087 /* We need to handle NMIs before interrupts are enabled */ 8088 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 8089 (exit_intr_info & INTR_INFO_VALID_MASK)) { 8090 kvm_before_handle_nmi(&vmx->vcpu); 8091 asm("int $2"); 8092 kvm_after_handle_nmi(&vmx->vcpu); 8093 } 8094} 8095 8096static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) 8097{ 8098 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8099 8100 /* 8101 * If external interrupt exists, IF bit is set in rflags/eflags on the 8102 * interrupt stack frame, and interrupt will be enabled on a return 8103 * from interrupt handler. 8104 */ 8105 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) 8106 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { 8107 unsigned int vector; 8108 unsigned long entry; 8109 gate_desc *desc; 8110 struct vcpu_vmx *vmx = to_vmx(vcpu); 8111#ifdef CONFIG_X86_64 8112 unsigned long tmp; 8113#endif 8114 8115 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 8116 desc = (gate_desc *)vmx->host_idt_base + vector; 8117 entry = gate_offset(*desc); 8118 asm volatile( 8119#ifdef CONFIG_X86_64 8120 "mov %%" _ASM_SP ", %[sp]\n\t" 8121 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" 8122 "push $%c[ss]\n\t" 8123 "push %[sp]\n\t" 8124#endif 8125 "pushf\n\t" 8126 "orl $0x200, (%%" _ASM_SP ")\n\t" 8127 __ASM_SIZE(push) " $%c[cs]\n\t" 8128 "call *%[entry]\n\t" 8129 : 8130#ifdef CONFIG_X86_64 8131 [sp]"=&r"(tmp) 8132#endif 8133 : 8134 [entry]"r"(entry), 8135 [ss]"i"(__KERNEL_DS), 8136 [cs]"i"(__KERNEL_CS) 8137 ); 8138 } else 8139 local_irq_enable(); 8140} 8141 8142static bool vmx_has_high_real_mode_segbase(void) 8143{ 8144 return enable_unrestricted_guest || emulate_invalid_guest_state; 8145} 8146 8147static bool vmx_mpx_supported(void) 8148{ 8149 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && 8150 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); 8151} 8152 8153static bool vmx_xsaves_supported(void) 8154{ 8155 return vmcs_config.cpu_based_2nd_exec_ctrl & 8156 SECONDARY_EXEC_XSAVES; 8157} 8158 8159static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 8160{ 8161 u32 exit_intr_info; 8162 bool unblock_nmi; 8163 u8 vector; 8164 bool idtv_info_valid; 8165 8166 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 8167 8168 if (cpu_has_virtual_nmis()) { 8169 if (vmx->nmi_known_unmasked) 8170 return; 8171 /* 8172 * Can't use vmx->exit_intr_info since we're not sure what 8173 * the exit reason is. 8174 */ 8175 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8176 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 8177 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 8178 /* 8179 * SDM 3: 27.7.1.2 (September 2008) 8180 * Re-set bit "block by NMI" before VM entry if vmexit caused by 8181 * a guest IRET fault. 8182 * SDM 3: 23.2.2 (September 2008) 8183 * Bit 12 is undefined in any of the following cases: 8184 * If the VM exit sets the valid bit in the IDT-vectoring 8185 * information field. 8186 * If the VM exit is due to a double fault. 8187 */ 8188 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 8189 vector != DF_VECTOR && !idtv_info_valid) 8190 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 8191 GUEST_INTR_STATE_NMI); 8192 else 8193 vmx->nmi_known_unmasked = 8194 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 8195 & GUEST_INTR_STATE_NMI); 8196 } else if (unlikely(vmx->soft_vnmi_blocked)) 8197 vmx->vnmi_blocked_time += 8198 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 8199} 8200 8201static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 8202 u32 idt_vectoring_info, 8203 int instr_len_field, 8204 int error_code_field) 8205{ 8206 u8 vector; 8207 int type; 8208 bool idtv_info_valid; 8209 8210 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 8211 8212 vcpu->arch.nmi_injected = false; 8213 kvm_clear_exception_queue(vcpu); 8214 kvm_clear_interrupt_queue(vcpu); 8215 8216 if (!idtv_info_valid) 8217 return; 8218 8219 kvm_make_request(KVM_REQ_EVENT, vcpu); 8220 8221 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 8222 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 8223 8224 switch (type) { 8225 case INTR_TYPE_NMI_INTR: 8226 vcpu->arch.nmi_injected = true; 8227 /* 8228 * SDM 3: 27.7.1.2 (September 2008) 8229 * Clear bit "block by NMI" before VM entry if a NMI 8230 * delivery faulted. 8231 */ 8232 vmx_set_nmi_mask(vcpu, false); 8233 break; 8234 case INTR_TYPE_SOFT_EXCEPTION: 8235 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 8236 /* fall through */ 8237 case INTR_TYPE_HARD_EXCEPTION: 8238 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 8239 u32 err = vmcs_read32(error_code_field); 8240 kvm_requeue_exception_e(vcpu, vector, err); 8241 } else 8242 kvm_requeue_exception(vcpu, vector); 8243 break; 8244 case INTR_TYPE_SOFT_INTR: 8245 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 8246 /* fall through */ 8247 case INTR_TYPE_EXT_INTR: 8248 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 8249 break; 8250 default: 8251 break; 8252 } 8253} 8254 8255static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 8256{ 8257 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 8258 VM_EXIT_INSTRUCTION_LEN, 8259 IDT_VECTORING_ERROR_CODE); 8260} 8261 8262static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 8263{ 8264 __vmx_complete_interrupts(vcpu, 8265 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 8266 VM_ENTRY_INSTRUCTION_LEN, 8267 VM_ENTRY_EXCEPTION_ERROR_CODE); 8268 8269 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 8270} 8271 8272static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 8273{ 8274 int i, nr_msrs; 8275 struct perf_guest_switch_msr *msrs; 8276 8277 msrs = perf_guest_get_msrs(&nr_msrs); 8278 8279 if (!msrs) 8280 return; 8281 8282 for (i = 0; i < nr_msrs; i++) 8283 if (msrs[i].host == msrs[i].guest) 8284 clear_atomic_switch_msr(vmx, msrs[i].msr); 8285 else 8286 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 8287 msrs[i].host); 8288} 8289 8290static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 8291{ 8292 struct vcpu_vmx *vmx = to_vmx(vcpu); 8293 unsigned long debugctlmsr, cr4; 8294 8295 /* Record the guest's net vcpu time for enforced NMI injections. */ 8296 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 8297 vmx->entry_time = ktime_get(); 8298 8299 /* Don't enter VMX if guest state is invalid, let the exit handler 8300 start emulation until we arrive back to a valid state */ 8301 if (vmx->emulation_required) 8302 return; 8303 8304 if (vmx->ple_window_dirty) { 8305 vmx->ple_window_dirty = false; 8306 vmcs_write32(PLE_WINDOW, vmx->ple_window); 8307 } 8308 8309 if (vmx->nested.sync_shadow_vmcs) { 8310 copy_vmcs12_to_shadow(vmx); 8311 vmx->nested.sync_shadow_vmcs = false; 8312 } 8313 8314 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 8315 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 8316 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 8317 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 8318 8319 cr4 = cr4_read_shadow(); 8320 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { 8321 vmcs_writel(HOST_CR4, cr4); 8322 vmx->host_state.vmcs_host_cr4 = cr4; 8323 } 8324 8325 /* When single-stepping over STI and MOV SS, we must clear the 8326 * corresponding interruptibility bits in the guest state. Otherwise 8327 * vmentry fails as it then expects bit 14 (BS) in pending debug 8328 * exceptions being set, but that's not correct for the guest debugging 8329 * case. */ 8330 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 8331 vmx_set_interrupt_shadow(vcpu, 0); 8332 8333 atomic_switch_perf_msrs(vmx); 8334 debugctlmsr = get_debugctlmsr(); 8335 8336 vmx->__launched = vmx->loaded_vmcs->launched; 8337 asm( 8338 /* Store host registers */ 8339 "push %%" _ASM_DX "; push %%" _ASM_BP ";" 8340 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ 8341 "push %%" _ASM_CX " \n\t" 8342 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 8343 "je 1f \n\t" 8344 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 8345 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 8346 "1: \n\t" 8347 /* Reload cr2 if changed */ 8348 "mov %c[cr2](%0), %%" _ASM_AX " \n\t" 8349 "mov %%cr2, %%" _ASM_DX " \n\t" 8350 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" 8351 "je 2f \n\t" 8352 "mov %%" _ASM_AX", %%cr2 \n\t" 8353 "2: \n\t" 8354 /* Check if vmlaunch of vmresume is needed */ 8355 "cmpl $0, %c[launched](%0) \n\t" 8356 /* Load guest registers. Don't clobber flags. */ 8357 "mov %c[rax](%0), %%" _ASM_AX " \n\t" 8358 "mov %c[rbx](%0), %%" _ASM_BX " \n\t" 8359 "mov %c[rdx](%0), %%" _ASM_DX " \n\t" 8360 "mov %c[rsi](%0), %%" _ASM_SI " \n\t" 8361 "mov %c[rdi](%0), %%" _ASM_DI " \n\t" 8362 "mov %c[rbp](%0), %%" _ASM_BP " \n\t" 8363#ifdef CONFIG_X86_64 8364 "mov %c[r8](%0), %%r8 \n\t" 8365 "mov %c[r9](%0), %%r9 \n\t" 8366 "mov %c[r10](%0), %%r10 \n\t" 8367 "mov %c[r11](%0), %%r11 \n\t" 8368 "mov %c[r12](%0), %%r12 \n\t" 8369 "mov %c[r13](%0), %%r13 \n\t" 8370 "mov %c[r14](%0), %%r14 \n\t" 8371 "mov %c[r15](%0), %%r15 \n\t" 8372#endif 8373 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ 8374 8375 /* Enter guest mode */ 8376 "jne 1f \n\t" 8377 __ex(ASM_VMX_VMLAUNCH) "\n\t" 8378 "jmp 2f \n\t" 8379 "1: " __ex(ASM_VMX_VMRESUME) "\n\t" 8380 "2: " 8381 /* Save guest registers, load host registers, keep flags */ 8382 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" 8383 "pop %0 \n\t" 8384 "mov %%" _ASM_AX ", %c[rax](%0) \n\t" 8385 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" 8386 __ASM_SIZE(pop) " %c[rcx](%0) \n\t" 8387 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" 8388 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" 8389 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" 8390 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" 8391#ifdef CONFIG_X86_64 8392 "mov %%r8, %c[r8](%0) \n\t" 8393 "mov %%r9, %c[r9](%0) \n\t" 8394 "mov %%r10, %c[r10](%0) \n\t" 8395 "mov %%r11, %c[r11](%0) \n\t" 8396 "mov %%r12, %c[r12](%0) \n\t" 8397 "mov %%r13, %c[r13](%0) \n\t" 8398 "mov %%r14, %c[r14](%0) \n\t" 8399 "mov %%r15, %c[r15](%0) \n\t" 8400#endif 8401 "mov %%cr2, %%" _ASM_AX " \n\t" 8402 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" 8403 8404 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" 8405 "setbe %c[fail](%0) \n\t" 8406 ".pushsection .rodata \n\t" 8407 ".global vmx_return \n\t" 8408 "vmx_return: " _ASM_PTR " 2b \n\t" 8409 ".popsection" 8410 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 8411 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 8412 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 8413 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 8414 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 8415 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 8416 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 8417 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), 8418 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), 8419 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), 8420 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), 8421#ifdef CONFIG_X86_64 8422 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), 8423 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), 8424 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), 8425 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), 8426 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), 8427 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), 8428 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 8429 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 8430#endif 8431 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), 8432 [wordsize]"i"(sizeof(ulong)) 8433 : "cc", "memory" 8434#ifdef CONFIG_X86_64 8435 , "rax", "rbx", "rdi", "rsi" 8436 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 8437#else 8438 , "eax", "ebx", "edi", "esi" 8439#endif 8440 ); 8441 8442 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 8443 if (debugctlmsr) 8444 update_debugctlmsr(debugctlmsr); 8445 8446#ifndef CONFIG_X86_64 8447 /* 8448 * The sysexit path does not restore ds/es, so we must set them to 8449 * a reasonable value ourselves. 8450 * 8451 * We can't defer this to vmx_load_host_state() since that function 8452 * may be executed in interrupt context, which saves and restore segments 8453 * around it, nullifying its effect. 8454 */ 8455 loadsegment(ds, __USER_DS); 8456 loadsegment(es, __USER_DS); 8457#endif 8458 8459 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 8460 | (1 << VCPU_EXREG_RFLAGS) 8461 | (1 << VCPU_EXREG_PDPTR) 8462 | (1 << VCPU_EXREG_SEGMENTS) 8463 | (1 << VCPU_EXREG_CR3)); 8464 vcpu->arch.regs_dirty = 0; 8465 8466 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 8467 8468 vmx->loaded_vmcs->launched = 1; 8469 8470 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 8471 trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); 8472 8473 /* 8474 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if 8475 * we did not inject a still-pending event to L1 now because of 8476 * nested_run_pending, we need to re-enable this bit. 8477 */ 8478 if (vmx->nested.nested_run_pending) 8479 kvm_make_request(KVM_REQ_EVENT, vcpu); 8480 8481 vmx->nested.nested_run_pending = 0; 8482 8483 vmx_complete_atomic_exit(vmx); 8484 vmx_recover_nmi_blocking(vmx); 8485 vmx_complete_interrupts(vmx); 8486} 8487 8488static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) 8489{ 8490 struct vcpu_vmx *vmx = to_vmx(vcpu); 8491 int cpu; 8492 8493 if (vmx->loaded_vmcs == &vmx->vmcs01) 8494 return; 8495 8496 cpu = get_cpu(); 8497 vmx->loaded_vmcs = &vmx->vmcs01; 8498 vmx_vcpu_put(vcpu); 8499 vmx_vcpu_load(vcpu, cpu); 8500 vcpu->cpu = cpu; 8501 put_cpu(); 8502} 8503 8504static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 8505{ 8506 struct vcpu_vmx *vmx = to_vmx(vcpu); 8507 8508 if (enable_pml) 8509 vmx_disable_pml(vmx); 8510 free_vpid(vmx); 8511 leave_guest_mode(vcpu); 8512 vmx_load_vmcs01(vcpu); 8513 free_nested(vmx); 8514 free_loaded_vmcs(vmx->loaded_vmcs); 8515 kfree(vmx->guest_msrs); 8516 kvm_vcpu_uninit(vcpu); 8517 kmem_cache_free(kvm_vcpu_cache, vmx); 8518} 8519 8520static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 8521{ 8522 int err; 8523 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 8524 int cpu; 8525 8526 if (!vmx) 8527 return ERR_PTR(-ENOMEM); 8528 8529 allocate_vpid(vmx); 8530 8531 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 8532 if (err) 8533 goto free_vcpu; 8534 8535 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 8536 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) 8537 > PAGE_SIZE); 8538 8539 err = -ENOMEM; 8540 if (!vmx->guest_msrs) { 8541 goto uninit_vcpu; 8542 } 8543 8544 vmx->loaded_vmcs = &vmx->vmcs01; 8545 vmx->loaded_vmcs->vmcs = alloc_vmcs(); 8546 if (!vmx->loaded_vmcs->vmcs) 8547 goto free_msrs; 8548 if (!vmm_exclusive) 8549 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); 8550 loaded_vmcs_init(vmx->loaded_vmcs); 8551 if (!vmm_exclusive) 8552 kvm_cpu_vmxoff(); 8553 8554 cpu = get_cpu(); 8555 vmx_vcpu_load(&vmx->vcpu, cpu); 8556 vmx->vcpu.cpu = cpu; 8557 err = vmx_vcpu_setup(vmx); 8558 vmx_vcpu_put(&vmx->vcpu); 8559 put_cpu(); 8560 if (err) 8561 goto free_vmcs; 8562 if (vm_need_virtualize_apic_accesses(kvm)) { 8563 err = alloc_apic_access_page(kvm); 8564 if (err) 8565 goto free_vmcs; 8566 } 8567 8568 if (enable_ept) { 8569 if (!kvm->arch.ept_identity_map_addr) 8570 kvm->arch.ept_identity_map_addr = 8571 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 8572 err = init_rmode_identity_map(kvm); 8573 if (err) 8574 goto free_vmcs; 8575 } 8576 8577 if (nested) 8578 nested_vmx_setup_ctls_msrs(vmx); 8579 8580 vmx->nested.posted_intr_nv = -1; 8581 vmx->nested.current_vmptr = -1ull; 8582 vmx->nested.current_vmcs12 = NULL; 8583 8584 /* 8585 * If PML is turned on, failure on enabling PML just results in failure 8586 * of creating the vcpu, therefore we can simplify PML logic (by 8587 * avoiding dealing with cases, such as enabling PML partially on vcpus 8588 * for the guest, etc. 8589 */ 8590 if (enable_pml) { 8591 err = vmx_enable_pml(vmx); 8592 if (err) 8593 goto free_vmcs; 8594 } 8595 8596 return &vmx->vcpu; 8597 8598free_vmcs: 8599 free_loaded_vmcs(vmx->loaded_vmcs); 8600free_msrs: 8601 kfree(vmx->guest_msrs); 8602uninit_vcpu: 8603 kvm_vcpu_uninit(&vmx->vcpu); 8604free_vcpu: 8605 free_vpid(vmx); 8606 kmem_cache_free(kvm_vcpu_cache, vmx); 8607 return ERR_PTR(err); 8608} 8609 8610static void __init vmx_check_processor_compat(void *rtn) 8611{ 8612 struct vmcs_config vmcs_conf; 8613 8614 *(int *)rtn = 0; 8615 if (setup_vmcs_config(&vmcs_conf) < 0) 8616 *(int *)rtn = -EIO; 8617 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 8618 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 8619 smp_processor_id()); 8620 *(int *)rtn = -EIO; 8621 } 8622} 8623 8624static int get_ept_level(void) 8625{ 8626 return VMX_EPT_DEFAULT_GAW + 1; 8627} 8628 8629static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 8630{ 8631 u8 cache; 8632 u64 ipat = 0; 8633 8634 /* For VT-d and EPT combination 8635 * 1. MMIO: guest may want to apply WC, trust it. 8636 * 2. EPT with VT-d: 8637 * a. VT-d without snooping control feature: can't guarantee the 8638 * result, try to trust guest. So the same as item 1. 8639 * b. VT-d with snooping control feature: snooping control feature of 8640 * VT-d engine can guarantee the cache correctness. Just set it 8641 * to WB to keep consistent with host. So the same as item 3. 8642 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep 8643 * consistent with host MTRR 8644 */ 8645 if (!is_mmio && !kvm_arch_has_noncoherent_dma(vcpu->kvm)) { 8646 ipat = VMX_EPT_IPAT_BIT; 8647 cache = MTRR_TYPE_WRBACK; 8648 goto exit; 8649 } 8650 8651 if (kvm_read_cr0(vcpu) & X86_CR0_CD) { 8652 ipat = VMX_EPT_IPAT_BIT; 8653 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 8654 cache = MTRR_TYPE_WRBACK; 8655 else 8656 cache = MTRR_TYPE_UNCACHABLE; 8657 goto exit; 8658 } 8659 8660 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); 8661 8662exit: 8663 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; 8664} 8665 8666static int vmx_get_lpage_level(void) 8667{ 8668 if (enable_ept && !cpu_has_vmx_ept_1g_page()) 8669 return PT_DIRECTORY_LEVEL; 8670 else 8671 /* For shadow and EPT supported 1GB page */ 8672 return PT_PDPE_LEVEL; 8673} 8674 8675static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 8676{ 8677 struct kvm_cpuid_entry2 *best; 8678 struct vcpu_vmx *vmx = to_vmx(vcpu); 8679 u32 exec_control; 8680 8681 vmx->rdtscp_enabled = false; 8682 if (vmx_rdtscp_supported()) { 8683 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8684 if (exec_control & SECONDARY_EXEC_RDTSCP) { 8685 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 8686 if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) 8687 vmx->rdtscp_enabled = true; 8688 else { 8689 exec_control &= ~SECONDARY_EXEC_RDTSCP; 8690 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8691 exec_control); 8692 } 8693 } 8694 if (nested && !vmx->rdtscp_enabled) 8695 vmx->nested.nested_vmx_secondary_ctls_high &= 8696 ~SECONDARY_EXEC_RDTSCP; 8697 } 8698 8699 /* Exposing INVPCID only when PCID is exposed */ 8700 best = kvm_find_cpuid_entry(vcpu, 0x7, 0); 8701 if (vmx_invpcid_supported() && 8702 best && (best->ebx & bit(X86_FEATURE_INVPCID)) && 8703 guest_cpuid_has_pcid(vcpu)) { 8704 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8705 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; 8706 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8707 exec_control); 8708 } else { 8709 if (cpu_has_secondary_exec_ctrls()) { 8710 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8711 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 8712 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8713 exec_control); 8714 } 8715 if (best) 8716 best->ebx &= ~bit(X86_FEATURE_INVPCID); 8717 } 8718} 8719 8720static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 8721{ 8722 if (func == 1 && nested) 8723 entry->ecx |= bit(X86_FEATURE_VMX); 8724} 8725 8726static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 8727 struct x86_exception *fault) 8728{ 8729 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8730 u32 exit_reason; 8731 8732 if (fault->error_code & PFERR_RSVD_MASK) 8733 exit_reason = EXIT_REASON_EPT_MISCONFIG; 8734 else 8735 exit_reason = EXIT_REASON_EPT_VIOLATION; 8736 nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); 8737 vmcs12->guest_physical_address = fault->address; 8738} 8739 8740/* Callbacks for nested_ept_init_mmu_context: */ 8741 8742static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) 8743{ 8744 /* return the page table to be shadowed - in our case, EPT12 */ 8745 return get_vmcs12(vcpu)->ept_pointer; 8746} 8747 8748static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 8749{ 8750 WARN_ON(mmu_is_nested(vcpu)); 8751 kvm_init_shadow_ept_mmu(vcpu, 8752 to_vmx(vcpu)->nested.nested_vmx_ept_caps & 8753 VMX_EPT_EXECUTE_ONLY_BIT); 8754 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 8755 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 8756 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 8757 8758 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 8759} 8760 8761static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 8762{ 8763 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 8764} 8765 8766static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 8767 u16 error_code) 8768{ 8769 bool inequality, bit; 8770 8771 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 8772 inequality = 8773 (error_code & vmcs12->page_fault_error_code_mask) != 8774 vmcs12->page_fault_error_code_match; 8775 return inequality ^ bit; 8776} 8777 8778static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 8779 struct x86_exception *fault) 8780{ 8781 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8782 8783 WARN_ON(!is_guest_mode(vcpu)); 8784 8785 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) 8786 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 8787 vmcs_read32(VM_EXIT_INTR_INFO), 8788 vmcs_readl(EXIT_QUALIFICATION)); 8789 else 8790 kvm_inject_page_fault(vcpu, fault); 8791} 8792 8793static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, 8794 struct vmcs12 *vmcs12) 8795{ 8796 struct vcpu_vmx *vmx = to_vmx(vcpu); 8797 int maxphyaddr = cpuid_maxphyaddr(vcpu); 8798 8799 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 8800 if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || 8801 vmcs12->apic_access_addr >> maxphyaddr) 8802 return false; 8803 8804 /* 8805 * Translate L1 physical address to host physical 8806 * address for vmcs02. Keep the page pinned, so this 8807 * physical address remains valid. We keep a reference 8808 * to it so we can release it later. 8809 */ 8810 if (vmx->nested.apic_access_page) /* shouldn't happen */ 8811 nested_release_page(vmx->nested.apic_access_page); 8812 vmx->nested.apic_access_page = 8813 nested_get_page(vcpu, vmcs12->apic_access_addr); 8814 } 8815 8816 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 8817 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || 8818 vmcs12->virtual_apic_page_addr >> maxphyaddr) 8819 return false; 8820 8821 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 8822 nested_release_page(vmx->nested.virtual_apic_page); 8823 vmx->nested.virtual_apic_page = 8824 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); 8825 8826 /* 8827 * Failing the vm entry is _not_ what the processor does 8828 * but it's basically the only possibility we have. 8829 * We could still enter the guest if CR8 load exits are 8830 * enabled, CR8 store exits are enabled, and virtualize APIC 8831 * access is disabled; in this case the processor would never 8832 * use the TPR shadow and we could simply clear the bit from 8833 * the execution control. But such a configuration is useless, 8834 * so let's keep the code simple. 8835 */ 8836 if (!vmx->nested.virtual_apic_page) 8837 return false; 8838 } 8839 8840 if (nested_cpu_has_posted_intr(vmcs12)) { 8841 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || 8842 vmcs12->posted_intr_desc_addr >> maxphyaddr) 8843 return false; 8844 8845 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 8846 kunmap(vmx->nested.pi_desc_page); 8847 nested_release_page(vmx->nested.pi_desc_page); 8848 } 8849 vmx->nested.pi_desc_page = 8850 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); 8851 if (!vmx->nested.pi_desc_page) 8852 return false; 8853 8854 vmx->nested.pi_desc = 8855 (struct pi_desc *)kmap(vmx->nested.pi_desc_page); 8856 if (!vmx->nested.pi_desc) { 8857 nested_release_page_clean(vmx->nested.pi_desc_page); 8858 return false; 8859 } 8860 vmx->nested.pi_desc = 8861 (struct pi_desc *)((void *)vmx->nested.pi_desc + 8862 (unsigned long)(vmcs12->posted_intr_desc_addr & 8863 (PAGE_SIZE - 1))); 8864 } 8865 8866 return true; 8867} 8868 8869static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 8870{ 8871 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 8872 struct vcpu_vmx *vmx = to_vmx(vcpu); 8873 8874 if (vcpu->arch.virtual_tsc_khz == 0) 8875 return; 8876 8877 /* Make sure short timeouts reliably trigger an immediate vmexit. 8878 * hrtimer_start does not guarantee this. */ 8879 if (preemption_timeout <= 1) { 8880 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 8881 return; 8882 } 8883 8884 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 8885 preemption_timeout *= 1000000; 8886 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 8887 hrtimer_start(&vmx->nested.preemption_timer, 8888 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 8889} 8890 8891static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 8892 struct vmcs12 *vmcs12) 8893{ 8894 int maxphyaddr; 8895 u64 addr; 8896 8897 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 8898 return 0; 8899 8900 if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { 8901 WARN_ON(1); 8902 return -EINVAL; 8903 } 8904 maxphyaddr = cpuid_maxphyaddr(vcpu); 8905 8906 if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || 8907 ((addr + PAGE_SIZE) >> maxphyaddr)) 8908 return -EINVAL; 8909 8910 return 0; 8911} 8912 8913/* 8914 * Merge L0's and L1's MSR bitmap, return false to indicate that 8915 * we do not use the hardware. 8916 */ 8917static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, 8918 struct vmcs12 *vmcs12) 8919{ 8920 int msr; 8921 struct page *page; 8922 unsigned long *msr_bitmap; 8923 8924 if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) 8925 return false; 8926 8927 page = nested_get_page(vcpu, vmcs12->msr_bitmap); 8928 if (!page) { 8929 WARN_ON(1); 8930 return false; 8931 } 8932 msr_bitmap = (unsigned long *)kmap(page); 8933 if (!msr_bitmap) { 8934 nested_release_page_clean(page); 8935 WARN_ON(1); 8936 return false; 8937 } 8938 8939 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 8940 if (nested_cpu_has_apic_reg_virt(vmcs12)) 8941 for (msr = 0x800; msr <= 0x8ff; msr++) 8942 nested_vmx_disable_intercept_for_msr( 8943 msr_bitmap, 8944 vmx_msr_bitmap_nested, 8945 msr, MSR_TYPE_R); 8946 /* TPR is allowed */ 8947 nested_vmx_disable_intercept_for_msr(msr_bitmap, 8948 vmx_msr_bitmap_nested, 8949 APIC_BASE_MSR + (APIC_TASKPRI >> 4), 8950 MSR_TYPE_R | MSR_TYPE_W); 8951 if (nested_cpu_has_vid(vmcs12)) { 8952 /* EOI and self-IPI are allowed */ 8953 nested_vmx_disable_intercept_for_msr( 8954 msr_bitmap, 8955 vmx_msr_bitmap_nested, 8956 APIC_BASE_MSR + (APIC_EOI >> 4), 8957 MSR_TYPE_W); 8958 nested_vmx_disable_intercept_for_msr( 8959 msr_bitmap, 8960 vmx_msr_bitmap_nested, 8961 APIC_BASE_MSR + (APIC_SELF_IPI >> 4), 8962 MSR_TYPE_W); 8963 } 8964 } else { 8965 /* 8966 * Enable reading intercept of all the x2apic 8967 * MSRs. We should not rely on vmcs12 to do any 8968 * optimizations here, it may have been modified 8969 * by L1. 8970 */ 8971 for (msr = 0x800; msr <= 0x8ff; msr++) 8972 __vmx_enable_intercept_for_msr( 8973 vmx_msr_bitmap_nested, 8974 msr, 8975 MSR_TYPE_R); 8976 8977 __vmx_enable_intercept_for_msr( 8978 vmx_msr_bitmap_nested, 8979 APIC_BASE_MSR + (APIC_TASKPRI >> 4), 8980 MSR_TYPE_W); 8981 __vmx_enable_intercept_for_msr( 8982 vmx_msr_bitmap_nested, 8983 APIC_BASE_MSR + (APIC_EOI >> 4), 8984 MSR_TYPE_W); 8985 __vmx_enable_intercept_for_msr( 8986 vmx_msr_bitmap_nested, 8987 APIC_BASE_MSR + (APIC_SELF_IPI >> 4), 8988 MSR_TYPE_W); 8989 } 8990 kunmap(page); 8991 nested_release_page_clean(page); 8992 8993 return true; 8994} 8995 8996static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 8997 struct vmcs12 *vmcs12) 8998{ 8999 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 9000 !nested_cpu_has_apic_reg_virt(vmcs12) && 9001 !nested_cpu_has_vid(vmcs12) && 9002 !nested_cpu_has_posted_intr(vmcs12)) 9003 return 0; 9004 9005 /* 9006 * If virtualize x2apic mode is enabled, 9007 * virtualize apic access must be disabled. 9008 */ 9009 if (nested_cpu_has_virt_x2apic_mode(vmcs12) && 9010 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 9011 return -EINVAL; 9012 9013 /* 9014 * If virtual interrupt delivery is enabled, 9015 * we must exit on external interrupts. 9016 */ 9017 if (nested_cpu_has_vid(vmcs12) && 9018 !nested_exit_on_intr(vcpu)) 9019 return -EINVAL; 9020 9021 /* 9022 * bits 15:8 should be zero in posted_intr_nv, 9023 * the descriptor address has been already checked 9024 * in nested_get_vmcs12_pages. 9025 */ 9026 if (nested_cpu_has_posted_intr(vmcs12) && 9027 (!nested_cpu_has_vid(vmcs12) || 9028 !nested_exit_intr_ack_set(vcpu) || 9029 vmcs12->posted_intr_nv & 0xff00)) 9030 return -EINVAL; 9031 9032 /* tpr shadow is needed by all apicv features. */ 9033 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 9034 return -EINVAL; 9035 9036 return 0; 9037} 9038 9039static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 9040 unsigned long count_field, 9041 unsigned long addr_field) 9042{ 9043 int maxphyaddr; 9044 u64 count, addr; 9045 9046 if (vmcs12_read_any(vcpu, count_field, &count) || 9047 vmcs12_read_any(vcpu, addr_field, &addr)) { 9048 WARN_ON(1); 9049 return -EINVAL; 9050 } 9051 if (count == 0) 9052 return 0; 9053 maxphyaddr = cpuid_maxphyaddr(vcpu); 9054 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || 9055 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { 9056 pr_warn_ratelimited( 9057 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", 9058 addr_field, maxphyaddr, count, addr); 9059 return -EINVAL; 9060 } 9061 return 0; 9062} 9063 9064static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, 9065 struct vmcs12 *vmcs12) 9066{ 9067 if (vmcs12->vm_exit_msr_load_count == 0 && 9068 vmcs12->vm_exit_msr_store_count == 0 && 9069 vmcs12->vm_entry_msr_load_count == 0) 9070 return 0; /* Fast path */ 9071 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, 9072 VM_EXIT_MSR_LOAD_ADDR) || 9073 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, 9074 VM_EXIT_MSR_STORE_ADDR) || 9075 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, 9076 VM_ENTRY_MSR_LOAD_ADDR)) 9077 return -EINVAL; 9078 return 0; 9079} 9080 9081static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 9082 struct vmx_msr_entry *e) 9083{ 9084 /* x2APIC MSR accesses are not allowed */ 9085 if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) 9086 return -EINVAL; 9087 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ 9088 e->index == MSR_IA32_UCODE_REV) 9089 return -EINVAL; 9090 if (e->reserved != 0) 9091 return -EINVAL; 9092 return 0; 9093} 9094 9095static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 9096 struct vmx_msr_entry *e) 9097{ 9098 if (e->index == MSR_FS_BASE || 9099 e->index == MSR_GS_BASE || 9100 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ 9101 nested_vmx_msr_check_common(vcpu, e)) 9102 return -EINVAL; 9103 return 0; 9104} 9105 9106static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 9107 struct vmx_msr_entry *e) 9108{ 9109 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ 9110 nested_vmx_msr_check_common(vcpu, e)) 9111 return -EINVAL; 9112 return 0; 9113} 9114 9115/* 9116 * Load guest's/host's msr at nested entry/exit. 9117 * return 0 for success, entry index for failure. 9118 */ 9119static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 9120{ 9121 u32 i; 9122 struct vmx_msr_entry e; 9123 struct msr_data msr; 9124 9125 msr.host_initiated = false; 9126 for (i = 0; i < count; i++) { 9127 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 9128 &e, sizeof(e))) { 9129 pr_warn_ratelimited( 9130 "%s cannot read MSR entry (%u, 0x%08llx)\n", 9131 __func__, i, gpa + i * sizeof(e)); 9132 goto fail; 9133 } 9134 if (nested_vmx_load_msr_check(vcpu, &e)) { 9135 pr_warn_ratelimited( 9136 "%s check failed (%u, 0x%x, 0x%x)\n", 9137 __func__, i, e.index, e.reserved); 9138 goto fail; 9139 } 9140 msr.index = e.index; 9141 msr.data = e.value; 9142 if (kvm_set_msr(vcpu, &msr)) { 9143 pr_warn_ratelimited( 9144 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 9145 __func__, i, e.index, e.value); 9146 goto fail; 9147 } 9148 } 9149 return 0; 9150fail: 9151 return i + 1; 9152} 9153 9154static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 9155{ 9156 u32 i; 9157 struct vmx_msr_entry e; 9158 9159 for (i = 0; i < count; i++) { 9160 struct msr_data msr_info; 9161 if (kvm_vcpu_read_guest(vcpu, 9162 gpa + i * sizeof(e), 9163 &e, 2 * sizeof(u32))) { 9164 pr_warn_ratelimited( 9165 "%s cannot read MSR entry (%u, 0x%08llx)\n", 9166 __func__, i, gpa + i * sizeof(e)); 9167 return -EINVAL; 9168 } 9169 if (nested_vmx_store_msr_check(vcpu, &e)) { 9170 pr_warn_ratelimited( 9171 "%s check failed (%u, 0x%x, 0x%x)\n", 9172 __func__, i, e.index, e.reserved); 9173 return -EINVAL; 9174 } 9175 msr_info.host_initiated = false; 9176 msr_info.index = e.index; 9177 if (kvm_get_msr(vcpu, &msr_info)) { 9178 pr_warn_ratelimited( 9179 "%s cannot read MSR (%u, 0x%x)\n", 9180 __func__, i, e.index); 9181 return -EINVAL; 9182 } 9183 if (kvm_vcpu_write_guest(vcpu, 9184 gpa + i * sizeof(e) + 9185 offsetof(struct vmx_msr_entry, value), 9186 &msr_info.data, sizeof(msr_info.data))) { 9187 pr_warn_ratelimited( 9188 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 9189 __func__, i, e.index, msr_info.data); 9190 return -EINVAL; 9191 } 9192 } 9193 return 0; 9194} 9195 9196/* 9197 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 9198 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 9199 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 9200 * guest in a way that will both be appropriate to L1's requests, and our 9201 * needs. In addition to modifying the active vmcs (which is vmcs02), this 9202 * function also has additional necessary side-effects, like setting various 9203 * vcpu->arch fields. 9204 */ 9205static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 9206{ 9207 struct vcpu_vmx *vmx = to_vmx(vcpu); 9208 u32 exec_control; 9209 9210 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 9211 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 9212 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 9213 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 9214 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 9215 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 9216 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 9217 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 9218 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 9219 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 9220 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 9221 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 9222 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 9223 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 9224 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 9225 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 9226 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 9227 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 9228 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 9229 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 9230 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 9231 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 9232 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 9233 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 9234 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 9235 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 9236 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 9237 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 9238 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 9239 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 9240 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 9241 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 9242 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 9243 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 9244 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 9245 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 9246 9247 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 9248 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 9249 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 9250 } else { 9251 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 9252 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 9253 } 9254 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 9255 vmcs12->vm_entry_intr_info_field); 9256 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 9257 vmcs12->vm_entry_exception_error_code); 9258 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 9259 vmcs12->vm_entry_instruction_len); 9260 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 9261 vmcs12->guest_interruptibility_info); 9262 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 9263 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 9264 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 9265 vmcs12->guest_pending_dbg_exceptions); 9266 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 9267 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 9268 9269 if (nested_cpu_has_xsaves(vmcs12)) 9270 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 9271 vmcs_write64(VMCS_LINK_POINTER, -1ull); 9272 9273 exec_control = vmcs12->pin_based_vm_exec_control; 9274 exec_control |= vmcs_config.pin_based_exec_ctrl; 9275 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 9276 9277 if (nested_cpu_has_posted_intr(vmcs12)) { 9278 /* 9279 * Note that we use L0's vector here and in 9280 * vmx_deliver_nested_posted_interrupt. 9281 */ 9282 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 9283 vmx->nested.pi_pending = false; 9284 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); 9285 vmcs_write64(POSTED_INTR_DESC_ADDR, 9286 page_to_phys(vmx->nested.pi_desc_page) + 9287 (unsigned long)(vmcs12->posted_intr_desc_addr & 9288 (PAGE_SIZE - 1))); 9289 } else 9290 exec_control &= ~PIN_BASED_POSTED_INTR; 9291 9292 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 9293 9294 vmx->nested.preemption_timer_expired = false; 9295 if (nested_cpu_has_preemption_timer(vmcs12)) 9296 vmx_start_preemption_timer(vcpu); 9297 9298 /* 9299 * Whether page-faults are trapped is determined by a combination of 9300 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 9301 * If enable_ept, L0 doesn't care about page faults and we should 9302 * set all of these to L1's desires. However, if !enable_ept, L0 does 9303 * care about (at least some) page faults, and because it is not easy 9304 * (if at all possible?) to merge L0 and L1's desires, we simply ask 9305 * to exit on each and every L2 page fault. This is done by setting 9306 * MASK=MATCH=0 and (see below) EB.PF=1. 9307 * Note that below we don't need special code to set EB.PF beyond the 9308 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 9309 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 9310 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 9311 * 9312 * A problem with this approach (when !enable_ept) is that L1 may be 9313 * injected with more page faults than it asked for. This could have 9314 * caused problems, but in practice existing hypervisors don't care. 9315 * To fix this, we will need to emulate the PFEC checking (on the L1 9316 * page tables), using walk_addr(), when injecting PFs to L1. 9317 */ 9318 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 9319 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 9320 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 9321 enable_ept ? vmcs12->page_fault_error_code_match : 0); 9322 9323 if (cpu_has_secondary_exec_ctrls()) { 9324 exec_control = vmx_secondary_exec_control(vmx); 9325 if (!vmx->rdtscp_enabled) 9326 exec_control &= ~SECONDARY_EXEC_RDTSCP; 9327 /* Take the following fields only from vmcs12 */ 9328 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 9329 SECONDARY_EXEC_RDTSCP | 9330 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 9331 SECONDARY_EXEC_APIC_REGISTER_VIRT); 9332 if (nested_cpu_has(vmcs12, 9333 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 9334 exec_control |= vmcs12->secondary_vm_exec_control; 9335 9336 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { 9337 /* 9338 * If translation failed, no matter: This feature asks 9339 * to exit when accessing the given address, and if it 9340 * can never be accessed, this feature won't do 9341 * anything anyway. 9342 */ 9343 if (!vmx->nested.apic_access_page) 9344 exec_control &= 9345 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9346 else 9347 vmcs_write64(APIC_ACCESS_ADDR, 9348 page_to_phys(vmx->nested.apic_access_page)); 9349 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && 9350 (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { 9351 exec_control |= 9352 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9353 kvm_vcpu_reload_apic_access_page(vcpu); 9354 } 9355 9356 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 9357 vmcs_write64(EOI_EXIT_BITMAP0, 9358 vmcs12->eoi_exit_bitmap0); 9359 vmcs_write64(EOI_EXIT_BITMAP1, 9360 vmcs12->eoi_exit_bitmap1); 9361 vmcs_write64(EOI_EXIT_BITMAP2, 9362 vmcs12->eoi_exit_bitmap2); 9363 vmcs_write64(EOI_EXIT_BITMAP3, 9364 vmcs12->eoi_exit_bitmap3); 9365 vmcs_write16(GUEST_INTR_STATUS, 9366 vmcs12->guest_intr_status); 9367 } 9368 9369 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 9370 } 9371 9372 9373 /* 9374 * Set host-state according to L0's settings (vmcs12 is irrelevant here) 9375 * Some constant fields are set here by vmx_set_constant_host_state(). 9376 * Other fields are different per CPU, and will be set later when 9377 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 9378 */ 9379 vmx_set_constant_host_state(vmx); 9380 9381 /* 9382 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before 9383 * entry, but only if the current (host) sp changed from the value 9384 * we wrote last (vmx->host_rsp). This cache is no longer relevant 9385 * if we switch vmcs, and rather than hold a separate cache per vmcs, 9386 * here we just force the write to happen on entry. 9387 */ 9388 vmx->host_rsp = 0; 9389 9390 exec_control = vmx_exec_control(vmx); /* L0's desires */ 9391 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 9392 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 9393 exec_control &= ~CPU_BASED_TPR_SHADOW; 9394 exec_control |= vmcs12->cpu_based_vm_exec_control; 9395 9396 if (exec_control & CPU_BASED_TPR_SHADOW) { 9397 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 9398 page_to_phys(vmx->nested.virtual_apic_page)); 9399 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 9400 } 9401 9402 if (cpu_has_vmx_msr_bitmap() && 9403 exec_control & CPU_BASED_USE_MSR_BITMAPS) { 9404 nested_vmx_merge_msr_bitmap(vcpu, vmcs12); 9405 /* MSR_BITMAP will be set by following vmx_set_efer. */ 9406 } else 9407 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 9408 9409 /* 9410 * Merging of IO bitmap not currently supported. 9411 * Rather, exit every time. 9412 */ 9413 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 9414 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 9415 9416 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 9417 9418 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 9419 * bitwise-or of what L1 wants to trap for L2, and what we want to 9420 * trap. Note that CR0.TS also needs updating - we do this later. 9421 */ 9422 update_exception_bitmap(vcpu); 9423 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 9424 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 9425 9426 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so 9427 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 9428 * bits are further modified by vmx_set_efer() below. 9429 */ 9430 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 9431 9432 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are 9433 * emulated by vmx_set_efer(), below. 9434 */ 9435 vm_entry_controls_init(vmx, 9436 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & 9437 ~VM_ENTRY_IA32E_MODE) | 9438 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 9439 9440 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { 9441 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 9442 vcpu->arch.pat = vmcs12->guest_ia32_pat; 9443 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 9444 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 9445 9446 9447 set_cr4_guest_host_mask(vmx); 9448 9449 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) 9450 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 9451 9452 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 9453 vmcs_write64(TSC_OFFSET, 9454 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); 9455 else 9456 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 9457 9458 if (enable_vpid) { 9459 /* 9460 * Trivially support vpid by letting L2s share their parent 9461 * L1's vpid. TODO: move to a more elaborate solution, giving 9462 * each L2 its own vpid and exposing the vpid feature to L1. 9463 */ 9464 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 9465 vmx_flush_tlb(vcpu); 9466 } 9467 9468 if (nested_cpu_has_ept(vmcs12)) { 9469 kvm_mmu_unload(vcpu); 9470 nested_ept_init_mmu_context(vcpu); 9471 } 9472 9473 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 9474 vcpu->arch.efer = vmcs12->guest_ia32_efer; 9475 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 9476 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 9477 else 9478 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 9479 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 9480 vmx_set_efer(vcpu, vcpu->arch.efer); 9481 9482 /* 9483 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified 9484 * TS bit (for lazy fpu) and bits which we consider mandatory enabled. 9485 * The CR0_READ_SHADOW is what L2 should have expected to read given 9486 * the specifications by L1; It's not enough to take 9487 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 9488 * have more bits than L1 expected. 9489 */ 9490 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 9491 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 9492 9493 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 9494 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 9495 9496 /* shadow page tables on either EPT or shadow page tables */ 9497 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 9498 kvm_mmu_reset_context(vcpu); 9499 9500 if (!enable_ept) 9501 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 9502 9503 /* 9504 * L1 may access the L2's PDPTR, so save them to construct vmcs12 9505 */ 9506 if (enable_ept) { 9507 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 9508 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 9509 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 9510 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 9511 } 9512 9513 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 9514 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 9515} 9516 9517/* 9518 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 9519 * for running an L2 nested guest. 9520 */ 9521static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 9522{ 9523 struct vmcs12 *vmcs12; 9524 struct vcpu_vmx *vmx = to_vmx(vcpu); 9525 int cpu; 9526 struct loaded_vmcs *vmcs02; 9527 bool ia32e; 9528 u32 msr_entry_idx; 9529 9530 if (!nested_vmx_check_permission(vcpu) || 9531 !nested_vmx_check_vmcs12(vcpu)) 9532 return 1; 9533 9534 skip_emulated_instruction(vcpu); 9535 vmcs12 = get_vmcs12(vcpu); 9536 9537 if (enable_shadow_vmcs) 9538 copy_shadow_to_vmcs12(vmx); 9539 9540 /* 9541 * The nested entry process starts with enforcing various prerequisites 9542 * on vmcs12 as required by the Intel SDM, and act appropriately when 9543 * they fail: As the SDM explains, some conditions should cause the 9544 * instruction to fail, while others will cause the instruction to seem 9545 * to succeed, but return an EXIT_REASON_INVALID_STATE. 9546 * To speed up the normal (success) code path, we should avoid checking 9547 * for misconfigurations which will anyway be caught by the processor 9548 * when using the merged vmcs02. 9549 */ 9550 if (vmcs12->launch_state == launch) { 9551 nested_vmx_failValid(vcpu, 9552 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 9553 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 9554 return 1; 9555 } 9556 9557 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 9558 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { 9559 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9560 return 1; 9561 } 9562 9563 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9564 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9565 return 1; 9566 } 9567 9568 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { 9569 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9570 return 1; 9571 } 9572 9573 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { 9574 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9575 return 1; 9576 } 9577 9578 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { 9579 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9580 return 1; 9581 } 9582 9583 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 9584 vmx->nested.nested_vmx_true_procbased_ctls_low, 9585 vmx->nested.nested_vmx_procbased_ctls_high) || 9586 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 9587 vmx->nested.nested_vmx_secondary_ctls_low, 9588 vmx->nested.nested_vmx_secondary_ctls_high) || 9589 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 9590 vmx->nested.nested_vmx_pinbased_ctls_low, 9591 vmx->nested.nested_vmx_pinbased_ctls_high) || 9592 !vmx_control_verify(vmcs12->vm_exit_controls, 9593 vmx->nested.nested_vmx_true_exit_ctls_low, 9594 vmx->nested.nested_vmx_exit_ctls_high) || 9595 !vmx_control_verify(vmcs12->vm_entry_controls, 9596 vmx->nested.nested_vmx_true_entry_ctls_low, 9597 vmx->nested.nested_vmx_entry_ctls_high)) 9598 { 9599 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9600 return 1; 9601 } 9602 9603 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || 9604 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9605 nested_vmx_failValid(vcpu, 9606 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 9607 return 1; 9608 } 9609 9610 if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || 9611 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9612 nested_vmx_entry_failure(vcpu, vmcs12, 9613 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9614 return 1; 9615 } 9616 if (vmcs12->vmcs_link_pointer != -1ull) { 9617 nested_vmx_entry_failure(vcpu, vmcs12, 9618 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); 9619 return 1; 9620 } 9621 9622 /* 9623 * If the load IA32_EFER VM-entry control is 1, the following checks 9624 * are performed on the field for the IA32_EFER MSR: 9625 * - Bits reserved in the IA32_EFER MSR must be 0. 9626 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 9627 * the IA-32e mode guest VM-exit control. It must also be identical 9628 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 9629 * CR0.PG) is 1. 9630 */ 9631 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { 9632 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 9633 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 9634 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 9635 ((vmcs12->guest_cr0 & X86_CR0_PG) && 9636 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { 9637 nested_vmx_entry_failure(vcpu, vmcs12, 9638 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9639 return 1; 9640 } 9641 } 9642 9643 /* 9644 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 9645 * IA32_EFER MSR must be 0 in the field for that register. In addition, 9646 * the values of the LMA and LME bits in the field must each be that of 9647 * the host address-space size VM-exit control. 9648 */ 9649 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 9650 ia32e = (vmcs12->vm_exit_controls & 9651 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 9652 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 9653 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 9654 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { 9655 nested_vmx_entry_failure(vcpu, vmcs12, 9656 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9657 return 1; 9658 } 9659 } 9660 9661 /* 9662 * We're finally done with prerequisite checking, and can start with 9663 * the nested entry. 9664 */ 9665 9666 vmcs02 = nested_get_current_vmcs02(vmx); 9667 if (!vmcs02) 9668 return -ENOMEM; 9669 9670 enter_guest_mode(vcpu); 9671 9672 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); 9673 9674 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 9675 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 9676 9677 cpu = get_cpu(); 9678 vmx->loaded_vmcs = vmcs02; 9679 vmx_vcpu_put(vcpu); 9680 vmx_vcpu_load(vcpu, cpu); 9681 vcpu->cpu = cpu; 9682 put_cpu(); 9683 9684 vmx_segment_cache_clear(vmx); 9685 9686 prepare_vmcs02(vcpu, vmcs12); 9687 9688 msr_entry_idx = nested_vmx_load_msr(vcpu, 9689 vmcs12->vm_entry_msr_load_addr, 9690 vmcs12->vm_entry_msr_load_count); 9691 if (msr_entry_idx) { 9692 leave_guest_mode(vcpu); 9693 vmx_load_vmcs01(vcpu); 9694 nested_vmx_entry_failure(vcpu, vmcs12, 9695 EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); 9696 return 1; 9697 } 9698 9699 vmcs12->launch_state = 1; 9700 9701 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 9702 return kvm_vcpu_halt(vcpu); 9703 9704 vmx->nested.nested_run_pending = 1; 9705 9706 /* 9707 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 9708 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 9709 * returned as far as L1 is concerned. It will only return (and set 9710 * the success flag) when L2 exits (see nested_vmx_vmexit()). 9711 */ 9712 return 1; 9713} 9714 9715/* 9716 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 9717 * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). 9718 * This function returns the new value we should put in vmcs12.guest_cr0. 9719 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 9720 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 9721 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 9722 * didn't trap the bit, because if L1 did, so would L0). 9723 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 9724 * been modified by L2, and L1 knows it. So just leave the old value of 9725 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 9726 * isn't relevant, because if L0 traps this bit it can set it to anything. 9727 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 9728 * changed these bits, and therefore they need to be updated, but L0 9729 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 9730 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 9731 */ 9732static inline unsigned long 9733vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 9734{ 9735 return 9736 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 9737 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 9738 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 9739 vcpu->arch.cr0_guest_owned_bits)); 9740} 9741 9742static inline unsigned long 9743vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 9744{ 9745 return 9746 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 9747 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 9748 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 9749 vcpu->arch.cr4_guest_owned_bits)); 9750} 9751 9752static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 9753 struct vmcs12 *vmcs12) 9754{ 9755 u32 idt_vectoring; 9756 unsigned int nr; 9757 9758 if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { 9759 nr = vcpu->arch.exception.nr; 9760 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 9761 9762 if (kvm_exception_is_soft(nr)) { 9763 vmcs12->vm_exit_instruction_len = 9764 vcpu->arch.event_exit_inst_len; 9765 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 9766 } else 9767 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 9768 9769 if (vcpu->arch.exception.has_error_code) { 9770 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 9771 vmcs12->idt_vectoring_error_code = 9772 vcpu->arch.exception.error_code; 9773 } 9774 9775 vmcs12->idt_vectoring_info_field = idt_vectoring; 9776 } else if (vcpu->arch.nmi_injected) { 9777 vmcs12->idt_vectoring_info_field = 9778 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 9779 } else if (vcpu->arch.interrupt.pending) { 9780 nr = vcpu->arch.interrupt.nr; 9781 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 9782 9783 if (vcpu->arch.interrupt.soft) { 9784 idt_vectoring |= INTR_TYPE_SOFT_INTR; 9785 vmcs12->vm_entry_instruction_len = 9786 vcpu->arch.event_exit_inst_len; 9787 } else 9788 idt_vectoring |= INTR_TYPE_EXT_INTR; 9789 9790 vmcs12->idt_vectoring_info_field = idt_vectoring; 9791 } 9792} 9793 9794static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 9795{ 9796 struct vcpu_vmx *vmx = to_vmx(vcpu); 9797 9798 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 9799 vmx->nested.preemption_timer_expired) { 9800 if (vmx->nested.nested_run_pending) 9801 return -EBUSY; 9802 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 9803 return 0; 9804 } 9805 9806 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 9807 if (vmx->nested.nested_run_pending || 9808 vcpu->arch.interrupt.pending) 9809 return -EBUSY; 9810 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 9811 NMI_VECTOR | INTR_TYPE_NMI_INTR | 9812 INTR_INFO_VALID_MASK, 0); 9813 /* 9814 * The NMI-triggered VM exit counts as injection: 9815 * clear this one and block further NMIs. 9816 */ 9817 vcpu->arch.nmi_pending = 0; 9818 vmx_set_nmi_mask(vcpu, true); 9819 return 0; 9820 } 9821 9822 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 9823 nested_exit_on_intr(vcpu)) { 9824 if (vmx->nested.nested_run_pending) 9825 return -EBUSY; 9826 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 9827 return 0; 9828 } 9829 9830 return vmx_complete_nested_posted_interrupt(vcpu); 9831} 9832 9833static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 9834{ 9835 ktime_t remaining = 9836 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 9837 u64 value; 9838 9839 if (ktime_to_ns(remaining) <= 0) 9840 return 0; 9841 9842 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 9843 do_div(value, 1000000); 9844 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 9845} 9846 9847/* 9848 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 9849 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 9850 * and this function updates it to reflect the changes to the guest state while 9851 * L2 was running (and perhaps made some exits which were handled directly by L0 9852 * without going back to L1), and to reflect the exit reason. 9853 * Note that we do not have to copy here all VMCS fields, just those that 9854 * could have changed by the L2 guest or the exit - i.e., the guest-state and 9855 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 9856 * which already writes to vmcs12 directly. 9857 */ 9858static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 9859 u32 exit_reason, u32 exit_intr_info, 9860 unsigned long exit_qualification) 9861{ 9862 /* update guest state fields: */ 9863 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 9864 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 9865 9866 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 9867 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); 9868 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 9869 9870 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 9871 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 9872 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 9873 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 9874 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 9875 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 9876 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 9877 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 9878 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 9879 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 9880 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 9881 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 9882 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 9883 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 9884 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 9885 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 9886 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 9887 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 9888 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 9889 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 9890 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 9891 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 9892 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 9893 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 9894 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 9895 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 9896 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 9897 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 9898 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 9899 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 9900 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 9901 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 9902 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 9903 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 9904 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 9905 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 9906 9907 vmcs12->guest_interruptibility_info = 9908 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 9909 vmcs12->guest_pending_dbg_exceptions = 9910 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 9911 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 9912 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 9913 else 9914 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 9915 9916 if (nested_cpu_has_preemption_timer(vmcs12)) { 9917 if (vmcs12->vm_exit_controls & 9918 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 9919 vmcs12->vmx_preemption_timer_value = 9920 vmx_get_preemption_timer_value(vcpu); 9921 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 9922 } 9923 9924 /* 9925 * In some cases (usually, nested EPT), L2 is allowed to change its 9926 * own CR3 without exiting. If it has changed it, we must keep it. 9927 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 9928 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 9929 * 9930 * Additionally, restore L2's PDPTR to vmcs12. 9931 */ 9932 if (enable_ept) { 9933 vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); 9934 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 9935 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 9936 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 9937 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 9938 } 9939 9940 if (nested_cpu_has_vid(vmcs12)) 9941 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 9942 9943 vmcs12->vm_entry_controls = 9944 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 9945 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 9946 9947 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { 9948 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 9949 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 9950 } 9951 9952 /* TODO: These cannot have changed unless we have MSR bitmaps and 9953 * the relevant bit asks not to trap the change */ 9954 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 9955 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); 9956 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 9957 vmcs12->guest_ia32_efer = vcpu->arch.efer; 9958 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 9959 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 9960 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 9961 if (vmx_mpx_supported()) 9962 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 9963 if (nested_cpu_has_xsaves(vmcs12)) 9964 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); 9965 9966 /* update exit information fields: */ 9967 9968 vmcs12->vm_exit_reason = exit_reason; 9969 vmcs12->exit_qualification = exit_qualification; 9970 9971 vmcs12->vm_exit_intr_info = exit_intr_info; 9972 if ((vmcs12->vm_exit_intr_info & 9973 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == 9974 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) 9975 vmcs12->vm_exit_intr_error_code = 9976 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 9977 vmcs12->idt_vectoring_info_field = 0; 9978 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 9979 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 9980 9981 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 9982 /* vm_entry_intr_info_field is cleared on exit. Emulate this 9983 * instead of reading the real value. */ 9984 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 9985 9986 /* 9987 * Transfer the event that L0 or L1 may wanted to inject into 9988 * L2 to IDT_VECTORING_INFO_FIELD. 9989 */ 9990 vmcs12_save_pending_event(vcpu, vmcs12); 9991 } 9992 9993 /* 9994 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 9995 * preserved above and would only end up incorrectly in L1. 9996 */ 9997 vcpu->arch.nmi_injected = false; 9998 kvm_clear_exception_queue(vcpu); 9999 kvm_clear_interrupt_queue(vcpu); 10000} 10001 10002/* 10003 * A part of what we need to when the nested L2 guest exits and we want to 10004 * run its L1 parent, is to reset L1's guest state to the host state specified 10005 * in vmcs12. 10006 * This function is to be called not only on normal nested exit, but also on 10007 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 10008 * Failures During or After Loading Guest State"). 10009 * This function should be called when the active VMCS is L1's (vmcs01). 10010 */ 10011static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 10012 struct vmcs12 *vmcs12) 10013{ 10014 struct kvm_segment seg; 10015 10016 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 10017 vcpu->arch.efer = vmcs12->host_ia32_efer; 10018 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 10019 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 10020 else 10021 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 10022 vmx_set_efer(vcpu, vcpu->arch.efer); 10023 10024 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 10025 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 10026 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 10027 /* 10028 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 10029 * actually changed, because it depends on the current state of 10030 * fpu_active (which may have changed). 10031 * Note that vmx_set_cr0 refers to efer set above. 10032 */ 10033 vmx_set_cr0(vcpu, vmcs12->host_cr0); 10034 /* 10035 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need 10036 * to apply the same changes to L1's vmcs. We just set cr0 correctly, 10037 * but we also need to update cr0_guest_host_mask and exception_bitmap. 10038 */ 10039 update_exception_bitmap(vcpu); 10040 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); 10041 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 10042 10043 /* 10044 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 10045 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); 10046 */ 10047 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 10048 kvm_set_cr4(vcpu, vmcs12->host_cr4); 10049 10050 nested_ept_uninit_mmu_context(vcpu); 10051 10052 kvm_set_cr3(vcpu, vmcs12->host_cr3); 10053 kvm_mmu_reset_context(vcpu); 10054 10055 if (!enable_ept) 10056 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 10057 10058 if (enable_vpid) { 10059 /* 10060 * Trivially support vpid by letting L2s share their parent 10061 * L1's vpid. TODO: move to a more elaborate solution, giving 10062 * each L2 its own vpid and exposing the vpid feature to L1. 10063 */ 10064 vmx_flush_tlb(vcpu); 10065 } 10066 10067 10068 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 10069 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 10070 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 10071 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 10072 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 10073 10074 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 10075 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 10076 vmcs_write64(GUEST_BNDCFGS, 0); 10077 10078 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 10079 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 10080 vcpu->arch.pat = vmcs12->host_ia32_pat; 10081 } 10082 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 10083 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 10084 vmcs12->host_ia32_perf_global_ctrl); 10085 10086 /* Set L1 segment info according to Intel SDM 10087 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 10088 seg = (struct kvm_segment) { 10089 .base = 0, 10090 .limit = 0xFFFFFFFF, 10091 .selector = vmcs12->host_cs_selector, 10092 .type = 11, 10093 .present = 1, 10094 .s = 1, 10095 .g = 1 10096 }; 10097 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 10098 seg.l = 1; 10099 else 10100 seg.db = 1; 10101 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 10102 seg = (struct kvm_segment) { 10103 .base = 0, 10104 .limit = 0xFFFFFFFF, 10105 .type = 3, 10106 .present = 1, 10107 .s = 1, 10108 .db = 1, 10109 .g = 1 10110 }; 10111 seg.selector = vmcs12->host_ds_selector; 10112 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 10113 seg.selector = vmcs12->host_es_selector; 10114 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 10115 seg.selector = vmcs12->host_ss_selector; 10116 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 10117 seg.selector = vmcs12->host_fs_selector; 10118 seg.base = vmcs12->host_fs_base; 10119 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 10120 seg.selector = vmcs12->host_gs_selector; 10121 seg.base = vmcs12->host_gs_base; 10122 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 10123 seg = (struct kvm_segment) { 10124 .base = vmcs12->host_tr_base, 10125 .limit = 0x67, 10126 .selector = vmcs12->host_tr_selector, 10127 .type = 11, 10128 .present = 1 10129 }; 10130 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 10131 10132 kvm_set_dr(vcpu, 7, 0x400); 10133 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 10134 10135 if (cpu_has_vmx_msr_bitmap()) 10136 vmx_set_msr_bitmap(vcpu); 10137 10138 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 10139 vmcs12->vm_exit_msr_load_count)) 10140 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 10141} 10142 10143/* 10144 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 10145 * and modify vmcs12 to make it see what it would expect to see there if 10146 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 10147 */ 10148static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 10149 u32 exit_intr_info, 10150 unsigned long exit_qualification) 10151{ 10152 struct vcpu_vmx *vmx = to_vmx(vcpu); 10153 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 10154 10155 /* trying to cancel vmlaunch/vmresume is a bug */ 10156 WARN_ON_ONCE(vmx->nested.nested_run_pending); 10157 10158 leave_guest_mode(vcpu); 10159 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 10160 exit_qualification); 10161 10162 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, 10163 vmcs12->vm_exit_msr_store_count)) 10164 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); 10165 10166 vmx_load_vmcs01(vcpu); 10167 10168 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 10169 && nested_exit_intr_ack_set(vcpu)) { 10170 int irq = kvm_cpu_get_interrupt(vcpu); 10171 WARN_ON(irq < 0); 10172 vmcs12->vm_exit_intr_info = irq | 10173 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 10174 } 10175 10176 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 10177 vmcs12->exit_qualification, 10178 vmcs12->idt_vectoring_info_field, 10179 vmcs12->vm_exit_intr_info, 10180 vmcs12->vm_exit_intr_error_code, 10181 KVM_ISA_VMX); 10182 10183 vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); 10184 vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); 10185 vmx_segment_cache_clear(vmx); 10186 10187 /* if no vmcs02 cache requested, remove the one we used */ 10188 if (VMCS02_POOL_SIZE == 0) 10189 nested_free_vmcs02(vmx, vmx->nested.current_vmptr); 10190 10191 load_vmcs12_host_state(vcpu, vmcs12); 10192 10193 /* Update TSC_OFFSET if TSC was changed while L2 ran */ 10194 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 10195 10196 /* This is needed for same reason as it was needed in prepare_vmcs02 */ 10197 vmx->host_rsp = 0; 10198 10199 /* Unpin physical memory we referred to in vmcs02 */ 10200 if (vmx->nested.apic_access_page) { 10201 nested_release_page(vmx->nested.apic_access_page); 10202 vmx->nested.apic_access_page = NULL; 10203 } 10204 if (vmx->nested.virtual_apic_page) { 10205 nested_release_page(vmx->nested.virtual_apic_page); 10206 vmx->nested.virtual_apic_page = NULL; 10207 } 10208 if (vmx->nested.pi_desc_page) { 10209 kunmap(vmx->nested.pi_desc_page); 10210 nested_release_page(vmx->nested.pi_desc_page); 10211 vmx->nested.pi_desc_page = NULL; 10212 vmx->nested.pi_desc = NULL; 10213 } 10214 10215 /* 10216 * We are now running in L2, mmu_notifier will force to reload the 10217 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 10218 */ 10219 kvm_vcpu_reload_apic_access_page(vcpu); 10220 10221 /* 10222 * Exiting from L2 to L1, we're now back to L1 which thinks it just 10223 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the 10224 * success or failure flag accordingly. 10225 */ 10226 if (unlikely(vmx->fail)) { 10227 vmx->fail = 0; 10228 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); 10229 } else 10230 nested_vmx_succeed(vcpu); 10231 if (enable_shadow_vmcs) 10232 vmx->nested.sync_shadow_vmcs = true; 10233 10234 /* in case we halted in L2 */ 10235 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 10236} 10237 10238/* 10239 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 10240 */ 10241static void vmx_leave_nested(struct kvm_vcpu *vcpu) 10242{ 10243 if (is_guest_mode(vcpu)) 10244 nested_vmx_vmexit(vcpu, -1, 0, 0); 10245 free_nested(to_vmx(vcpu)); 10246} 10247 10248/* 10249 * L1's failure to enter L2 is a subset of a normal exit, as explained in 10250 * 23.7 "VM-entry failures during or after loading guest state" (this also 10251 * lists the acceptable exit-reason and exit-qualification parameters). 10252 * It should only be called before L2 actually succeeded to run, and when 10253 * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss). 10254 */ 10255static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, 10256 struct vmcs12 *vmcs12, 10257 u32 reason, unsigned long qualification) 10258{ 10259 load_vmcs12_host_state(vcpu, vmcs12); 10260 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 10261 vmcs12->exit_qualification = qualification; 10262 nested_vmx_succeed(vcpu); 10263 if (enable_shadow_vmcs) 10264 to_vmx(vcpu)->nested.sync_shadow_vmcs = true; 10265} 10266 10267static int vmx_check_intercept(struct kvm_vcpu *vcpu, 10268 struct x86_instruction_info *info, 10269 enum x86_intercept_stage stage) 10270{ 10271 return X86EMUL_CONTINUE; 10272} 10273 10274static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) 10275{ 10276 if (ple_gap) 10277 shrink_ple_window(vcpu); 10278} 10279 10280static void vmx_slot_enable_log_dirty(struct kvm *kvm, 10281 struct kvm_memory_slot *slot) 10282{ 10283 kvm_mmu_slot_leaf_clear_dirty(kvm, slot); 10284 kvm_mmu_slot_largepage_remove_write_access(kvm, slot); 10285} 10286 10287static void vmx_slot_disable_log_dirty(struct kvm *kvm, 10288 struct kvm_memory_slot *slot) 10289{ 10290 kvm_mmu_slot_set_dirty(kvm, slot); 10291} 10292 10293static void vmx_flush_log_dirty(struct kvm *kvm) 10294{ 10295 kvm_flush_pml_buffers(kvm); 10296} 10297 10298static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, 10299 struct kvm_memory_slot *memslot, 10300 gfn_t offset, unsigned long mask) 10301{ 10302 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); 10303} 10304 10305static struct kvm_x86_ops vmx_x86_ops = { 10306 .cpu_has_kvm_support = cpu_has_kvm_support, 10307 .disabled_by_bios = vmx_disabled_by_bios, 10308 .hardware_setup = hardware_setup, 10309 .hardware_unsetup = hardware_unsetup, 10310 .check_processor_compatibility = vmx_check_processor_compat, 10311 .hardware_enable = hardware_enable, 10312 .hardware_disable = hardware_disable, 10313 .cpu_has_accelerated_tpr = report_flexpriority, 10314 .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, 10315 10316 .vcpu_create = vmx_create_vcpu, 10317 .vcpu_free = vmx_free_vcpu, 10318 .vcpu_reset = vmx_vcpu_reset, 10319 10320 .prepare_guest_switch = vmx_save_host_state, 10321 .vcpu_load = vmx_vcpu_load, 10322 .vcpu_put = vmx_vcpu_put, 10323 10324 .update_db_bp_intercept = update_exception_bitmap, 10325 .get_msr = vmx_get_msr, 10326 .set_msr = vmx_set_msr, 10327 .get_segment_base = vmx_get_segment_base, 10328 .get_segment = vmx_get_segment, 10329 .set_segment = vmx_set_segment, 10330 .get_cpl = vmx_get_cpl, 10331 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 10332 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 10333 .decache_cr3 = vmx_decache_cr3, 10334 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 10335 .set_cr0 = vmx_set_cr0, 10336 .set_cr3 = vmx_set_cr3, 10337 .set_cr4 = vmx_set_cr4, 10338 .set_efer = vmx_set_efer, 10339 .get_idt = vmx_get_idt, 10340 .set_idt = vmx_set_idt, 10341 .get_gdt = vmx_get_gdt, 10342 .set_gdt = vmx_set_gdt, 10343 .get_dr6 = vmx_get_dr6, 10344 .set_dr6 = vmx_set_dr6, 10345 .set_dr7 = vmx_set_dr7, 10346 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 10347 .cache_reg = vmx_cache_reg, 10348 .get_rflags = vmx_get_rflags, 10349 .set_rflags = vmx_set_rflags, 10350 .fpu_activate = vmx_fpu_activate, 10351 .fpu_deactivate = vmx_fpu_deactivate, 10352 10353 .tlb_flush = vmx_flush_tlb, 10354 10355 .run = vmx_vcpu_run, 10356 .handle_exit = vmx_handle_exit, 10357 .skip_emulated_instruction = skip_emulated_instruction, 10358 .set_interrupt_shadow = vmx_set_interrupt_shadow, 10359 .get_interrupt_shadow = vmx_get_interrupt_shadow, 10360 .patch_hypercall = vmx_patch_hypercall, 10361 .set_irq = vmx_inject_irq, 10362 .set_nmi = vmx_inject_nmi, 10363 .queue_exception = vmx_queue_exception, 10364 .cancel_injection = vmx_cancel_injection, 10365 .interrupt_allowed = vmx_interrupt_allowed, 10366 .nmi_allowed = vmx_nmi_allowed, 10367 .get_nmi_mask = vmx_get_nmi_mask, 10368 .set_nmi_mask = vmx_set_nmi_mask, 10369 .enable_nmi_window = enable_nmi_window, 10370 .enable_irq_window = enable_irq_window, 10371 .update_cr8_intercept = update_cr8_intercept, 10372 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, 10373 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 10374 .vm_has_apicv = vmx_vm_has_apicv, 10375 .load_eoi_exitmap = vmx_load_eoi_exitmap, 10376 .hwapic_irr_update = vmx_hwapic_irr_update, 10377 .hwapic_isr_update = vmx_hwapic_isr_update, 10378 .sync_pir_to_irr = vmx_sync_pir_to_irr, 10379 .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 10380 10381 .set_tss_addr = vmx_set_tss_addr, 10382 .get_tdp_level = get_ept_level, 10383 .get_mt_mask = vmx_get_mt_mask, 10384 10385 .get_exit_info = vmx_get_exit_info, 10386 10387 .get_lpage_level = vmx_get_lpage_level, 10388 10389 .cpuid_update = vmx_cpuid_update, 10390 10391 .rdtscp_supported = vmx_rdtscp_supported, 10392 .invpcid_supported = vmx_invpcid_supported, 10393 10394 .set_supported_cpuid = vmx_set_supported_cpuid, 10395 10396 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 10397 10398 .set_tsc_khz = vmx_set_tsc_khz, 10399 .read_tsc_offset = vmx_read_tsc_offset, 10400 .write_tsc_offset = vmx_write_tsc_offset, 10401 .adjust_tsc_offset = vmx_adjust_tsc_offset, 10402 .compute_tsc_offset = vmx_compute_tsc_offset, 10403 .read_l1_tsc = vmx_read_l1_tsc, 10404 10405 .set_tdp_cr3 = vmx_set_cr3, 10406 10407 .check_intercept = vmx_check_intercept, 10408 .handle_external_intr = vmx_handle_external_intr, 10409 .mpx_supported = vmx_mpx_supported, 10410 .xsaves_supported = vmx_xsaves_supported, 10411 10412 .check_nested_events = vmx_check_nested_events, 10413 10414 .sched_in = vmx_sched_in, 10415 10416 .slot_enable_log_dirty = vmx_slot_enable_log_dirty, 10417 .slot_disable_log_dirty = vmx_slot_disable_log_dirty, 10418 .flush_log_dirty = vmx_flush_log_dirty, 10419 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 10420 10421 .pmu_ops = &intel_pmu_ops, 10422}; 10423 10424static int __init vmx_init(void) 10425{ 10426 int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 10427 __alignof__(struct vcpu_vmx), THIS_MODULE); 10428 if (r) 10429 return r; 10430 10431#ifdef CONFIG_KEXEC 10432 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 10433 crash_vmclear_local_loaded_vmcss); 10434#endif 10435 10436 return 0; 10437} 10438 10439static void __exit vmx_exit(void) 10440{ 10441#ifdef CONFIG_KEXEC 10442 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 10443 synchronize_rcu(); 10444#endif 10445 10446 kvm_exit(); 10447} 10448 10449module_init(vmx_init) 10450module_exit(vmx_exit)