Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.5, 10968 lines, 317 kB
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"
#include "lapic.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/fpu/internal.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>

#include "trace.h"
#include "pmu.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);

static u64 __read_mostly host_xss;

static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
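/*
 * Usage sketch (illustrative note, not part of the VMX code itself): the
 * parameters above are ordinary kvm_intel module parameters, so they are
 * chosen at load time, e.g. "modprobe kvm_intel nested=1 ept=1", and the
 * S_IRUGO / 0444 permissions mean the effective values can be read back
 * from /sys/module/kvm_intel/parameters/<name> but not changed at runtime.
 */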
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL

#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON						\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)

#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates if PLE is enabled.
 *             According to test, this time is usually smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
#define KVM_VMX_DEFAULT_PLE_GAP           128
#define KVM_VMX_DEFAULT_PLE_WINDOW        4096
#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
		INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW

static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);

static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);

/* Default doubles per-vcpu window every exit. */
static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, int, S_IRUGO);

/* Default resets per-vcpu window every exit to ple_window. */
static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, int, S_IRUGO);

/* Default is to compute the maximum so we can never overflow. */
static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, int, S_IRUGO);
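/*
 * Worked example (illustrative, not from the original source): with the
 * defaults above, a vCPU that keeps triggering PAUSE-loop exits has its
 * per-vCPU window doubled on every such exit (ple_window_grow == 2), i.e.
 * 4096 -> 8192 -> 16384 -> ..., capped at ple_window_max; the cap defaults
 * to INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW precisely so that the
 * "window * grow" step can never overflow an int. Both ple_gap and
 * ple_window are measured in cycles of a counter running at the TSC rate,
 * as noted in the comment above.
 */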
extern const ulong vmx_return;

#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1

struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	int cpu;
	int launched;
	struct list_head loaded_vmcss_on_cpu_link;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 * If there are changes in this struct, VMCS12_REVISION must be changed.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	u32 revision_id;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 posted_intr_desc_addr;
	u64 ept_pointer;
	u64 eoi_exit_bitmap0;
	u64 eoi_exit_bitmap1;
	u64 eoi_exit_bitmap2;
	u64 eoi_exit_bitmap3;
	u64 xss_exit_bitmap;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 guest_bndcfgs;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 padding64[8]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 posted_intr_nv;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 guest_intr_status;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
};
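/*
 * Layout note (illustrative, not part of the original file): because the
 * struct is __packed and uses only fixed-width types (u16/u32/u64, with
 * natural_width aliased to u64), every field lands at the same offset on
 * 32-bit and 64-bit hosts, which is what makes the image kept in guest
 * memory safe to live-migrate. The first two fields mirror the header
 * hardware requires at the start of any VMCS region, so
 * offsetof(struct vmcs12, revision_id) is 0 and
 * offsetof(struct vmcs12, abort) is 4, and the whole structure has to keep
 * fitting inside the 4K VMCS12_SIZE region defined below.
 */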
/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
 * current implementation, 4K are reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000

/* Used to remember the last vmcs02 used for some recently used vmcs12s */
struct vmcs02_list {
	struct list_head list;
	gpa_t vmptr;
	struct loaded_vmcs vmcs02;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/* The host-usable pointer to the above */
	struct page *current_vmcs12_page;
	struct vmcs12 *current_vmcs12;
	struct vmcs *current_shadow_vmcs;
	/*
	 * Indicates if the shadow vmcs must be updated with the
	 * data held by vmcs12
	 */
	bool sync_shadow_vmcs;

	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
	struct list_head vmcs02_pool;
	int vmcs02_num;
	u64 vmcs01_tsc_offset;
	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;
	/*
	 * Guest pages referred to in vmcs02 with host-physical pointers, so
	 * we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct page *virtual_apic_page;
	struct page *pi_desc_page;
	struct pi_desc *pi_desc;
	bool pi_pending;
	u16 posted_intr_nv;
	u64 msr_ia32_feature_control;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;

	u16 vpid02;
	u16 last_vpid;

	u32 nested_vmx_procbased_ctls_low;
	u32 nested_vmx_procbased_ctls_high;
	u32 nested_vmx_true_procbased_ctls_low;
	u32 nested_vmx_secondary_ctls_low;
	u32 nested_vmx_secondary_ctls_high;
	u32 nested_vmx_pinbased_ctls_low;
	u32 nested_vmx_pinbased_ctls_high;
	u32 nested_vmx_exit_ctls_low;
	u32 nested_vmx_exit_ctls_high;
	u32 nested_vmx_true_exit_ctls_low;
	u32 nested_vmx_entry_ctls_low;
	u32 nested_vmx_entry_ctls_high;
	u32 nested_vmx_true_entry_ctls_low;
	u32 nested_vmx_misc_low;
	u32 nested_vmx_misc_high;
	u32 nested_vmx_ept_caps;
	u32 nested_vmx_vpid_caps;
};

#define POSTED_INTR_ON  0
#define POSTED_INTR_SN  1

/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];     /* Posted interrupt requested */
	union {
		struct {
				/* bit 256 - Outstanding Notification */
			u16	on	: 1,
				/* bit 257 - Suppress Notification */
				sn	: 1,
				/* bit 271:258 - Reserved */
				rsvd_1	: 14;
				/* bit 279:272 - Notification Vector */
			u8	nv;
				/* bit 287:280 - Reserved */
			u8	rsvd_2;
				/* bit 319:288 - Notification Destination */
			u32	ndst;
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);
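/*
 * Reading aid (illustrative, not from the original source): pir[8] is
 * 8 * 32 = 256 bits, one "posted interrupt requested" bit per vector
 * 0-255, and the union that follows overlays descriptor bits 256-319.
 * POSTED_INTR_ON (0) and POSTED_INTR_SN (1) are therefore bit positions
 * within the 64-bit "control" word, matching the ON (bit 256) and SN
 * (bit 257) fields of the bitfield view; the pi_* helpers below operate
 * on them with the generic test_and_set_bit()/test_bit() bitops. The
 * whole descriptor is 64 bytes (32 + 8 + 24) and 64-byte aligned.
 */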
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return
test_and_clear_bit(POSTED_INTR_ON, 489 (unsigned long *)&pi_desc->control); 490} 491 492static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) 493{ 494 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); 495} 496 497static inline void pi_clear_sn(struct pi_desc *pi_desc) 498{ 499 return clear_bit(POSTED_INTR_SN, 500 (unsigned long *)&pi_desc->control); 501} 502 503static inline void pi_set_sn(struct pi_desc *pi_desc) 504{ 505 return set_bit(POSTED_INTR_SN, 506 (unsigned long *)&pi_desc->control); 507} 508 509static inline int pi_test_on(struct pi_desc *pi_desc) 510{ 511 return test_bit(POSTED_INTR_ON, 512 (unsigned long *)&pi_desc->control); 513} 514 515static inline int pi_test_sn(struct pi_desc *pi_desc) 516{ 517 return test_bit(POSTED_INTR_SN, 518 (unsigned long *)&pi_desc->control); 519} 520 521struct vcpu_vmx { 522 struct kvm_vcpu vcpu; 523 unsigned long host_rsp; 524 u8 fail; 525 bool nmi_known_unmasked; 526 u32 exit_intr_info; 527 u32 idt_vectoring_info; 528 ulong rflags; 529 struct shared_msr_entry *guest_msrs; 530 int nmsrs; 531 int save_nmsrs; 532 unsigned long host_idt_base; 533#ifdef CONFIG_X86_64 534 u64 msr_host_kernel_gs_base; 535 u64 msr_guest_kernel_gs_base; 536#endif 537 u32 vm_entry_controls_shadow; 538 u32 vm_exit_controls_shadow; 539 /* 540 * loaded_vmcs points to the VMCS currently used in this vcpu. For a 541 * non-nested (L1) guest, it always points to vmcs01. For a nested 542 * guest (L2), it points to a different VMCS. 543 */ 544 struct loaded_vmcs vmcs01; 545 struct loaded_vmcs *loaded_vmcs; 546 bool __launched; /* temporary, used in vmx_vcpu_run */ 547 struct msr_autoload { 548 unsigned nr; 549 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; 550 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; 551 } msr_autoload; 552 struct { 553 int loaded; 554 u16 fs_sel, gs_sel, ldt_sel; 555#ifdef CONFIG_X86_64 556 u16 ds_sel, es_sel; 557#endif 558 int gs_ldt_reload_needed; 559 int fs_reload_needed; 560 u64 msr_host_bndcfgs; 561 unsigned long vmcs_host_cr4; /* May not match real cr4 */ 562 } host_state; 563 struct { 564 int vm86_active; 565 ulong save_rflags; 566 struct kvm_segment segs[8]; 567 } rmode; 568 struct { 569 u32 bitmask; /* 4 bits per segment (1 bit per field) */ 570 struct kvm_save_segment { 571 u16 selector; 572 unsigned long base; 573 u32 limit; 574 u32 ar; 575 } seg[8]; 576 } segment_cache; 577 int vpid; 578 bool emulation_required; 579 580 /* Support for vnmi-less CPUs */ 581 int soft_vnmi_blocked; 582 ktime_t entry_time; 583 s64 vnmi_blocked_time; 584 u32 exit_reason; 585 586 /* Posted interrupt descriptor */ 587 struct pi_desc pi_desc; 588 589 /* Support for a guest hypervisor (nested VMX) */ 590 struct nested_vmx nested; 591 592 /* Dynamic PLE window. 
*/ 593 int ple_window; 594 bool ple_window_dirty; 595 596 /* Support for PML */ 597#define PML_ENTITY_NUM 512 598 struct page *pml_pg; 599 600 u64 current_tsc_ratio; 601}; 602 603enum segment_cache_field { 604 SEG_FIELD_SEL = 0, 605 SEG_FIELD_BASE = 1, 606 SEG_FIELD_LIMIT = 2, 607 SEG_FIELD_AR = 3, 608 609 SEG_FIELD_NR = 4 610}; 611 612static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 613{ 614 return container_of(vcpu, struct vcpu_vmx, vcpu); 615} 616 617static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) 618{ 619 return &(to_vmx(vcpu)->pi_desc); 620} 621 622#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) 623#define FIELD(number, name) [number] = VMCS12_OFFSET(name) 624#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 625 [number##_HIGH] = VMCS12_OFFSET(name)+4 626 627 628static unsigned long shadow_read_only_fields[] = { 629 /* 630 * We do NOT shadow fields that are modified when L0 631 * traps and emulates any vmx instruction (e.g. VMPTRLD, 632 * VMXON...) executed by L1. 633 * For example, VM_INSTRUCTION_ERROR is read 634 * by L1 if a vmx instruction fails (part of the error path). 635 * Note the code assumes this logic. If for some reason 636 * we start shadowing these fields then we need to 637 * force a shadow sync when L0 emulates vmx instructions 638 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified 639 * by nested_vmx_failValid) 640 */ 641 VM_EXIT_REASON, 642 VM_EXIT_INTR_INFO, 643 VM_EXIT_INSTRUCTION_LEN, 644 IDT_VECTORING_INFO_FIELD, 645 IDT_VECTORING_ERROR_CODE, 646 VM_EXIT_INTR_ERROR_CODE, 647 EXIT_QUALIFICATION, 648 GUEST_LINEAR_ADDRESS, 649 GUEST_PHYSICAL_ADDRESS 650}; 651static int max_shadow_read_only_fields = 652 ARRAY_SIZE(shadow_read_only_fields); 653 654static unsigned long shadow_read_write_fields[] = { 655 TPR_THRESHOLD, 656 GUEST_RIP, 657 GUEST_RSP, 658 GUEST_CR0, 659 GUEST_CR3, 660 GUEST_CR4, 661 GUEST_INTERRUPTIBILITY_INFO, 662 GUEST_RFLAGS, 663 GUEST_CS_SELECTOR, 664 GUEST_CS_AR_BYTES, 665 GUEST_CS_LIMIT, 666 GUEST_CS_BASE, 667 GUEST_ES_BASE, 668 GUEST_BNDCFGS, 669 CR0_GUEST_HOST_MASK, 670 CR0_READ_SHADOW, 671 CR4_READ_SHADOW, 672 TSC_OFFSET, 673 EXCEPTION_BITMAP, 674 CPU_BASED_VM_EXEC_CONTROL, 675 VM_ENTRY_EXCEPTION_ERROR_CODE, 676 VM_ENTRY_INTR_INFO_FIELD, 677 VM_ENTRY_INSTRUCTION_LEN, 678 VM_ENTRY_EXCEPTION_ERROR_CODE, 679 HOST_FS_BASE, 680 HOST_GS_BASE, 681 HOST_FS_SELECTOR, 682 HOST_GS_SELECTOR 683}; 684static int max_shadow_read_write_fields = 685 ARRAY_SIZE(shadow_read_write_fields); 686 687static const unsigned short vmcs_field_to_offset_table[] = { 688 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 689 FIELD(POSTED_INTR_NV, posted_intr_nv), 690 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 691 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 692 FIELD(GUEST_SS_SELECTOR, guest_ss_selector), 693 FIELD(GUEST_DS_SELECTOR, guest_ds_selector), 694 FIELD(GUEST_FS_SELECTOR, guest_fs_selector), 695 FIELD(GUEST_GS_SELECTOR, guest_gs_selector), 696 FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), 697 FIELD(GUEST_TR_SELECTOR, guest_tr_selector), 698 FIELD(GUEST_INTR_STATUS, guest_intr_status), 699 FIELD(HOST_ES_SELECTOR, host_es_selector), 700 FIELD(HOST_CS_SELECTOR, host_cs_selector), 701 FIELD(HOST_SS_SELECTOR, host_ss_selector), 702 FIELD(HOST_DS_SELECTOR, host_ds_selector), 703 FIELD(HOST_FS_SELECTOR, host_fs_selector), 704 FIELD(HOST_GS_SELECTOR, host_gs_selector), 705 FIELD(HOST_TR_SELECTOR, host_tr_selector), 706 FIELD64(IO_BITMAP_A, io_bitmap_a), 707 FIELD64(IO_BITMAP_B, io_bitmap_b), 708 FIELD64(MSR_BITMAP, 
msr_bitmap), 709 FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), 710 FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), 711 FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), 712 FIELD64(TSC_OFFSET, tsc_offset), 713 FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), 714 FIELD64(APIC_ACCESS_ADDR, apic_access_addr), 715 FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), 716 FIELD64(EPT_POINTER, ept_pointer), 717 FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), 718 FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), 719 FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), 720 FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), 721 FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), 722 FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), 723 FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), 724 FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), 725 FIELD64(GUEST_IA32_PAT, guest_ia32_pat), 726 FIELD64(GUEST_IA32_EFER, guest_ia32_efer), 727 FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), 728 FIELD64(GUEST_PDPTR0, guest_pdptr0), 729 FIELD64(GUEST_PDPTR1, guest_pdptr1), 730 FIELD64(GUEST_PDPTR2, guest_pdptr2), 731 FIELD64(GUEST_PDPTR3, guest_pdptr3), 732 FIELD64(GUEST_BNDCFGS, guest_bndcfgs), 733 FIELD64(HOST_IA32_PAT, host_ia32_pat), 734 FIELD64(HOST_IA32_EFER, host_ia32_efer), 735 FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), 736 FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), 737 FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), 738 FIELD(EXCEPTION_BITMAP, exception_bitmap), 739 FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), 740 FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), 741 FIELD(CR3_TARGET_COUNT, cr3_target_count), 742 FIELD(VM_EXIT_CONTROLS, vm_exit_controls), 743 FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), 744 FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), 745 FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), 746 FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), 747 FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), 748 FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), 749 FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), 750 FIELD(TPR_THRESHOLD, tpr_threshold), 751 FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), 752 FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), 753 FIELD(VM_EXIT_REASON, vm_exit_reason), 754 FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), 755 FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), 756 FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), 757 FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), 758 FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), 759 FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), 760 FIELD(GUEST_ES_LIMIT, guest_es_limit), 761 FIELD(GUEST_CS_LIMIT, guest_cs_limit), 762 FIELD(GUEST_SS_LIMIT, guest_ss_limit), 763 FIELD(GUEST_DS_LIMIT, guest_ds_limit), 764 FIELD(GUEST_FS_LIMIT, guest_fs_limit), 765 FIELD(GUEST_GS_LIMIT, guest_gs_limit), 766 FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), 767 FIELD(GUEST_TR_LIMIT, guest_tr_limit), 768 FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), 769 FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), 770 FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), 771 FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), 772 FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), 773 FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), 774 FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), 775 FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), 776 FIELD(GUEST_LDTR_AR_BYTES, 
guest_ldtr_ar_bytes), 777 FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), 778 FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), 779 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), 780 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), 781 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), 782 FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), 783 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), 784 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), 785 FIELD(CR0_READ_SHADOW, cr0_read_shadow), 786 FIELD(CR4_READ_SHADOW, cr4_read_shadow), 787 FIELD(CR3_TARGET_VALUE0, cr3_target_value0), 788 FIELD(CR3_TARGET_VALUE1, cr3_target_value1), 789 FIELD(CR3_TARGET_VALUE2, cr3_target_value2), 790 FIELD(CR3_TARGET_VALUE3, cr3_target_value3), 791 FIELD(EXIT_QUALIFICATION, exit_qualification), 792 FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), 793 FIELD(GUEST_CR0, guest_cr0), 794 FIELD(GUEST_CR3, guest_cr3), 795 FIELD(GUEST_CR4, guest_cr4), 796 FIELD(GUEST_ES_BASE, guest_es_base), 797 FIELD(GUEST_CS_BASE, guest_cs_base), 798 FIELD(GUEST_SS_BASE, guest_ss_base), 799 FIELD(GUEST_DS_BASE, guest_ds_base), 800 FIELD(GUEST_FS_BASE, guest_fs_base), 801 FIELD(GUEST_GS_BASE, guest_gs_base), 802 FIELD(GUEST_LDTR_BASE, guest_ldtr_base), 803 FIELD(GUEST_TR_BASE, guest_tr_base), 804 FIELD(GUEST_GDTR_BASE, guest_gdtr_base), 805 FIELD(GUEST_IDTR_BASE, guest_idtr_base), 806 FIELD(GUEST_DR7, guest_dr7), 807 FIELD(GUEST_RSP, guest_rsp), 808 FIELD(GUEST_RIP, guest_rip), 809 FIELD(GUEST_RFLAGS, guest_rflags), 810 FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), 811 FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), 812 FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), 813 FIELD(HOST_CR0, host_cr0), 814 FIELD(HOST_CR3, host_cr3), 815 FIELD(HOST_CR4, host_cr4), 816 FIELD(HOST_FS_BASE, host_fs_base), 817 FIELD(HOST_GS_BASE, host_gs_base), 818 FIELD(HOST_TR_BASE, host_tr_base), 819 FIELD(HOST_GDTR_BASE, host_gdtr_base), 820 FIELD(HOST_IDTR_BASE, host_idtr_base), 821 FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), 822 FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), 823 FIELD(HOST_RSP, host_rsp), 824 FIELD(HOST_RIP, host_rip), 825}; 826 827static inline short vmcs_field_to_offset(unsigned long field) 828{ 829 BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); 830 831 if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || 832 vmcs_field_to_offset_table[field] == 0) 833 return -ENOENT; 834 835 return vmcs_field_to_offset_table[field]; 836} 837 838static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) 839{ 840 return to_vmx(vcpu)->nested.current_vmcs12; 841} 842 843static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) 844{ 845 struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); 846 if (is_error_page(page)) 847 return NULL; 848 849 return page; 850} 851 852static void nested_release_page(struct page *page) 853{ 854 kvm_release_page_dirty(page); 855} 856 857static void nested_release_page_clean(struct page *page) 858{ 859 kvm_release_page_clean(page); 860} 861 862static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); 863static u64 construct_eptp(unsigned long root_hpa); 864static void kvm_cpu_vmxon(u64 addr); 865static void kvm_cpu_vmxoff(void); 866static bool vmx_mpx_supported(void); 867static bool vmx_xsaves_supported(void); 868static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 869static void vmx_set_segment(struct kvm_vcpu *vcpu, 870 struct kvm_segment *var, int seg); 871static void vmx_get_segment(struct 
kvm_vcpu *vcpu, 872 struct kvm_segment *var, int seg); 873static bool guest_state_valid(struct kvm_vcpu *vcpu); 874static u32 vmx_segment_access_rights(struct kvm_segment *var); 875static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); 876static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); 877static int alloc_identity_pagetable(struct kvm *kvm); 878 879static DEFINE_PER_CPU(struct vmcs *, vmxarea); 880static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 881/* 882 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 883 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 884 */ 885static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 886static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 887 888/* 889 * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we 890 * can find which vCPU should be waken up. 891 */ 892static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); 893static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); 894 895static unsigned long *vmx_io_bitmap_a; 896static unsigned long *vmx_io_bitmap_b; 897static unsigned long *vmx_msr_bitmap_legacy; 898static unsigned long *vmx_msr_bitmap_longmode; 899static unsigned long *vmx_msr_bitmap_legacy_x2apic; 900static unsigned long *vmx_msr_bitmap_longmode_x2apic; 901static unsigned long *vmx_msr_bitmap_nested; 902static unsigned long *vmx_vmread_bitmap; 903static unsigned long *vmx_vmwrite_bitmap; 904 905static bool cpu_has_load_ia32_efer; 906static bool cpu_has_load_perf_global_ctrl; 907 908static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 909static DEFINE_SPINLOCK(vmx_vpid_lock); 910 911static struct vmcs_config { 912 int size; 913 int order; 914 u32 revision_id; 915 u32 pin_based_exec_ctrl; 916 u32 cpu_based_exec_ctrl; 917 u32 cpu_based_2nd_exec_ctrl; 918 u32 vmexit_ctrl; 919 u32 vmentry_ctrl; 920} vmcs_config; 921 922static struct vmx_capability { 923 u32 ept; 924 u32 vpid; 925} vmx_capability; 926 927#define VMX_SEGMENT_FIELD(seg) \ 928 [VCPU_SREG_##seg] = { \ 929 .selector = GUEST_##seg##_SELECTOR, \ 930 .base = GUEST_##seg##_BASE, \ 931 .limit = GUEST_##seg##_LIMIT, \ 932 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 933 } 934 935static const struct kvm_vmx_segment_field { 936 unsigned selector; 937 unsigned base; 938 unsigned limit; 939 unsigned ar_bytes; 940} kvm_vmx_segment_fields[] = { 941 VMX_SEGMENT_FIELD(CS), 942 VMX_SEGMENT_FIELD(DS), 943 VMX_SEGMENT_FIELD(ES), 944 VMX_SEGMENT_FIELD(FS), 945 VMX_SEGMENT_FIELD(GS), 946 VMX_SEGMENT_FIELD(SS), 947 VMX_SEGMENT_FIELD(TR), 948 VMX_SEGMENT_FIELD(LDTR), 949}; 950 951static u64 host_efer; 952 953static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 954 955/* 956 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it 957 * away by decrementing the array size. 
958 */ 959static const u32 vmx_msr_index[] = { 960#ifdef CONFIG_X86_64 961 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 962#endif 963 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 964}; 965 966static inline bool is_page_fault(u32 intr_info) 967{ 968 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 969 INTR_INFO_VALID_MASK)) == 970 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 971} 972 973static inline bool is_no_device(u32 intr_info) 974{ 975 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 976 INTR_INFO_VALID_MASK)) == 977 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 978} 979 980static inline bool is_invalid_opcode(u32 intr_info) 981{ 982 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 983 INTR_INFO_VALID_MASK)) == 984 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 985} 986 987static inline bool is_external_interrupt(u32 intr_info) 988{ 989 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 990 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 991} 992 993static inline bool is_machine_check(u32 intr_info) 994{ 995 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 996 INTR_INFO_VALID_MASK)) == 997 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); 998} 999 1000static inline bool cpu_has_vmx_msr_bitmap(void) 1001{ 1002 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; 1003} 1004 1005static inline bool cpu_has_vmx_tpr_shadow(void) 1006{ 1007 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 1008} 1009 1010static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) 1011{ 1012 return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); 1013} 1014 1015static inline bool cpu_has_secondary_exec_ctrls(void) 1016{ 1017 return vmcs_config.cpu_based_exec_ctrl & 1018 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1019} 1020 1021static inline bool cpu_has_vmx_virtualize_apic_accesses(void) 1022{ 1023 return vmcs_config.cpu_based_2nd_exec_ctrl & 1024 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 1025} 1026 1027static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) 1028{ 1029 return vmcs_config.cpu_based_2nd_exec_ctrl & 1030 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 1031} 1032 1033static inline bool cpu_has_vmx_apic_register_virt(void) 1034{ 1035 return vmcs_config.cpu_based_2nd_exec_ctrl & 1036 SECONDARY_EXEC_APIC_REGISTER_VIRT; 1037} 1038 1039static inline bool cpu_has_vmx_virtual_intr_delivery(void) 1040{ 1041 return vmcs_config.cpu_based_2nd_exec_ctrl & 1042 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 1043} 1044 1045static inline bool cpu_has_vmx_posted_intr(void) 1046{ 1047 return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && 1048 vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; 1049} 1050 1051static inline bool cpu_has_vmx_apicv(void) 1052{ 1053 return cpu_has_vmx_apic_register_virt() && 1054 cpu_has_vmx_virtual_intr_delivery() && 1055 cpu_has_vmx_posted_intr(); 1056} 1057 1058static inline bool cpu_has_vmx_flexpriority(void) 1059{ 1060 return cpu_has_vmx_tpr_shadow() && 1061 cpu_has_vmx_virtualize_apic_accesses(); 1062} 1063 1064static inline bool cpu_has_vmx_ept_execute_only(void) 1065{ 1066 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; 1067} 1068 1069static inline bool cpu_has_vmx_ept_2m_page(void) 1070{ 1071 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; 1072} 1073 1074static inline bool cpu_has_vmx_ept_1g_page(void) 1075{ 1076 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 1077} 1078 1079static inline bool 
cpu_has_vmx_ept_4levels(void) 1080{ 1081 return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 1082} 1083 1084static inline bool cpu_has_vmx_ept_ad_bits(void) 1085{ 1086 return vmx_capability.ept & VMX_EPT_AD_BIT; 1087} 1088 1089static inline bool cpu_has_vmx_invept_context(void) 1090{ 1091 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; 1092} 1093 1094static inline bool cpu_has_vmx_invept_global(void) 1095{ 1096 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 1097} 1098 1099static inline bool cpu_has_vmx_invvpid_single(void) 1100{ 1101 return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; 1102} 1103 1104static inline bool cpu_has_vmx_invvpid_global(void) 1105{ 1106 return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; 1107} 1108 1109static inline bool cpu_has_vmx_ept(void) 1110{ 1111 return vmcs_config.cpu_based_2nd_exec_ctrl & 1112 SECONDARY_EXEC_ENABLE_EPT; 1113} 1114 1115static inline bool cpu_has_vmx_unrestricted_guest(void) 1116{ 1117 return vmcs_config.cpu_based_2nd_exec_ctrl & 1118 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1119} 1120 1121static inline bool cpu_has_vmx_ple(void) 1122{ 1123 return vmcs_config.cpu_based_2nd_exec_ctrl & 1124 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 1125} 1126 1127static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 1128{ 1129 return flexpriority_enabled && lapic_in_kernel(vcpu); 1130} 1131 1132static inline bool cpu_has_vmx_vpid(void) 1133{ 1134 return vmcs_config.cpu_based_2nd_exec_ctrl & 1135 SECONDARY_EXEC_ENABLE_VPID; 1136} 1137 1138static inline bool cpu_has_vmx_rdtscp(void) 1139{ 1140 return vmcs_config.cpu_based_2nd_exec_ctrl & 1141 SECONDARY_EXEC_RDTSCP; 1142} 1143 1144static inline bool cpu_has_vmx_invpcid(void) 1145{ 1146 return vmcs_config.cpu_based_2nd_exec_ctrl & 1147 SECONDARY_EXEC_ENABLE_INVPCID; 1148} 1149 1150static inline bool cpu_has_virtual_nmis(void) 1151{ 1152 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 1153} 1154 1155static inline bool cpu_has_vmx_wbinvd_exit(void) 1156{ 1157 return vmcs_config.cpu_based_2nd_exec_ctrl & 1158 SECONDARY_EXEC_WBINVD_EXITING; 1159} 1160 1161static inline bool cpu_has_vmx_shadow_vmcs(void) 1162{ 1163 u64 vmx_msr; 1164 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); 1165 /* check if the cpu supports writing r/o exit information fields */ 1166 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) 1167 return false; 1168 1169 return vmcs_config.cpu_based_2nd_exec_ctrl & 1170 SECONDARY_EXEC_SHADOW_VMCS; 1171} 1172 1173static inline bool cpu_has_vmx_pml(void) 1174{ 1175 return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; 1176} 1177 1178static inline bool cpu_has_vmx_tsc_scaling(void) 1179{ 1180 return vmcs_config.cpu_based_2nd_exec_ctrl & 1181 SECONDARY_EXEC_TSC_SCALING; 1182} 1183 1184static inline bool report_flexpriority(void) 1185{ 1186 return flexpriority_enabled; 1187} 1188 1189static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) 1190{ 1191 return vmcs12->cpu_based_vm_exec_control & bit; 1192} 1193 1194static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) 1195{ 1196 return (vmcs12->cpu_based_vm_exec_control & 1197 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 1198 (vmcs12->secondary_vm_exec_control & bit); 1199} 1200 1201static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) 1202{ 1203 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; 1204} 1205 1206static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) 1207{ 1208 return 
vmcs12->pin_based_vm_exec_control & 1209 PIN_BASED_VMX_PREEMPTION_TIMER; 1210} 1211 1212static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) 1213{ 1214 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); 1215} 1216 1217static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) 1218{ 1219 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && 1220 vmx_xsaves_supported(); 1221} 1222 1223static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) 1224{ 1225 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 1226} 1227 1228static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) 1229{ 1230 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); 1231} 1232 1233static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) 1234{ 1235 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); 1236} 1237 1238static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) 1239{ 1240 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 1241} 1242 1243static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) 1244{ 1245 return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; 1246} 1247 1248static inline bool is_exception(u32 intr_info) 1249{ 1250 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1251 == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); 1252} 1253 1254static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 1255 u32 exit_intr_info, 1256 unsigned long exit_qualification); 1257static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, 1258 struct vmcs12 *vmcs12, 1259 u32 reason, unsigned long qualification); 1260 1261static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 1262{ 1263 int i; 1264 1265 for (i = 0; i < vmx->nmsrs; ++i) 1266 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) 1267 return i; 1268 return -1; 1269} 1270 1271static inline void __invvpid(int ext, u16 vpid, gva_t gva) 1272{ 1273 struct { 1274 u64 vpid : 16; 1275 u64 rsvd : 48; 1276 u64 gva; 1277 } operand = { vpid, 0, gva }; 1278 1279 asm volatile (__ex(ASM_VMX_INVVPID) 1280 /* CF==1 or ZF==1 --> rc = -1 */ 1281 "; ja 1f ; ud2 ; 1:" 1282 : : "a"(&operand), "c"(ext) : "cc", "memory"); 1283} 1284 1285static inline void __invept(int ext, u64 eptp, gpa_t gpa) 1286{ 1287 struct { 1288 u64 eptp, gpa; 1289 } operand = {eptp, gpa}; 1290 1291 asm volatile (__ex(ASM_VMX_INVEPT) 1292 /* CF==1 or ZF==1 --> rc = -1 */ 1293 "; ja 1f ; ud2 ; 1:\n" 1294 : : "a" (&operand), "c" (ext) : "cc", "memory"); 1295} 1296 1297static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 1298{ 1299 int i; 1300 1301 i = __find_msr_index(vmx, msr); 1302 if (i >= 0) 1303 return &vmx->guest_msrs[i]; 1304 return NULL; 1305} 1306 1307static void vmcs_clear(struct vmcs *vmcs) 1308{ 1309 u64 phys_addr = __pa(vmcs); 1310 u8 error; 1311 1312 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 1313 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 1314 : "cc", "memory"); 1315 if (error) 1316 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 1317 vmcs, phys_addr); 1318} 1319 1320static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) 1321{ 1322 vmcs_clear(loaded_vmcs->vmcs); 1323 loaded_vmcs->cpu = -1; 1324 loaded_vmcs->launched = 0; 1325} 1326 1327static void vmcs_load(struct vmcs *vmcs) 1328{ 1329 u64 phys_addr = __pa(vmcs); 1330 u8 error; 1331 1332 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 1333 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 1334 : "cc", 
"memory"); 1335 if (error) 1336 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", 1337 vmcs, phys_addr); 1338} 1339 1340#ifdef CONFIG_KEXEC_CORE 1341/* 1342 * This bitmap is used to indicate whether the vmclear 1343 * operation is enabled on all cpus. All disabled by 1344 * default. 1345 */ 1346static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; 1347 1348static inline void crash_enable_local_vmclear(int cpu) 1349{ 1350 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); 1351} 1352 1353static inline void crash_disable_local_vmclear(int cpu) 1354{ 1355 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); 1356} 1357 1358static inline int crash_local_vmclear_enabled(int cpu) 1359{ 1360 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); 1361} 1362 1363static void crash_vmclear_local_loaded_vmcss(void) 1364{ 1365 int cpu = raw_smp_processor_id(); 1366 struct loaded_vmcs *v; 1367 1368 if (!crash_local_vmclear_enabled(cpu)) 1369 return; 1370 1371 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 1372 loaded_vmcss_on_cpu_link) 1373 vmcs_clear(v->vmcs); 1374} 1375#else 1376static inline void crash_enable_local_vmclear(int cpu) { } 1377static inline void crash_disable_local_vmclear(int cpu) { } 1378#endif /* CONFIG_KEXEC_CORE */ 1379 1380static void __loaded_vmcs_clear(void *arg) 1381{ 1382 struct loaded_vmcs *loaded_vmcs = arg; 1383 int cpu = raw_smp_processor_id(); 1384 1385 if (loaded_vmcs->cpu != cpu) 1386 return; /* vcpu migration can race with cpu offline */ 1387 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 1388 per_cpu(current_vmcs, cpu) = NULL; 1389 crash_disable_local_vmclear(cpu); 1390 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 1391 1392 /* 1393 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link 1394 * is before setting loaded_vmcs->vcpu to -1 which is done in 1395 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist 1396 * then adds the vmcs into percpu list before it is deleted. 
1397 */ 1398 smp_wmb(); 1399 1400 loaded_vmcs_init(loaded_vmcs); 1401 crash_enable_local_vmclear(cpu); 1402} 1403 1404static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 1405{ 1406 int cpu = loaded_vmcs->cpu; 1407 1408 if (cpu != -1) 1409 smp_call_function_single(cpu, 1410 __loaded_vmcs_clear, loaded_vmcs, 1); 1411} 1412 1413static inline void vpid_sync_vcpu_single(int vpid) 1414{ 1415 if (vpid == 0) 1416 return; 1417 1418 if (cpu_has_vmx_invvpid_single()) 1419 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); 1420} 1421 1422static inline void vpid_sync_vcpu_global(void) 1423{ 1424 if (cpu_has_vmx_invvpid_global()) 1425 __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); 1426} 1427 1428static inline void vpid_sync_context(int vpid) 1429{ 1430 if (cpu_has_vmx_invvpid_single()) 1431 vpid_sync_vcpu_single(vpid); 1432 else 1433 vpid_sync_vcpu_global(); 1434} 1435 1436static inline void ept_sync_global(void) 1437{ 1438 if (cpu_has_vmx_invept_global()) 1439 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); 1440} 1441 1442static inline void ept_sync_context(u64 eptp) 1443{ 1444 if (enable_ept) { 1445 if (cpu_has_vmx_invept_context()) 1446 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 1447 else 1448 ept_sync_global(); 1449 } 1450} 1451 1452static __always_inline void vmcs_check16(unsigned long field) 1453{ 1454 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, 1455 "16-bit accessor invalid for 64-bit field"); 1456 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, 1457 "16-bit accessor invalid for 64-bit high field"); 1458 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, 1459 "16-bit accessor invalid for 32-bit high field"); 1460 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, 1461 "16-bit accessor invalid for natural width field"); 1462} 1463 1464static __always_inline void vmcs_check32(unsigned long field) 1465{ 1466 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, 1467 "32-bit accessor invalid for 16-bit field"); 1468 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, 1469 "32-bit accessor invalid for natural width field"); 1470} 1471 1472static __always_inline void vmcs_check64(unsigned long field) 1473{ 1474 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, 1475 "64-bit accessor invalid for 16-bit field"); 1476 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, 1477 "64-bit accessor invalid for 64-bit high field"); 1478 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, 1479 "64-bit accessor invalid for 32-bit field"); 1480 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, 1481 "64-bit accessor invalid for natural width field"); 1482} 1483 1484static __always_inline void vmcs_checkl(unsigned long field) 1485{ 1486 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, 1487 "Natural width accessor invalid for 16-bit field"); 1488 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, 1489 "Natural width accessor invalid for 64-bit field"); 1490 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, 1491 "Natural width accessor invalid for 64-bit high field"); 1492 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, 1493 "Natural width accessor invalid for 32-bit field"); 1494} 1495 1496static __always_inline unsigned long __vmcs_readl(unsigned 
long field) 1497{ 1498 unsigned long value; 1499 1500 asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") 1501 : "=a"(value) : "d"(field) : "cc"); 1502 return value; 1503} 1504 1505static __always_inline u16 vmcs_read16(unsigned long field) 1506{ 1507 vmcs_check16(field); 1508 return __vmcs_readl(field); 1509} 1510 1511static __always_inline u32 vmcs_read32(unsigned long field) 1512{ 1513 vmcs_check32(field); 1514 return __vmcs_readl(field); 1515} 1516 1517static __always_inline u64 vmcs_read64(unsigned long field) 1518{ 1519 vmcs_check64(field); 1520#ifdef CONFIG_X86_64 1521 return __vmcs_readl(field); 1522#else 1523 return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); 1524#endif 1525} 1526 1527static __always_inline unsigned long vmcs_readl(unsigned long field) 1528{ 1529 vmcs_checkl(field); 1530 return __vmcs_readl(field); 1531} 1532 1533static noinline void vmwrite_error(unsigned long field, unsigned long value) 1534{ 1535 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", 1536 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 1537 dump_stack(); 1538} 1539 1540static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) 1541{ 1542 u8 error; 1543 1544 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" 1545 : "=q"(error) : "a"(value), "d"(field) : "cc"); 1546 if (unlikely(error)) 1547 vmwrite_error(field, value); 1548} 1549 1550static __always_inline void vmcs_write16(unsigned long field, u16 value) 1551{ 1552 vmcs_check16(field); 1553 __vmcs_writel(field, value); 1554} 1555 1556static __always_inline void vmcs_write32(unsigned long field, u32 value) 1557{ 1558 vmcs_check32(field); 1559 __vmcs_writel(field, value); 1560} 1561 1562static __always_inline void vmcs_write64(unsigned long field, u64 value) 1563{ 1564 vmcs_check64(field); 1565 __vmcs_writel(field, value); 1566#ifndef CONFIG_X86_64 1567 asm volatile (""); 1568 __vmcs_writel(field+1, value >> 32); 1569#endif 1570} 1571 1572static __always_inline void vmcs_writel(unsigned long field, unsigned long value) 1573{ 1574 vmcs_checkl(field); 1575 __vmcs_writel(field, value); 1576} 1577 1578static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) 1579{ 1580 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, 1581 "vmcs_clear_bits does not support 64-bit fields"); 1582 __vmcs_writel(field, __vmcs_readl(field) & ~mask); 1583} 1584 1585static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) 1586{ 1587 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, 1588 "vmcs_set_bits does not support 64-bit fields"); 1589 __vmcs_writel(field, __vmcs_readl(field) | mask); 1590} 1591 1592static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) 1593{ 1594 vmcs_write32(VM_ENTRY_CONTROLS, val); 1595 vmx->vm_entry_controls_shadow = val; 1596} 1597 1598static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) 1599{ 1600 if (vmx->vm_entry_controls_shadow != val) 1601 vm_entry_controls_init(vmx, val); 1602} 1603 1604static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) 1605{ 1606 return vmx->vm_entry_controls_shadow; 1607} 1608 1609 1610static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) 1611{ 1612 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); 1613} 1614 1615static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) 1616{ 1617 vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); 1618} 1619 1620static inline void 
vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) 1621{ 1622 vmcs_write32(VM_EXIT_CONTROLS, val); 1623 vmx->vm_exit_controls_shadow = val; 1624} 1625 1626static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) 1627{ 1628 if (vmx->vm_exit_controls_shadow != val) 1629 vm_exit_controls_init(vmx, val); 1630} 1631 1632static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) 1633{ 1634 return vmx->vm_exit_controls_shadow; 1635} 1636 1637 1638static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) 1639{ 1640 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); 1641} 1642 1643static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) 1644{ 1645 vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); 1646} 1647 1648static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 1649{ 1650 vmx->segment_cache.bitmask = 0; 1651} 1652 1653static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 1654 unsigned field) 1655{ 1656 bool ret; 1657 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 1658 1659 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { 1660 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); 1661 vmx->segment_cache.bitmask = 0; 1662 } 1663 ret = vmx->segment_cache.bitmask & mask; 1664 vmx->segment_cache.bitmask |= mask; 1665 return ret; 1666} 1667 1668static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 1669{ 1670 u16 *p = &vmx->segment_cache.seg[seg].selector; 1671 1672 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 1673 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 1674 return *p; 1675} 1676 1677static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 1678{ 1679 ulong *p = &vmx->segment_cache.seg[seg].base; 1680 1681 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 1682 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 1683 return *p; 1684} 1685 1686static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 1687{ 1688 u32 *p = &vmx->segment_cache.seg[seg].limit; 1689 1690 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 1691 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 1692 return *p; 1693} 1694 1695static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 1696{ 1697 u32 *p = &vmx->segment_cache.seg[seg].ar; 1698 1699 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 1700 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 1701 return *p; 1702} 1703 1704static void update_exception_bitmap(struct kvm_vcpu *vcpu) 1705{ 1706 u32 eb; 1707 1708 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 1709 (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); 1710 if ((vcpu->guest_debug & 1711 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 1712 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 1713 eb |= 1u << BP_VECTOR; 1714 if (to_vmx(vcpu)->rmode.vm86_active) 1715 eb = ~0; 1716 if (enable_ept) 1717 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1718 if (vcpu->fpu_active) 1719 eb &= ~(1u << NM_VECTOR); 1720 1721 /* When we are running a nested L2 guest and L1 specified for it a 1722 * certain exception bitmap, we must trap the same exceptions and pass 1723 * them to L1. When running L2, we will only handle the exceptions 1724 * specified above if L1 did not want them. 
1725 */ 1726 if (is_guest_mode(vcpu)) 1727 eb |= get_vmcs12(vcpu)->exception_bitmap; 1728 1729 vmcs_write32(EXCEPTION_BITMAP, eb); 1730} 1731 1732static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1733 unsigned long entry, unsigned long exit) 1734{ 1735 vm_entry_controls_clearbit(vmx, entry); 1736 vm_exit_controls_clearbit(vmx, exit); 1737} 1738 1739static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1740{ 1741 unsigned i; 1742 struct msr_autoload *m = &vmx->msr_autoload; 1743 1744 switch (msr) { 1745 case MSR_EFER: 1746 if (cpu_has_load_ia32_efer) { 1747 clear_atomic_switch_msr_special(vmx, 1748 VM_ENTRY_LOAD_IA32_EFER, 1749 VM_EXIT_LOAD_IA32_EFER); 1750 return; 1751 } 1752 break; 1753 case MSR_CORE_PERF_GLOBAL_CTRL: 1754 if (cpu_has_load_perf_global_ctrl) { 1755 clear_atomic_switch_msr_special(vmx, 1756 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1757 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 1758 return; 1759 } 1760 break; 1761 } 1762 1763 for (i = 0; i < m->nr; ++i) 1764 if (m->guest[i].index == msr) 1765 break; 1766 1767 if (i == m->nr) 1768 return; 1769 --m->nr; 1770 m->guest[i] = m->guest[m->nr]; 1771 m->host[i] = m->host[m->nr]; 1772 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); 1773 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); 1774} 1775 1776static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1777 unsigned long entry, unsigned long exit, 1778 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1779 u64 guest_val, u64 host_val) 1780{ 1781 vmcs_write64(guest_val_vmcs, guest_val); 1782 vmcs_write64(host_val_vmcs, host_val); 1783 vm_entry_controls_setbit(vmx, entry); 1784 vm_exit_controls_setbit(vmx, exit); 1785} 1786 1787static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1788 u64 guest_val, u64 host_val) 1789{ 1790 unsigned i; 1791 struct msr_autoload *m = &vmx->msr_autoload; 1792 1793 switch (msr) { 1794 case MSR_EFER: 1795 if (cpu_has_load_ia32_efer) { 1796 add_atomic_switch_msr_special(vmx, 1797 VM_ENTRY_LOAD_IA32_EFER, 1798 VM_EXIT_LOAD_IA32_EFER, 1799 GUEST_IA32_EFER, 1800 HOST_IA32_EFER, 1801 guest_val, host_val); 1802 return; 1803 } 1804 break; 1805 case MSR_CORE_PERF_GLOBAL_CTRL: 1806 if (cpu_has_load_perf_global_ctrl) { 1807 add_atomic_switch_msr_special(vmx, 1808 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1809 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1810 GUEST_IA32_PERF_GLOBAL_CTRL, 1811 HOST_IA32_PERF_GLOBAL_CTRL, 1812 guest_val, host_val); 1813 return; 1814 } 1815 break; 1816 case MSR_IA32_PEBS_ENABLE: 1817 /* PEBS needs a quiescent period after being disabled (to write 1818 * a record). Disabling PEBS through VMX MSR swapping doesn't 1819 * provide that period, so a CPU could write host's record into 1820 * guest's memory. 1821 */ 1822 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 1823 } 1824 1825 for (i = 0; i < m->nr; ++i) 1826 if (m->guest[i].index == msr) 1827 break; 1828 1829 if (i == NR_AUTOLOAD_MSRS) { 1830 printk_once(KERN_WARNING "Not enough msr switch entries. " 1831 "Can't add msr %x\n", msr); 1832 return; 1833 } else if (i == m->nr) { 1834 ++m->nr; 1835 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); 1836 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); 1837 } 1838 1839 m->guest[i].index = msr; 1840 m->guest[i].value = guest_val; 1841 m->host[i].index = msr; 1842 m->host[i].value = host_val; 1843} 1844 1845static void reload_tss(void) 1846{ 1847 /* 1848 * VT restores TR but not its size. Useless. 
1849 */ 1850 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 1851 struct desc_struct *descs; 1852 1853 descs = (void *)gdt->address; 1854 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 1855 load_TR_desc(); 1856} 1857 1858static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) 1859{ 1860 u64 guest_efer = vmx->vcpu.arch.efer; 1861 u64 ignore_bits = 0; 1862 1863 if (!enable_ept) { 1864 /* 1865 * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing 1866 * host CPUID is more efficient than testing guest CPUID 1867 * or CR4. Host SMEP is anyway a requirement for guest SMEP. 1868 */ 1869 if (boot_cpu_has(X86_FEATURE_SMEP)) 1870 guest_efer |= EFER_NX; 1871 else if (!(guest_efer & EFER_NX)) 1872 ignore_bits |= EFER_NX; 1873 } 1874 1875 /* 1876 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1877 */ 1878 ignore_bits |= EFER_SCE; 1879#ifdef CONFIG_X86_64 1880 ignore_bits |= EFER_LMA | EFER_LME; 1881 /* SCE is meaningful only in long mode on Intel */ 1882 if (guest_efer & EFER_LMA) 1883 ignore_bits &= ~(u64)EFER_SCE; 1884#endif 1885 1886 clear_atomic_switch_msr(vmx, MSR_EFER); 1887 1888 /* 1889 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1890 * On CPUs that support "load IA32_EFER", always switch EFER 1891 * atomically, since it's faster than switching it manually. 1892 */ 1893 if (cpu_has_load_ia32_efer || 1894 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { 1895 if (!(guest_efer & EFER_LMA)) 1896 guest_efer &= ~EFER_LME; 1897 if (guest_efer != host_efer) 1898 add_atomic_switch_msr(vmx, MSR_EFER, 1899 guest_efer, host_efer); 1900 return false; 1901 } else { 1902 guest_efer &= ~ignore_bits; 1903 guest_efer |= host_efer & ignore_bits; 1904 1905 vmx->guest_msrs[efer_offset].data = guest_efer; 1906 vmx->guest_msrs[efer_offset].mask = ~ignore_bits; 1907 1908 return true; 1909 } 1910} 1911 1912static unsigned long segment_base(u16 selector) 1913{ 1914 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 1915 struct desc_struct *d; 1916 unsigned long table_base; 1917 unsigned long v; 1918 1919 if (!(selector & ~3)) 1920 return 0; 1921 1922 table_base = gdt->address; 1923 1924 if (selector & 4) { /* from ldt */ 1925 u16 ldt_selector = kvm_read_ldt(); 1926 1927 if (!(ldt_selector & ~3)) 1928 return 0; 1929 1930 table_base = segment_base(ldt_selector); 1931 } 1932 d = (struct desc_struct *)(table_base + (selector & ~7)); 1933 v = get_desc_base(d); 1934#ifdef CONFIG_X86_64 1935 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 1936 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 1937#endif 1938 return v; 1939} 1940 1941static inline unsigned long kvm_read_tr_base(void) 1942{ 1943 u16 tr; 1944 asm("str %0" : "=g"(tr)); 1945 return segment_base(tr); 1946} 1947 1948static void vmx_save_host_state(struct kvm_vcpu *vcpu) 1949{ 1950 struct vcpu_vmx *vmx = to_vmx(vcpu); 1951 int i; 1952 1953 if (vmx->host_state.loaded) 1954 return; 1955 1956 vmx->host_state.loaded = 1; 1957 /* 1958 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1959 * allow segment selectors with cpl > 0 or ti == 1. 
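 * Hence the "& 7" checks below: bits 1:0 of a selector are the RPL and
 * bit 2 is the TI flag, so e.g. 0x63 (GDT index 12, RPL 3) cannot be kept
 * in the VMCS host-state area and is written as 0 instead, to be reloaded
 * by hand in __vmx_load_host_state().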
1960 */ 1961 vmx->host_state.ldt_sel = kvm_read_ldt(); 1962 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 1963 savesegment(fs, vmx->host_state.fs_sel); 1964 if (!(vmx->host_state.fs_sel & 7)) { 1965 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 1966 vmx->host_state.fs_reload_needed = 0; 1967 } else { 1968 vmcs_write16(HOST_FS_SELECTOR, 0); 1969 vmx->host_state.fs_reload_needed = 1; 1970 } 1971 savesegment(gs, vmx->host_state.gs_sel); 1972 if (!(vmx->host_state.gs_sel & 7)) 1973 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 1974 else { 1975 vmcs_write16(HOST_GS_SELECTOR, 0); 1976 vmx->host_state.gs_ldt_reload_needed = 1; 1977 } 1978 1979#ifdef CONFIG_X86_64 1980 savesegment(ds, vmx->host_state.ds_sel); 1981 savesegment(es, vmx->host_state.es_sel); 1982#endif 1983 1984#ifdef CONFIG_X86_64 1985 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 1986 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); 1987#else 1988 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); 1989 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); 1990#endif 1991 1992#ifdef CONFIG_X86_64 1993 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1994 if (is_long_mode(&vmx->vcpu)) 1995 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1996#endif 1997 if (boot_cpu_has(X86_FEATURE_MPX)) 1998 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 1999 for (i = 0; i < vmx->save_nmsrs; ++i) 2000 kvm_set_shared_msr(vmx->guest_msrs[i].index, 2001 vmx->guest_msrs[i].data, 2002 vmx->guest_msrs[i].mask); 2003} 2004 2005static void __vmx_load_host_state(struct vcpu_vmx *vmx) 2006{ 2007 if (!vmx->host_state.loaded) 2008 return; 2009 2010 ++vmx->vcpu.stat.host_state_reload; 2011 vmx->host_state.loaded = 0; 2012#ifdef CONFIG_X86_64 2013 if (is_long_mode(&vmx->vcpu)) 2014 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2015#endif 2016 if (vmx->host_state.gs_ldt_reload_needed) { 2017 kvm_load_ldt(vmx->host_state.ldt_sel); 2018#ifdef CONFIG_X86_64 2019 load_gs_index(vmx->host_state.gs_sel); 2020#else 2021 loadsegment(gs, vmx->host_state.gs_sel); 2022#endif 2023 } 2024 if (vmx->host_state.fs_reload_needed) 2025 loadsegment(fs, vmx->host_state.fs_sel); 2026#ifdef CONFIG_X86_64 2027 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { 2028 loadsegment(ds, vmx->host_state.ds_sel); 2029 loadsegment(es, vmx->host_state.es_sel); 2030 } 2031#endif 2032 reload_tss(); 2033#ifdef CONFIG_X86_64 2034 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 2035#endif 2036 if (vmx->host_state.msr_host_bndcfgs) 2037 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 2038 /* 2039 * If the FPU is not active (through the host task or 2040 * the guest vcpu), then restore the cr0.TS bit. 2041 */ 2042 if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) 2043 stts(); 2044 load_gdt(this_cpu_ptr(&host_gdt)); 2045} 2046 2047static void vmx_load_host_state(struct vcpu_vmx *vmx) 2048{ 2049 preempt_disable(); 2050 __vmx_load_host_state(vmx); 2051 preempt_enable(); 2052} 2053 2054static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) 2055{ 2056 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 2057 struct pi_desc old, new; 2058 unsigned int dest; 2059 2060 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 2061 !irq_remapping_cap(IRQ_POSTING_CAP)) 2062 return; 2063 2064 do { 2065 old.control = new.control = pi_desc->control; 2066 2067 /* 2068 * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there 2069 * are two possible cases: 2070 * 1. 
After running 'pre_block', context switch 2071 * happened. For this case, 'sn' was set in 2072 * vmx_vcpu_put(), so we need to clear it here. 2073 * 2. After running 'pre_block', we were blocked, 2074 * and woken up by some other guy. For this case, 2075 * we don't need to do anything, 'pi_post_block' 2076 * will do everything for us. However, we cannot 2077 * check whether it is case #1 or case #2 here 2078 * (maybe, not needed), so we also clear sn here, 2079 * I think it is not a big deal. 2080 */ 2081 if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { 2082 if (vcpu->cpu != cpu) { 2083 dest = cpu_physical_id(cpu); 2084 2085 if (x2apic_enabled()) 2086 new.ndst = dest; 2087 else 2088 new.ndst = (dest << 8) & 0xFF00; 2089 } 2090 2091 /* set 'NV' to 'notification vector' */ 2092 new.nv = POSTED_INTR_VECTOR; 2093 } 2094 2095 /* Allow posting non-urgent interrupts */ 2096 new.sn = 0; 2097 } while (cmpxchg(&pi_desc->control, old.control, 2098 new.control) != old.control); 2099} 2100/* 2101 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 2102 * vcpu mutex is already taken. 2103 */ 2104static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2105{ 2106 struct vcpu_vmx *vmx = to_vmx(vcpu); 2107 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2108 2109 if (!vmm_exclusive) 2110 kvm_cpu_vmxon(phys_addr); 2111 else if (vmx->loaded_vmcs->cpu != cpu) 2112 loaded_vmcs_clear(vmx->loaded_vmcs); 2113 2114 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { 2115 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 2116 vmcs_load(vmx->loaded_vmcs->vmcs); 2117 } 2118 2119 if (vmx->loaded_vmcs->cpu != cpu) { 2120 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 2121 unsigned long sysenter_esp; 2122 2123 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2124 local_irq_disable(); 2125 crash_disable_local_vmclear(cpu); 2126 2127 /* 2128 * Read loaded_vmcs->cpu should be before fetching 2129 * loaded_vmcs->loaded_vmcss_on_cpu_link. 2130 * See the comments in __loaded_vmcs_clear(). 2131 */ 2132 smp_rmb(); 2133 2134 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 2135 &per_cpu(loaded_vmcss_on_cpu, cpu)); 2136 crash_enable_local_vmclear(cpu); 2137 local_irq_enable(); 2138 2139 /* 2140 * Linux uses per-cpu TSS and GDT, so set these when switching 2141 * processors. 
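 * The VMCS caches this host state, so after a migration the new cpu's
 * values have to be written back into HOST_TR_BASE, HOST_GDTR_BASE and
 * HOST_IA32_SYSENTER_ESP below.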
2142 */ 2143 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 2144 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ 2145 2146 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 2147 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 2148 2149 vmx->loaded_vmcs->cpu = cpu; 2150 } 2151 2152 /* Setup TSC multiplier */ 2153 if (kvm_has_tsc_control && 2154 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { 2155 vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; 2156 vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); 2157 } 2158 2159 vmx_vcpu_pi_load(vcpu, cpu); 2160} 2161 2162static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) 2163{ 2164 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 2165 2166 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 2167 !irq_remapping_cap(IRQ_POSTING_CAP)) 2168 return; 2169 2170 /* Set SN when the vCPU is preempted */ 2171 if (vcpu->preempted) 2172 pi_set_sn(pi_desc); 2173} 2174 2175static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 2176{ 2177 vmx_vcpu_pi_put(vcpu); 2178 2179 __vmx_load_host_state(to_vmx(vcpu)); 2180 if (!vmm_exclusive) { 2181 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); 2182 vcpu->cpu = -1; 2183 kvm_cpu_vmxoff(); 2184 } 2185} 2186 2187static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 2188{ 2189 ulong cr0; 2190 2191 if (vcpu->fpu_active) 2192 return; 2193 vcpu->fpu_active = 1; 2194 cr0 = vmcs_readl(GUEST_CR0); 2195 cr0 &= ~(X86_CR0_TS | X86_CR0_MP); 2196 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); 2197 vmcs_writel(GUEST_CR0, cr0); 2198 update_exception_bitmap(vcpu); 2199 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 2200 if (is_guest_mode(vcpu)) 2201 vcpu->arch.cr0_guest_owned_bits &= 2202 ~get_vmcs12(vcpu)->cr0_guest_host_mask; 2203 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2204} 2205 2206static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 2207 2208/* 2209 * Return the cr0 value that a nested guest would read. This is a combination 2210 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by 2211 * its hypervisor (cr0_read_shadow). 2212 */ 2213static inline unsigned long nested_read_cr0(struct vmcs12 *fields) 2214{ 2215 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | 2216 (fields->cr0_read_shadow & fields->cr0_guest_host_mask); 2217} 2218static inline unsigned long nested_read_cr4(struct vmcs12 *fields) 2219{ 2220 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | 2221 (fields->cr4_read_shadow & fields->cr4_guest_host_mask); 2222} 2223 2224static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 2225{ 2226 /* Note that there is no vcpu->fpu_active = 0 here. The caller must 2227 * set this *before* calling this function. 2228 */ 2229 vmx_decache_cr0_guest_bits(vcpu); 2230 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 2231 update_exception_bitmap(vcpu); 2232 vcpu->arch.cr0_guest_owned_bits = 0; 2233 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2234 if (is_guest_mode(vcpu)) { 2235 /* 2236 * L1's specified read shadow might not contain the TS bit, 2237 * so now that we turned on shadowing of this bit, we need to 2238 * set this bit of the shadow. Like in nested_vmx_run we need 2239 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet 2240 * up-to-date here because we just decached cr0.TS (and we'll 2241 * only update vmcs12->guest_cr0 on nested exit). 
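 * So the code below first folds the freshly decached TS value from
 * vcpu->arch.cr0 into vmcs12->guest_cr0 and only then computes the read
 * shadow via nested_read_cr0(), so that TS is accurate while every other
 * bit still follows L1's cr0_guest_host_mask.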
2242 */ 2243 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2244 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | 2245 (vcpu->arch.cr0 & X86_CR0_TS); 2246 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2247 } else 2248 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2249} 2250 2251static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 2252{ 2253 unsigned long rflags, save_rflags; 2254 2255 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { 2256 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 2257 rflags = vmcs_readl(GUEST_RFLAGS); 2258 if (to_vmx(vcpu)->rmode.vm86_active) { 2259 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2260 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 2261 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 2262 } 2263 to_vmx(vcpu)->rflags = rflags; 2264 } 2265 return to_vmx(vcpu)->rflags; 2266} 2267 2268static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 2269{ 2270 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 2271 to_vmx(vcpu)->rflags = rflags; 2272 if (to_vmx(vcpu)->rmode.vm86_active) { 2273 to_vmx(vcpu)->rmode.save_rflags = rflags; 2274 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 2275 } 2276 vmcs_writel(GUEST_RFLAGS, rflags); 2277} 2278 2279static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 2280{ 2281 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 2282 int ret = 0; 2283 2284 if (interruptibility & GUEST_INTR_STATE_STI) 2285 ret |= KVM_X86_SHADOW_INT_STI; 2286 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 2287 ret |= KVM_X86_SHADOW_INT_MOV_SS; 2288 2289 return ret; 2290} 2291 2292static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 2293{ 2294 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 2295 u32 interruptibility = interruptibility_old; 2296 2297 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 2298 2299 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 2300 interruptibility |= GUEST_INTR_STATE_MOV_SS; 2301 else if (mask & KVM_X86_SHADOW_INT_STI) 2302 interruptibility |= GUEST_INTR_STATE_STI; 2303 2304 if ((interruptibility != interruptibility_old)) 2305 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 2306} 2307 2308static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 2309{ 2310 unsigned long rip; 2311 2312 rip = kvm_rip_read(vcpu); 2313 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 2314 kvm_rip_write(vcpu, rip); 2315 2316 /* skipping an emulated instruction also counts */ 2317 vmx_set_interrupt_shadow(vcpu, 0); 2318} 2319 2320/* 2321 * KVM wants to inject page-faults which it got to the guest. This function 2322 * checks whether in a nested guest, we need to inject them to L1 or L2. 
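 * For example, a #PF (vector 14) causes a nested vmexit to L1 only when
 * bit 14 of vmcs12's exception_bitmap is set; otherwise this returns 0 and
 * the fault is injected directly into L2.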
2323 */ 2324static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) 2325{ 2326 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2327 2328 if (!(vmcs12->exception_bitmap & (1u << nr))) 2329 return 0; 2330 2331 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 2332 vmcs_read32(VM_EXIT_INTR_INFO), 2333 vmcs_readl(EXIT_QUALIFICATION)); 2334 return 1; 2335} 2336 2337static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 2338 bool has_error_code, u32 error_code, 2339 bool reinject) 2340{ 2341 struct vcpu_vmx *vmx = to_vmx(vcpu); 2342 u32 intr_info = nr | INTR_INFO_VALID_MASK; 2343 2344 if (!reinject && is_guest_mode(vcpu) && 2345 nested_vmx_check_exception(vcpu, nr)) 2346 return; 2347 2348 if (has_error_code) { 2349 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 2350 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 2351 } 2352 2353 if (vmx->rmode.vm86_active) { 2354 int inc_eip = 0; 2355 if (kvm_exception_is_soft(nr)) 2356 inc_eip = vcpu->arch.event_exit_inst_len; 2357 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) 2358 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2359 return; 2360 } 2361 2362 if (kvm_exception_is_soft(nr)) { 2363 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2364 vmx->vcpu.arch.event_exit_inst_len); 2365 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 2366 } else 2367 intr_info |= INTR_TYPE_HARD_EXCEPTION; 2368 2369 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 2370} 2371 2372static bool vmx_rdtscp_supported(void) 2373{ 2374 return cpu_has_vmx_rdtscp(); 2375} 2376 2377static bool vmx_invpcid_supported(void) 2378{ 2379 return cpu_has_vmx_invpcid() && enable_ept; 2380} 2381 2382/* 2383 * Swap MSR entry in host/guest MSR entry array. 2384 */ 2385static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 2386{ 2387 struct shared_msr_entry tmp; 2388 2389 tmp = vmx->guest_msrs[to]; 2390 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 2391 vmx->guest_msrs[from] = tmp; 2392} 2393 2394static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) 2395{ 2396 unsigned long *msr_bitmap; 2397 2398 if (is_guest_mode(vcpu)) 2399 msr_bitmap = vmx_msr_bitmap_nested; 2400 else if (vcpu->arch.apic_base & X2APIC_ENABLE) { 2401 if (is_long_mode(vcpu)) 2402 msr_bitmap = vmx_msr_bitmap_longmode_x2apic; 2403 else 2404 msr_bitmap = vmx_msr_bitmap_legacy_x2apic; 2405 } else { 2406 if (is_long_mode(vcpu)) 2407 msr_bitmap = vmx_msr_bitmap_longmode; 2408 else 2409 msr_bitmap = vmx_msr_bitmap_legacy; 2410 } 2411 2412 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); 2413} 2414 2415/* 2416 * Set up the vmcs to automatically save and restore system 2417 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 2418 * mode, as fiddling with msrs is very expensive. 2419 */ 2420static void setup_msrs(struct vcpu_vmx *vmx) 2421{ 2422 int save_nmsrs, index; 2423 2424 save_nmsrs = 0; 2425#ifdef CONFIG_X86_64 2426 if (is_long_mode(&vmx->vcpu)) { 2427 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 2428 if (index >= 0) 2429 move_msr_up(vmx, index, save_nmsrs++); 2430 index = __find_msr_index(vmx, MSR_LSTAR); 2431 if (index >= 0) 2432 move_msr_up(vmx, index, save_nmsrs++); 2433 index = __find_msr_index(vmx, MSR_CSTAR); 2434 if (index >= 0) 2435 move_msr_up(vmx, index, save_nmsrs++); 2436 index = __find_msr_index(vmx, MSR_TSC_AUX); 2437 if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) 2438 move_msr_up(vmx, index, save_nmsrs++); 2439 /* 2440 * MSR_STAR is only needed on long mode guests, and only 2441 * if efer.sce is enabled. 
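 * Note how setup_msrs() works: each move_msr_up() call swaps a wanted MSR
 * to the front of guest_msrs[], and save_nmsrs ends up as the number of
 * leading entries that vmx_save_host_state() later hands to
 * kvm_set_shared_msr().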
2442 */ 2443 index = __find_msr_index(vmx, MSR_STAR); 2444 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) 2445 move_msr_up(vmx, index, save_nmsrs++); 2446 } 2447#endif 2448 index = __find_msr_index(vmx, MSR_EFER); 2449 if (index >= 0 && update_transition_efer(vmx, index)) 2450 move_msr_up(vmx, index, save_nmsrs++); 2451 2452 vmx->save_nmsrs = save_nmsrs; 2453 2454 if (cpu_has_vmx_msr_bitmap()) 2455 vmx_set_msr_bitmap(&vmx->vcpu); 2456} 2457 2458/* 2459 * reads and returns guest's timestamp counter "register" 2460 * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset 2461 * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 2462 */ 2463static u64 guest_read_tsc(struct kvm_vcpu *vcpu) 2464{ 2465 u64 host_tsc, tsc_offset; 2466 2467 host_tsc = rdtsc(); 2468 tsc_offset = vmcs_read64(TSC_OFFSET); 2469 return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; 2470} 2471 2472/* 2473 * Like guest_read_tsc, but always returns L1's notion of the timestamp 2474 * counter, even if a nested guest (L2) is currently running. 2475 */ 2476static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2477{ 2478 u64 tsc_offset; 2479 2480 tsc_offset = is_guest_mode(vcpu) ? 2481 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 2482 vmcs_read64(TSC_OFFSET); 2483 return host_tsc + tsc_offset; 2484} 2485 2486static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) 2487{ 2488 return vmcs_read64(TSC_OFFSET); 2489} 2490 2491/* 2492 * writes 'offset' into guest's timestamp counter offset register 2493 */ 2494static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 2495{ 2496 if (is_guest_mode(vcpu)) { 2497 /* 2498 * We're here if L1 chose not to trap WRMSR to TSC. According 2499 * to the spec, this should set L1's TSC; The offset that L1 2500 * set for L2 remains unchanged, and still needs to be added 2501 * to the newly set TSC to get L2's TSC. 2502 */ 2503 struct vmcs12 *vmcs12; 2504 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; 2505 /* recalculate vmcs02.TSC_OFFSET: */ 2506 vmcs12 = get_vmcs12(vcpu); 2507 vmcs_write64(TSC_OFFSET, offset + 2508 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 2509 vmcs12->tsc_offset : 0)); 2510 } else { 2511 trace_kvm_write_tsc_offset(vcpu->vcpu_id, 2512 vmcs_read64(TSC_OFFSET), offset); 2513 vmcs_write64(TSC_OFFSET, offset); 2514 } 2515} 2516 2517static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) 2518{ 2519 u64 offset = vmcs_read64(TSC_OFFSET); 2520 2521 vmcs_write64(TSC_OFFSET, offset + adjustment); 2522 if (is_guest_mode(vcpu)) { 2523 /* Even when running L2, the adjustment needs to apply to L1 */ 2524 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; 2525 } else 2526 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, 2527 offset + adjustment); 2528} 2529 2530static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) 2531{ 2532 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); 2533 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); 2534} 2535 2536/* 2537 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX 2538 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for 2539 * all guests if the "nested" module option is off, and can also be disabled 2540 * for a single guest by disabling its VMX cpuid bit. 
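 * (The cpuid bit in question is CPUID.01H:ECX bit 5, which is what
 * guest_cpuid_has_vmx() above tests, so userspace hides VMX from one guest
 * simply by clearing that bit in the CPUID data it supplies.)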
2541 */ 2542static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) 2543{ 2544 return nested && guest_cpuid_has_vmx(vcpu); 2545} 2546 2547/* 2548 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 2549 * returned for the various VMX controls MSRs when nested VMX is enabled. 2550 * The same values should also be used to verify that vmcs12 control fields are 2551 * valid during nested entry from L1 to L2. 2552 * Each of these control msrs has a low and high 32-bit half: A low bit is on 2553 * if the corresponding bit in the (32-bit) control field *must* be on, and a 2554 * bit in the high half is on if the corresponding bit in the control field 2555 * may be on. See also vmx_control_verify(). 2556 */ 2557static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) 2558{ 2559 /* 2560 * Note that as a general rule, the high half of the MSRs (bits in 2561 * the control fields which may be 1) should be initialized by the 2562 * intersection of the underlying hardware's MSR (i.e., features which 2563 * can be supported) and the list of features we want to expose - 2564 * because they are known to be properly supported in our code. 2565 * Also, usually, the low half of the MSRs (bits which must be 1) can 2566 * be set to 0, meaning that L1 may turn off any of these bits. The 2567 * reason is that if one of these bits is necessary, it will appear 2568 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 2569 * fields of vmcs01 and vmcs02, will turn these bits off - and 2570 * nested_vmx_exit_handled() will not pass related exits to L1. 2571 * These rules have exceptions below. 2572 */ 2573 2574 /* pin-based controls */ 2575 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2576 vmx->nested.nested_vmx_pinbased_ctls_low, 2577 vmx->nested.nested_vmx_pinbased_ctls_high); 2578 vmx->nested.nested_vmx_pinbased_ctls_low |= 2579 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2580 vmx->nested.nested_vmx_pinbased_ctls_high &= 2581 PIN_BASED_EXT_INTR_MASK | 2582 PIN_BASED_NMI_EXITING | 2583 PIN_BASED_VIRTUAL_NMIS; 2584 vmx->nested.nested_vmx_pinbased_ctls_high |= 2585 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2586 PIN_BASED_VMX_PREEMPTION_TIMER; 2587 if (kvm_vcpu_apicv_active(&vmx->vcpu)) 2588 vmx->nested.nested_vmx_pinbased_ctls_high |= 2589 PIN_BASED_POSTED_INTR; 2590 2591 /* exit controls */ 2592 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2593 vmx->nested.nested_vmx_exit_ctls_low, 2594 vmx->nested.nested_vmx_exit_ctls_high); 2595 vmx->nested.nested_vmx_exit_ctls_low = 2596 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2597 2598 vmx->nested.nested_vmx_exit_ctls_high &= 2599#ifdef CONFIG_X86_64 2600 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2601#endif 2602 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2603 vmx->nested.nested_vmx_exit_ctls_high |= 2604 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2605 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2606 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2607 2608 if (vmx_mpx_supported()) 2609 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2610 2611 /* We support free control of debug control saving. 
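 * "Free control" means the bit stays in the allowed-1 (high) half but is
 * dropped from the required-1 (low) half, so L1 may set it either way. As
 * a toy example of the encoding: with low = 0x3 and high = 0x7, control
 * values 0x3 and 0x7 pass vmx_control_verify(), while 0x1 (a must-be-on
 * bit is clear) and 0xf (a not-allowed bit is set) are rejected.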
*/ 2612 vmx->nested.nested_vmx_true_exit_ctls_low = 2613 vmx->nested.nested_vmx_exit_ctls_low & 2614 ~VM_EXIT_SAVE_DEBUG_CONTROLS; 2615 2616 /* entry controls */ 2617 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2618 vmx->nested.nested_vmx_entry_ctls_low, 2619 vmx->nested.nested_vmx_entry_ctls_high); 2620 vmx->nested.nested_vmx_entry_ctls_low = 2621 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2622 vmx->nested.nested_vmx_entry_ctls_high &= 2623#ifdef CONFIG_X86_64 2624 VM_ENTRY_IA32E_MODE | 2625#endif 2626 VM_ENTRY_LOAD_IA32_PAT; 2627 vmx->nested.nested_vmx_entry_ctls_high |= 2628 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 2629 if (vmx_mpx_supported()) 2630 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 2631 2632 /* We support free control of debug control loading. */ 2633 vmx->nested.nested_vmx_true_entry_ctls_low = 2634 vmx->nested.nested_vmx_entry_ctls_low & 2635 ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 2636 2637 /* cpu-based controls */ 2638 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2639 vmx->nested.nested_vmx_procbased_ctls_low, 2640 vmx->nested.nested_vmx_procbased_ctls_high); 2641 vmx->nested.nested_vmx_procbased_ctls_low = 2642 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2643 vmx->nested.nested_vmx_procbased_ctls_high &= 2644 CPU_BASED_VIRTUAL_INTR_PENDING | 2645 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2646 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2647 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 2648 CPU_BASED_CR3_STORE_EXITING | 2649#ifdef CONFIG_X86_64 2650 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 2651#endif 2652 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 2653 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 2654 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 2655 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 2656 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2657 /* 2658 * We can allow some features even when not supported by the 2659 * hardware. For example, L1 can specify an MSR bitmap - and we 2660 * can use it to avoid exits to L1 - even when L0 runs L2 2661 * without MSR bitmaps. 2662 */ 2663 vmx->nested.nested_vmx_procbased_ctls_high |= 2664 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2665 CPU_BASED_USE_MSR_BITMAPS; 2666 2667 /* We support free control of CR3 access interception. 
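 * Likewise here: CR3 load/store exiting is dropped from the required-1
 * half of the TRUE procbased MSR, so an EPT-aware L1 can run its guests
 * without CR3 exits even though the legacy (non-TRUE) MSR still reports
 * those bits as always-on.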
*/ 2668 vmx->nested.nested_vmx_true_procbased_ctls_low = 2669 vmx->nested.nested_vmx_procbased_ctls_low & 2670 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2671 2672 /* secondary cpu-based controls */ 2673 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2674 vmx->nested.nested_vmx_secondary_ctls_low, 2675 vmx->nested.nested_vmx_secondary_ctls_high); 2676 vmx->nested.nested_vmx_secondary_ctls_low = 0; 2677 vmx->nested.nested_vmx_secondary_ctls_high &= 2678 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2679 SECONDARY_EXEC_RDTSCP | 2680 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2681 SECONDARY_EXEC_ENABLE_VPID | 2682 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2683 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2684 SECONDARY_EXEC_WBINVD_EXITING | 2685 SECONDARY_EXEC_XSAVES | 2686 SECONDARY_EXEC_PCOMMIT; 2687 2688 if (enable_ept) { 2689 /* nested EPT: emulate EPT also to L1 */ 2690 vmx->nested.nested_vmx_secondary_ctls_high |= 2691 SECONDARY_EXEC_ENABLE_EPT; 2692 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2693 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | 2694 VMX_EPT_INVEPT_BIT; 2695 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; 2696 /* 2697 * For nested guests, we don't do anything specific 2698 * for single context invalidation. Hence, only advertise 2699 * support for global context invalidation. 2700 */ 2701 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2702 } else 2703 vmx->nested.nested_vmx_ept_caps = 0; 2704 2705 if (enable_vpid) 2706 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | 2707 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; 2708 else 2709 vmx->nested.nested_vmx_vpid_caps = 0; 2710 2711 if (enable_unrestricted_guest) 2712 vmx->nested.nested_vmx_secondary_ctls_high |= 2713 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2714 2715 /* miscellaneous data */ 2716 rdmsr(MSR_IA32_VMX_MISC, 2717 vmx->nested.nested_vmx_misc_low, 2718 vmx->nested.nested_vmx_misc_high); 2719 vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 2720 vmx->nested.nested_vmx_misc_low |= 2721 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 2722 VMX_MISC_ACTIVITY_HLT; 2723 vmx->nested.nested_vmx_misc_high = 0; 2724} 2725 2726static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2727{ 2728 /* 2729 * Bits 0 in high must be 0, and bits 1 in low must be 1. 2730 */ 2731 return ((control & high) | low) == control; 2732} 2733 2734static inline u64 vmx_control_msr(u32 low, u32 high) 2735{ 2736 return low | ((u64)high << 32); 2737} 2738 2739/* Returns 0 on success, non-0 otherwise. */ 2740static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2741{ 2742 struct vcpu_vmx *vmx = to_vmx(vcpu); 2743 2744 switch (msr_index) { 2745 case MSR_IA32_VMX_BASIC: 2746 /* 2747 * This MSR reports some information about VMX support. We 2748 * should return information about the VMX we emulate for the 2749 * guest, and the VMCS structure we give it - not about the 2750 * VMX support of the underlying hardware. 
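 * Per the IA32_VMX_BASIC layout, bits 30:0 carry the VMCS revision id
 * (VMCS12_REVISION here), bits 44:32 the region size (VMCS12_SIZE),
 * bits 53:50 the memory type (6 = write-back) and bit 55 advertises the
 * TRUE control MSRs - which is how the value below is assembled.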
2751 */ 2752 *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | 2753 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 2754 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 2755 break; 2756 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2757 case MSR_IA32_VMX_PINBASED_CTLS: 2758 *pdata = vmx_control_msr( 2759 vmx->nested.nested_vmx_pinbased_ctls_low, 2760 vmx->nested.nested_vmx_pinbased_ctls_high); 2761 break; 2762 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2763 *pdata = vmx_control_msr( 2764 vmx->nested.nested_vmx_true_procbased_ctls_low, 2765 vmx->nested.nested_vmx_procbased_ctls_high); 2766 break; 2767 case MSR_IA32_VMX_PROCBASED_CTLS: 2768 *pdata = vmx_control_msr( 2769 vmx->nested.nested_vmx_procbased_ctls_low, 2770 vmx->nested.nested_vmx_procbased_ctls_high); 2771 break; 2772 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2773 *pdata = vmx_control_msr( 2774 vmx->nested.nested_vmx_true_exit_ctls_low, 2775 vmx->nested.nested_vmx_exit_ctls_high); 2776 break; 2777 case MSR_IA32_VMX_EXIT_CTLS: 2778 *pdata = vmx_control_msr( 2779 vmx->nested.nested_vmx_exit_ctls_low, 2780 vmx->nested.nested_vmx_exit_ctls_high); 2781 break; 2782 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2783 *pdata = vmx_control_msr( 2784 vmx->nested.nested_vmx_true_entry_ctls_low, 2785 vmx->nested.nested_vmx_entry_ctls_high); 2786 break; 2787 case MSR_IA32_VMX_ENTRY_CTLS: 2788 *pdata = vmx_control_msr( 2789 vmx->nested.nested_vmx_entry_ctls_low, 2790 vmx->nested.nested_vmx_entry_ctls_high); 2791 break; 2792 case MSR_IA32_VMX_MISC: 2793 *pdata = vmx_control_msr( 2794 vmx->nested.nested_vmx_misc_low, 2795 vmx->nested.nested_vmx_misc_high); 2796 break; 2797 /* 2798 * These MSRs specify bits which the guest must keep fixed (on or off) 2799 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 2800 * We picked the standard core2 setting. 2801 */ 2802#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 2803#define VMXON_CR4_ALWAYSON X86_CR4_VMXE 2804 case MSR_IA32_VMX_CR0_FIXED0: 2805 *pdata = VMXON_CR0_ALWAYSON; 2806 break; 2807 case MSR_IA32_VMX_CR0_FIXED1: 2808 *pdata = -1ULL; 2809 break; 2810 case MSR_IA32_VMX_CR4_FIXED0: 2811 *pdata = VMXON_CR4_ALWAYSON; 2812 break; 2813 case MSR_IA32_VMX_CR4_FIXED1: 2814 *pdata = -1ULL; 2815 break; 2816 case MSR_IA32_VMX_VMCS_ENUM: 2817 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 2818 break; 2819 case MSR_IA32_VMX_PROCBASED_CTLS2: 2820 *pdata = vmx_control_msr( 2821 vmx->nested.nested_vmx_secondary_ctls_low, 2822 vmx->nested.nested_vmx_secondary_ctls_high); 2823 break; 2824 case MSR_IA32_VMX_EPT_VPID_CAP: 2825 /* Currently, no nested vpid support */ 2826 *pdata = vmx->nested.nested_vmx_ept_caps | 2827 ((u64)vmx->nested.nested_vmx_vpid_caps << 32); 2828 break; 2829 default: 2830 return 1; 2831 } 2832 2833 return 0; 2834} 2835 2836/* 2837 * Reads an msr value (of 'msr_index') into 'pdata'. 2838 * Returns 0 on success, non-0 otherwise. 2839 * Assumes vcpu_load() was already called. 
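 * (vcpu_load() matters because it makes this vcpu's VMCS current on this
 * cpu, which is what allows the direct vmcs_read accesses below.)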
2840 */ 2841static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2842{ 2843 struct shared_msr_entry *msr; 2844 2845 switch (msr_info->index) { 2846#ifdef CONFIG_X86_64 2847 case MSR_FS_BASE: 2848 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2849 break; 2850 case MSR_GS_BASE: 2851 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2852 break; 2853 case MSR_KERNEL_GS_BASE: 2854 vmx_load_host_state(to_vmx(vcpu)); 2855 msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; 2856 break; 2857#endif 2858 case MSR_EFER: 2859 return kvm_get_msr_common(vcpu, msr_info); 2860 case MSR_IA32_TSC: 2861 msr_info->data = guest_read_tsc(vcpu); 2862 break; 2863 case MSR_IA32_SYSENTER_CS: 2864 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2865 break; 2866 case MSR_IA32_SYSENTER_EIP: 2867 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2868 break; 2869 case MSR_IA32_SYSENTER_ESP: 2870 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2871 break; 2872 case MSR_IA32_BNDCFGS: 2873 if (!vmx_mpx_supported()) 2874 return 1; 2875 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2876 break; 2877 case MSR_IA32_FEATURE_CONTROL: 2878 if (!nested_vmx_allowed(vcpu)) 2879 return 1; 2880 msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; 2881 break; 2882 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2883 if (!nested_vmx_allowed(vcpu)) 2884 return 1; 2885 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); 2886 case MSR_IA32_XSS: 2887 if (!vmx_xsaves_supported()) 2888 return 1; 2889 msr_info->data = vcpu->arch.ia32_xss; 2890 break; 2891 case MSR_TSC_AUX: 2892 if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) 2893 return 1; 2894 /* Otherwise falls through */ 2895 default: 2896 msr = find_msr_entry(to_vmx(vcpu), msr_info->index); 2897 if (msr) { 2898 msr_info->data = msr->data; 2899 break; 2900 } 2901 return kvm_get_msr_common(vcpu, msr_info); 2902 } 2903 2904 return 0; 2905} 2906 2907static void vmx_leave_nested(struct kvm_vcpu *vcpu); 2908 2909/* 2910 * Writes msr value into into the appropriate "register". 2911 * Returns 0 on success, non-0 otherwise. 2912 * Assumes vcpu_load() was already called. 
2913 */ 2914static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2915{ 2916 struct vcpu_vmx *vmx = to_vmx(vcpu); 2917 struct shared_msr_entry *msr; 2918 int ret = 0; 2919 u32 msr_index = msr_info->index; 2920 u64 data = msr_info->data; 2921 2922 switch (msr_index) { 2923 case MSR_EFER: 2924 ret = kvm_set_msr_common(vcpu, msr_info); 2925 break; 2926#ifdef CONFIG_X86_64 2927 case MSR_FS_BASE: 2928 vmx_segment_cache_clear(vmx); 2929 vmcs_writel(GUEST_FS_BASE, data); 2930 break; 2931 case MSR_GS_BASE: 2932 vmx_segment_cache_clear(vmx); 2933 vmcs_writel(GUEST_GS_BASE, data); 2934 break; 2935 case MSR_KERNEL_GS_BASE: 2936 vmx_load_host_state(vmx); 2937 vmx->msr_guest_kernel_gs_base = data; 2938 break; 2939#endif 2940 case MSR_IA32_SYSENTER_CS: 2941 vmcs_write32(GUEST_SYSENTER_CS, data); 2942 break; 2943 case MSR_IA32_SYSENTER_EIP: 2944 vmcs_writel(GUEST_SYSENTER_EIP, data); 2945 break; 2946 case MSR_IA32_SYSENTER_ESP: 2947 vmcs_writel(GUEST_SYSENTER_ESP, data); 2948 break; 2949 case MSR_IA32_BNDCFGS: 2950 if (!vmx_mpx_supported()) 2951 return 1; 2952 vmcs_write64(GUEST_BNDCFGS, data); 2953 break; 2954 case MSR_IA32_TSC: 2955 kvm_write_tsc(vcpu, msr_info); 2956 break; 2957 case MSR_IA32_CR_PAT: 2958 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2959 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 2960 return 1; 2961 vmcs_write64(GUEST_IA32_PAT, data); 2962 vcpu->arch.pat = data; 2963 break; 2964 } 2965 ret = kvm_set_msr_common(vcpu, msr_info); 2966 break; 2967 case MSR_IA32_TSC_ADJUST: 2968 ret = kvm_set_msr_common(vcpu, msr_info); 2969 break; 2970 case MSR_IA32_FEATURE_CONTROL: 2971 if (!nested_vmx_allowed(vcpu) || 2972 (to_vmx(vcpu)->nested.msr_ia32_feature_control & 2973 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) 2974 return 1; 2975 vmx->nested.msr_ia32_feature_control = data; 2976 if (msr_info->host_initiated && data == 0) 2977 vmx_leave_nested(vcpu); 2978 break; 2979 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2980 return 1; /* they are read-only */ 2981 case MSR_IA32_XSS: 2982 if (!vmx_xsaves_supported()) 2983 return 1; 2984 /* 2985 * The only supported bit as of Skylake is bit 8, but 2986 * it is not supported on KVM. 
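 * (Bit 8 covers the Processor Trace state component.) Hence any non-zero
 * value is rejected below; the add/clear_atomic_switch_msr() pair then
 * keeps MSR_IA32_XSS in the atomic MSR-load area only while the guest
 * value differs from host_xss.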
2987 */ 2988 if (data != 0) 2989 return 1; 2990 vcpu->arch.ia32_xss = data; 2991 if (vcpu->arch.ia32_xss != host_xss) 2992 add_atomic_switch_msr(vmx, MSR_IA32_XSS, 2993 vcpu->arch.ia32_xss, host_xss); 2994 else 2995 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 2996 break; 2997 case MSR_TSC_AUX: 2998 if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) 2999 return 1; 3000 /* Check reserved bit, higher 32 bits should be zero */ 3001 if ((data >> 32) != 0) 3002 return 1; 3003 /* Otherwise falls through */ 3004 default: 3005 msr = find_msr_entry(vmx, msr_index); 3006 if (msr) { 3007 u64 old_msr_data = msr->data; 3008 msr->data = data; 3009 if (msr - vmx->guest_msrs < vmx->save_nmsrs) { 3010 preempt_disable(); 3011 ret = kvm_set_shared_msr(msr->index, msr->data, 3012 msr->mask); 3013 preempt_enable(); 3014 if (ret) 3015 msr->data = old_msr_data; 3016 } 3017 break; 3018 } 3019 ret = kvm_set_msr_common(vcpu, msr_info); 3020 } 3021 3022 return ret; 3023} 3024 3025static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 3026{ 3027 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); 3028 switch (reg) { 3029 case VCPU_REGS_RSP: 3030 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 3031 break; 3032 case VCPU_REGS_RIP: 3033 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 3034 break; 3035 case VCPU_EXREG_PDPTR: 3036 if (enable_ept) 3037 ept_save_pdptrs(vcpu); 3038 break; 3039 default: 3040 break; 3041 } 3042} 3043 3044static __init int cpu_has_kvm_support(void) 3045{ 3046 return cpu_has_vmx(); 3047} 3048 3049static __init int vmx_disabled_by_bios(void) 3050{ 3051 u64 msr; 3052 3053 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 3054 if (msr & FEATURE_CONTROL_LOCKED) { 3055 /* launched w/ TXT and VMX disabled */ 3056 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 3057 && tboot_enabled()) 3058 return 1; 3059 /* launched w/o TXT and VMX only enabled w/ TXT */ 3060 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 3061 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 3062 && !tboot_enabled()) { 3063 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 3064 "activate TXT before enabling KVM\n"); 3065 return 1; 3066 } 3067 /* launched w/o TXT and VMX disabled */ 3068 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 3069 && !tboot_enabled()) 3070 return 1; 3071 } 3072 3073 return 0; 3074} 3075 3076static void kvm_cpu_vmxon(u64 addr) 3077{ 3078 asm volatile (ASM_VMX_VMXON_RAX 3079 : : "a"(&addr), "m"(addr) 3080 : "memory", "cc"); 3081} 3082 3083static int hardware_enable(void) 3084{ 3085 int cpu = raw_smp_processor_id(); 3086 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 3087 u64 old, test_bits; 3088 3089 if (cr4_read_shadow() & X86_CR4_VMXE) 3090 return -EBUSY; 3091 3092 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 3093 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); 3094 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 3095 3096 /* 3097 * Now we can enable the vmclear operation in kdump 3098 * since the loaded_vmcss_on_cpu list on this cpu 3099 * has been initialized. 3100 * 3101 * Though the cpu is not in VMX operation now, there 3102 * is no problem to enable the vmclear operation 3103 * for the loaded_vmcss_on_cpu list is empty! 
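 * In other words, the crash/kexec path may from now on VMCLEAR whatever it
 * finds on this cpu's loaded_vmcss_on_cpu list; enabling that while the
 * list is still empty is harmless.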
3104 */ 3105 crash_enable_local_vmclear(cpu); 3106 3107 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 3108 3109 test_bits = FEATURE_CONTROL_LOCKED; 3110 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 3111 if (tboot_enabled()) 3112 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; 3113 3114 if ((old & test_bits) != test_bits) { 3115 /* enable and lock */ 3116 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 3117 } 3118 cr4_set_bits(X86_CR4_VMXE); 3119 3120 if (vmm_exclusive) { 3121 kvm_cpu_vmxon(phys_addr); 3122 ept_sync_global(); 3123 } 3124 3125 native_store_gdt(this_cpu_ptr(&host_gdt)); 3126 3127 return 0; 3128} 3129 3130static void vmclear_local_loaded_vmcss(void) 3131{ 3132 int cpu = raw_smp_processor_id(); 3133 struct loaded_vmcs *v, *n; 3134 3135 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 3136 loaded_vmcss_on_cpu_link) 3137 __loaded_vmcs_clear(v); 3138} 3139 3140 3141/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() 3142 * tricks. 3143 */ 3144static void kvm_cpu_vmxoff(void) 3145{ 3146 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 3147} 3148 3149static void hardware_disable(void) 3150{ 3151 if (vmm_exclusive) { 3152 vmclear_local_loaded_vmcss(); 3153 kvm_cpu_vmxoff(); 3154 } 3155 cr4_clear_bits(X86_CR4_VMXE); 3156} 3157 3158static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 3159 u32 msr, u32 *result) 3160{ 3161 u32 vmx_msr_low, vmx_msr_high; 3162 u32 ctl = ctl_min | ctl_opt; 3163 3164 rdmsr(msr, vmx_msr_low, vmx_msr_high); 3165 3166 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 3167 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 3168 3169 /* Ensure minimum (required) set of control bits are supported. */ 3170 if (ctl_min & ~ctl) 3171 return -EIO; 3172 3173 *result = ctl; 3174 return 0; 3175} 3176 3177static __init bool allow_1_setting(u32 msr, u32 ctl) 3178{ 3179 u32 vmx_msr_low, vmx_msr_high; 3180 3181 rdmsr(msr, vmx_msr_low, vmx_msr_high); 3182 return vmx_msr_high & ctl; 3183} 3184 3185static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 3186{ 3187 u32 vmx_msr_low, vmx_msr_high; 3188 u32 min, opt, min2, opt2; 3189 u32 _pin_based_exec_control = 0; 3190 u32 _cpu_based_exec_control = 0; 3191 u32 _cpu_based_2nd_exec_control = 0; 3192 u32 _vmexit_control = 0; 3193 u32 _vmentry_control = 0; 3194 3195 min = CPU_BASED_HLT_EXITING | 3196#ifdef CONFIG_X86_64 3197 CPU_BASED_CR8_LOAD_EXITING | 3198 CPU_BASED_CR8_STORE_EXITING | 3199#endif 3200 CPU_BASED_CR3_LOAD_EXITING | 3201 CPU_BASED_CR3_STORE_EXITING | 3202 CPU_BASED_USE_IO_BITMAPS | 3203 CPU_BASED_MOV_DR_EXITING | 3204 CPU_BASED_USE_TSC_OFFSETING | 3205 CPU_BASED_MWAIT_EXITING | 3206 CPU_BASED_MONITOR_EXITING | 3207 CPU_BASED_INVLPG_EXITING | 3208 CPU_BASED_RDPMC_EXITING; 3209 3210 opt = CPU_BASED_TPR_SHADOW | 3211 CPU_BASED_USE_MSR_BITMAPS | 3212 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 3213 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 3214 &_cpu_based_exec_control) < 0) 3215 return -EIO; 3216#ifdef CONFIG_X86_64 3217 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 3218 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 3219 ~CPU_BASED_CR8_STORE_EXITING; 3220#endif 3221 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 3222 min2 = 0; 3223 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 3224 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3225 SECONDARY_EXEC_WBINVD_EXITING | 3226 SECONDARY_EXEC_ENABLE_VPID | 3227 SECONDARY_EXEC_ENABLE_EPT | 3228 SECONDARY_EXEC_UNRESTRICTED_GUEST | 
3229 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 3230 SECONDARY_EXEC_RDTSCP | 3231 SECONDARY_EXEC_ENABLE_INVPCID | 3232 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3233 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3234 SECONDARY_EXEC_SHADOW_VMCS | 3235 SECONDARY_EXEC_XSAVES | 3236 SECONDARY_EXEC_ENABLE_PML | 3237 SECONDARY_EXEC_PCOMMIT | 3238 SECONDARY_EXEC_TSC_SCALING; 3239 if (adjust_vmx_controls(min2, opt2, 3240 MSR_IA32_VMX_PROCBASED_CTLS2, 3241 &_cpu_based_2nd_exec_control) < 0) 3242 return -EIO; 3243 } 3244#ifndef CONFIG_X86_64 3245 if (!(_cpu_based_2nd_exec_control & 3246 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 3247 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 3248#endif 3249 3250 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 3251 _cpu_based_2nd_exec_control &= ~( 3252 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3253 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3254 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 3255 3256 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 3257 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 3258 enabled */ 3259 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 3260 CPU_BASED_CR3_STORE_EXITING | 3261 CPU_BASED_INVLPG_EXITING); 3262 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 3263 vmx_capability.ept, vmx_capability.vpid); 3264 } 3265 3266 min = VM_EXIT_SAVE_DEBUG_CONTROLS; 3267#ifdef CONFIG_X86_64 3268 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 3269#endif 3270 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | 3271 VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; 3272 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 3273 &_vmexit_control) < 0) 3274 return -EIO; 3275 3276 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 3277 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; 3278 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 3279 &_pin_based_exec_control) < 0) 3280 return -EIO; 3281 3282 if (!(_cpu_based_2nd_exec_control & 3283 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || 3284 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) 3285 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 3286 3287 min = VM_ENTRY_LOAD_DEBUG_CONTROLS; 3288 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; 3289 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 3290 &_vmentry_control) < 0) 3291 return -EIO; 3292 3293 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 3294 3295 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 3296 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 3297 return -EIO; 3298 3299#ifdef CONFIG_X86_64 3300 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 3301 if (vmx_msr_high & (1u<<16)) 3302 return -EIO; 3303#endif 3304 3305 /* Require Write-Back (WB) memory type for VMCS accesses. 
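 * The memory type sits in bits 53:50 of IA32_VMX_BASIC; vmx_msr_high holds
 * bits 63:32, so the check below extracts bits 21:18 of the high word
 * ("(vmx_msr_high >> 18) & 15") and compares against 6, the write-back
 * encoding.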
*/ 3306 if (((vmx_msr_high >> 18) & 15) != 6) 3307 return -EIO; 3308 3309 vmcs_conf->size = vmx_msr_high & 0x1fff; 3310 vmcs_conf->order = get_order(vmcs_config.size); 3311 vmcs_conf->revision_id = vmx_msr_low; 3312 3313 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 3314 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 3315 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 3316 vmcs_conf->vmexit_ctrl = _vmexit_control; 3317 vmcs_conf->vmentry_ctrl = _vmentry_control; 3318 3319 cpu_has_load_ia32_efer = 3320 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 3321 VM_ENTRY_LOAD_IA32_EFER) 3322 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 3323 VM_EXIT_LOAD_IA32_EFER); 3324 3325 cpu_has_load_perf_global_ctrl = 3326 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 3327 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 3328 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 3329 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 3330 3331 /* 3332 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL 3333 * but due to errata below it can't be used. Workaround is to use 3334 * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. 3335 * 3336 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] 3337 * 3338 * AAK155 (model 26) 3339 * AAP115 (model 30) 3340 * AAT100 (model 37) 3341 * BC86,AAY89,BD102 (model 44) 3342 * BA97 (model 46) 3343 * 3344 */ 3345 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { 3346 switch (boot_cpu_data.x86_model) { 3347 case 26: 3348 case 30: 3349 case 37: 3350 case 44: 3351 case 46: 3352 cpu_has_load_perf_global_ctrl = false; 3353 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 3354 "does not work properly. Using workaround\n"); 3355 break; 3356 default: 3357 break; 3358 } 3359 } 3360 3361 if (cpu_has_xsaves) 3362 rdmsrl(MSR_IA32_XSS, host_xss); 3363 3364 return 0; 3365} 3366 3367static struct vmcs *alloc_vmcs_cpu(int cpu) 3368{ 3369 int node = cpu_to_node(cpu); 3370 struct page *pages; 3371 struct vmcs *vmcs; 3372 3373 pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); 3374 if (!pages) 3375 return NULL; 3376 vmcs = page_address(pages); 3377 memset(vmcs, 0, vmcs_config.size); 3378 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ 3379 return vmcs; 3380} 3381 3382static struct vmcs *alloc_vmcs(void) 3383{ 3384 return alloc_vmcs_cpu(raw_smp_processor_id()); 3385} 3386 3387static void free_vmcs(struct vmcs *vmcs) 3388{ 3389 free_pages((unsigned long)vmcs, vmcs_config.order); 3390} 3391 3392/* 3393 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 3394 */ 3395static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3396{ 3397 if (!loaded_vmcs->vmcs) 3398 return; 3399 loaded_vmcs_clear(loaded_vmcs); 3400 free_vmcs(loaded_vmcs->vmcs); 3401 loaded_vmcs->vmcs = NULL; 3402} 3403 3404static void free_kvm_area(void) 3405{ 3406 int cpu; 3407 3408 for_each_possible_cpu(cpu) { 3409 free_vmcs(per_cpu(vmxarea, cpu)); 3410 per_cpu(vmxarea, cpu) = NULL; 3411 } 3412} 3413 3414static void init_vmcs_shadow_fields(void) 3415{ 3416 int i, j; 3417 3418 /* No checks for read only fields yet */ 3419 3420 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 3421 switch (shadow_read_write_fields[i]) { 3422 case GUEST_BNDCFGS: 3423 if (!vmx_mpx_supported()) 3424 continue; 3425 break; 3426 default: 3427 break; 3428 } 3429 3430 if (j < i) 3431 shadow_read_write_fields[j] = 3432 shadow_read_write_fields[i]; 3433 j++; 3434 } 3435 max_shadow_read_write_fields = j; 3436 3437 /* shadowed fields guest access
without vmexit */ 3438 for (i = 0; i < max_shadow_read_write_fields; i++) { 3439 clear_bit(shadow_read_write_fields[i], 3440 vmx_vmwrite_bitmap); 3441 clear_bit(shadow_read_write_fields[i], 3442 vmx_vmread_bitmap); 3443 } 3444 for (i = 0; i < max_shadow_read_only_fields; i++) 3445 clear_bit(shadow_read_only_fields[i], 3446 vmx_vmread_bitmap); 3447} 3448 3449static __init int alloc_kvm_area(void) 3450{ 3451 int cpu; 3452 3453 for_each_possible_cpu(cpu) { 3454 struct vmcs *vmcs; 3455 3456 vmcs = alloc_vmcs_cpu(cpu); 3457 if (!vmcs) { 3458 free_kvm_area(); 3459 return -ENOMEM; 3460 } 3461 3462 per_cpu(vmxarea, cpu) = vmcs; 3463 } 3464 return 0; 3465} 3466 3467static bool emulation_required(struct kvm_vcpu *vcpu) 3468{ 3469 return emulate_invalid_guest_state && !guest_state_valid(vcpu); 3470} 3471 3472static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3473 struct kvm_segment *save) 3474{ 3475 if (!emulate_invalid_guest_state) { 3476 /* 3477 * CS and SS RPL should be equal during guest entry according 3478 * to VMX spec, but in reality it is not always so. Since vcpu 3479 * is in the middle of the transition from real mode to 3480 * protected mode it is safe to assume that RPL 0 is a good 3481 * default value. 3482 */ 3483 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3484 save->selector &= ~SEGMENT_RPL_MASK; 3485 save->dpl = save->selector & SEGMENT_RPL_MASK; 3486 save->s = 1; 3487 } 3488 vmx_set_segment(vcpu, save, seg); 3489} 3490 3491static void enter_pmode(struct kvm_vcpu *vcpu) 3492{ 3493 unsigned long flags; 3494 struct vcpu_vmx *vmx = to_vmx(vcpu); 3495 3496 /* 3497 * Update real mode segment cache. It may not be up-to-date if a segment 3498 * register was written while the vcpu was in guest mode. 3499 */ 3500 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3501 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3502 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3503 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3504 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3505 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3506 3507 vmx->rmode.vm86_active = 0; 3508 3509 vmx_segment_cache_clear(vmx); 3510 3511 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3512 3513 flags = vmcs_readl(GUEST_RFLAGS); 3514 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3515 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3516 vmcs_writel(GUEST_RFLAGS, flags); 3517 3518 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3519 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3520 3521 update_exception_bitmap(vcpu); 3522 3523 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3524 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3525 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3526 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3527 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3528 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3529} 3530 3531static void fix_rmode_seg(int seg, struct kvm_segment *save) 3532{ 3533 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3534 struct kvm_segment var = *save; 3535 3536 var.dpl = 0x3; 3537 if (seg == VCPU_SREG_CS) 3538 var.type = 0x3; 3539 3540 if (!emulate_invalid_guest_state) { 3541 var.selector = var.base >> 4; 3542 var.base = var.base & 0xffff0; 3543 var.limit =
0xffff; 3544 var.g = 0; 3545 var.db = 0; 3546 var.present = 1; 3547 var.s = 1; 3548 var.l = 0; 3549 var.unusable = 0; 3550 var.type = 0x3; 3551 var.avl = 0; 3552 if (save->base & 0xf) 3553 printk_once(KERN_WARNING "kvm: segment base is not " 3554 "paragraph aligned when entering " 3555 "protected mode (seg=%d)", seg); 3556 } 3557 3558 vmcs_write16(sf->selector, var.selector); 3559 vmcs_write32(sf->base, var.base); 3560 vmcs_write32(sf->limit, var.limit); 3561 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3562} 3563 3564static void enter_rmode(struct kvm_vcpu *vcpu) 3565{ 3566 unsigned long flags; 3567 struct vcpu_vmx *vmx = to_vmx(vcpu); 3568 3569 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3570 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3571 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3572 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3573 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3574 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3575 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3576 3577 vmx->rmode.vm86_active = 1; 3578 3579 /* 3580 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3581 * vcpu. Warn the user that an update is overdue. 3582 */ 3583 if (!vcpu->kvm->arch.tss_addr) 3584 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3585 "called before entering vcpu\n"); 3586 3587 vmx_segment_cache_clear(vmx); 3588 3589 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); 3590 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3591 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3592 3593 flags = vmcs_readl(GUEST_RFLAGS); 3594 vmx->rmode.save_rflags = flags; 3595 3596 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3597 3598 vmcs_writel(GUEST_RFLAGS, flags); 3599 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3600 update_exception_bitmap(vcpu); 3601 3602 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3603 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3604 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3605 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3606 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3607 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3608 3609 kvm_mmu_reset_context(vcpu); 3610} 3611 3612static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3613{ 3614 struct vcpu_vmx *vmx = to_vmx(vcpu); 3615 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 3616 3617 if (!msr) 3618 return; 3619 3620 /* 3621 * Force kernel_gs_base reloading before EFER changes, as control 3622 * of this msr depends on is_long_mode(). 3623 */ 3624 vmx_load_host_state(to_vmx(vcpu)); 3625 vcpu->arch.efer = efer; 3626 if (efer & EFER_LMA) { 3627 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3628 msr->data = efer; 3629 } else { 3630 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3631 3632 msr->data = efer & ~EFER_LME; 3633 } 3634 setup_msrs(vmx); 3635} 3636 3637#ifdef CONFIG_X86_64 3638 3639static void enter_lmode(struct kvm_vcpu *vcpu) 3640{ 3641 u32 guest_tr_ar; 3642 3643 vmx_segment_cache_clear(to_vmx(vcpu)); 3644 3645 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3646 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3647 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 3648 __func__); 3649 vmcs_write32(GUEST_TR_AR_BYTES, 3650 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3651 | VMX_AR_TYPE_BUSY_64_TSS); 3652 } 3653 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3654} 3655 3656static void exit_lmode(struct kvm_vcpu *vcpu) 3657{ 3658 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3659 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3660} 3661 3662#endif 3663 3664static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) 3665{ 3666 vpid_sync_context(vpid); 3667 if (enable_ept) { 3668 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3669 return; 3670 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 3671 } 3672} 3673 3674static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 3675{ 3676 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); 3677} 3678 3679static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 3680{ 3681 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 3682 3683 vcpu->arch.cr0 &= ~cr0_guest_owned_bits; 3684 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 3685} 3686 3687static void vmx_decache_cr3(struct kvm_vcpu *vcpu) 3688{ 3689 if (enable_ept && is_paging(vcpu)) 3690 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3691 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3692} 3693 3694static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 3695{ 3696 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 3697 3698 vcpu->arch.cr4 &= ~cr4_guest_owned_bits; 3699 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; 3700} 3701 3702static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 3703{ 3704 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3705 3706 if (!test_bit(VCPU_EXREG_PDPTR, 3707 (unsigned long *)&vcpu->arch.regs_dirty)) 3708 return; 3709 3710 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3711 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3712 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3713 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3714 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3715 } 3716} 3717 3718static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3719{ 3720 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3721 3722 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3723 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3724 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3725 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3726 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3727 } 3728 3729 __set_bit(VCPU_EXREG_PDPTR, 3730 (unsigned long *)&vcpu->arch.regs_avail); 3731 __set_bit(VCPU_EXREG_PDPTR, 3732 (unsigned long *)&vcpu->arch.regs_dirty); 3733} 3734 3735static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 3736 3737static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 3738 unsigned long cr0, 3739 struct kvm_vcpu *vcpu) 3740{ 3741 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) 3742 vmx_decache_cr3(vcpu); 3743 if (!(cr0 & X86_CR0_PG)) { 3744 /* From paging/starting to nonpaging */ 3745 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3746 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | 3747 (CPU_BASED_CR3_LOAD_EXITING | 3748 CPU_BASED_CR3_STORE_EXITING)); 3749 vcpu->arch.cr0 = cr0; 3750 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3751 } else if (!is_paging(vcpu)) { 3752 /* From nonpaging to paging */ 3753 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3754 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & 3755 ~(CPU_BASED_CR3_LOAD_EXITING | 3756 CPU_BASED_CR3_STORE_EXITING)); 3757 vcpu->arch.cr0 = cr0; 3758 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3759 } 3760 3761 if (!(cr0 
& X86_CR0_WP)) 3762 *hw_cr0 &= ~X86_CR0_WP; 3763} 3764 3765static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3766{ 3767 struct vcpu_vmx *vmx = to_vmx(vcpu); 3768 unsigned long hw_cr0; 3769 3770 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); 3771 if (enable_unrestricted_guest) 3772 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3773 else { 3774 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3775 3776 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3777 enter_pmode(vcpu); 3778 3779 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3780 enter_rmode(vcpu); 3781 } 3782 3783#ifdef CONFIG_X86_64 3784 if (vcpu->arch.efer & EFER_LME) { 3785 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 3786 enter_lmode(vcpu); 3787 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 3788 exit_lmode(vcpu); 3789 } 3790#endif 3791 3792 if (enable_ept) 3793 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 3794 3795 if (!vcpu->fpu_active) 3796 hw_cr0 |= X86_CR0_TS | X86_CR0_MP; 3797 3798 vmcs_writel(CR0_READ_SHADOW, cr0); 3799 vmcs_writel(GUEST_CR0, hw_cr0); 3800 vcpu->arch.cr0 = cr0; 3801 3802 /* depends on vcpu->arch.cr0 to be set to a new value */ 3803 vmx->emulation_required = emulation_required(vcpu); 3804} 3805 3806static u64 construct_eptp(unsigned long root_hpa) 3807{ 3808 u64 eptp; 3809 3810 /* TODO write the value reading from MSR */ 3811 eptp = VMX_EPT_DEFAULT_MT | 3812 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3813 if (enable_ept_ad_bits) 3814 eptp |= VMX_EPT_AD_ENABLE_BIT; 3815 eptp |= (root_hpa & PAGE_MASK); 3816 3817 return eptp; 3818} 3819 3820static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 3821{ 3822 unsigned long guest_cr3; 3823 u64 eptp; 3824 3825 guest_cr3 = cr3; 3826 if (enable_ept) { 3827 eptp = construct_eptp(cr3); 3828 vmcs_write64(EPT_POINTER, eptp); 3829 if (is_paging(vcpu) || is_guest_mode(vcpu)) 3830 guest_cr3 = kvm_read_cr3(vcpu); 3831 else 3832 guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; 3833 ept_load_pdptrs(vcpu); 3834 } 3835 3836 vmx_flush_tlb(vcpu); 3837 vmcs_writel(GUEST_CR3, guest_cr3); 3838} 3839 3840static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3841{ 3842 /* 3843 * Pass through host's Machine Check Enable value to hw_cr4, which 3844 * is in force while we are in guest mode. Do not let guests control 3845 * this bit, even if host CR4.MCE == 0. 3846 */ 3847 unsigned long hw_cr4 = 3848 (cr4_read_shadow() & X86_CR4_MCE) | 3849 (cr4 & ~X86_CR4_MCE) | 3850 (to_vmx(vcpu)->rmode.vm86_active ? 3851 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3852 3853 if (cr4 & X86_CR4_VMXE) { 3854 /* 3855 * To use VMXON (and later other VMX instructions), a guest 3856 * must first be able to turn on cr4.VMXE (see handle_vmon()). 3857 * So basically the check on whether to allow nested VMX 3858 * is here. 3859 */ 3860 if (!nested_vmx_allowed(vcpu)) 3861 return 1; 3862 } 3863 if (to_vmx(vcpu)->nested.vmxon && 3864 ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) 3865 return 1; 3866 3867 vcpu->arch.cr4 = cr4; 3868 if (enable_ept) { 3869 if (!is_paging(vcpu)) { 3870 hw_cr4 &= ~X86_CR4_PAE; 3871 hw_cr4 |= X86_CR4_PSE; 3872 } else if (!(cr4 & X86_CR4_PAE)) { 3873 hw_cr4 &= ~X86_CR4_PAE; 3874 } 3875 } 3876 3877 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3878 /* 3879 * SMEP/SMAP is disabled if CPU is in non-paging mode in 3880 * hardware. However KVM always uses paging mode without 3881 * unrestricted guest. 3882 * To emulate this behavior, SMEP/SMAP needs to be manually 3883 * disabled when guest switches to non-paging mode. 
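 * (Only hw_cr4, the value written to GUEST_CR4 below, is masked here;
 * CR4_READ_SHADOW still carries the guest-requested value, so the
 * guest keeps seeing SMEP/SMAP as set when it reads CR4.)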
3884 */ 3885 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); 3886 3887 vmcs_writel(CR4_READ_SHADOW, cr4); 3888 vmcs_writel(GUEST_CR4, hw_cr4); 3889 return 0; 3890} 3891 3892static void vmx_get_segment(struct kvm_vcpu *vcpu, 3893 struct kvm_segment *var, int seg) 3894{ 3895 struct vcpu_vmx *vmx = to_vmx(vcpu); 3896 u32 ar; 3897 3898 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3899 *var = vmx->rmode.segs[seg]; 3900 if (seg == VCPU_SREG_TR 3901 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3902 return; 3903 var->base = vmx_read_guest_seg_base(vmx, seg); 3904 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3905 return; 3906 } 3907 var->base = vmx_read_guest_seg_base(vmx, seg); 3908 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3909 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3910 ar = vmx_read_guest_seg_ar(vmx, seg); 3911 var->unusable = (ar >> 16) & 1; 3912 var->type = ar & 15; 3913 var->s = (ar >> 4) & 1; 3914 var->dpl = (ar >> 5) & 3; 3915 /* 3916 * Some userspaces do not preserve unusable property. Since usable 3917 * segment has to be present according to VMX spec we can use present 3918 * property to amend userspace bug by making unusable segment always 3919 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3920 * segment as unusable. 3921 */ 3922 var->present = !var->unusable; 3923 var->avl = (ar >> 12) & 1; 3924 var->l = (ar >> 13) & 1; 3925 var->db = (ar >> 14) & 1; 3926 var->g = (ar >> 15) & 1; 3927} 3928 3929static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3930{ 3931 struct kvm_segment s; 3932 3933 if (to_vmx(vcpu)->rmode.vm86_active) { 3934 vmx_get_segment(vcpu, &s, seg); 3935 return s.base; 3936 } 3937 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3938} 3939 3940static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3941{ 3942 struct vcpu_vmx *vmx = to_vmx(vcpu); 3943 3944 if (unlikely(vmx->rmode.vm86_active)) 3945 return 0; 3946 else { 3947 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3948 return VMX_AR_DPL(ar); 3949 } 3950} 3951 3952static u32 vmx_segment_access_rights(struct kvm_segment *var) 3953{ 3954 u32 ar; 3955 3956 if (var->unusable || !var->present) 3957 ar = 1 << 16; 3958 else { 3959 ar = var->type & 15; 3960 ar |= (var->s & 1) << 4; 3961 ar |= (var->dpl & 3) << 5; 3962 ar |= (var->present & 1) << 7; 3963 ar |= (var->avl & 1) << 12; 3964 ar |= (var->l & 1) << 13; 3965 ar |= (var->db & 1) << 14; 3966 ar |= (var->g & 1) << 15; 3967 } 3968 3969 return ar; 3970} 3971 3972static void vmx_set_segment(struct kvm_vcpu *vcpu, 3973 struct kvm_segment *var, int seg) 3974{ 3975 struct vcpu_vmx *vmx = to_vmx(vcpu); 3976 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3977 3978 vmx_segment_cache_clear(vmx); 3979 3980 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3981 vmx->rmode.segs[seg] = *var; 3982 if (seg == VCPU_SREG_TR) 3983 vmcs_write16(sf->selector, var->selector); 3984 else if (var->s) 3985 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3986 goto out; 3987 } 3988 3989 vmcs_writel(sf->base, var->base); 3990 vmcs_write32(sf->limit, var->limit); 3991 vmcs_write16(sf->selector, var->selector); 3992 3993 /* 3994 * Fix the "Accessed" bit in AR field of segment registers for older 3995 * qemu binaries. 3996 * IA32 arch specifies that at the time of processor reset the 3997 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3998 * is setting it to 0 in the userland code. This causes invalid guest 3999 * state vmexit when "unrestricted guest" mode is turned on. 
4000 * Fix for this setup issue in cpu_reset is being pushed in the qemu 4001 * tree. Newer qemu binaries with that qemu fix would not need this 4002 * kvm hack. 4003 */ 4004 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 4005 var->type |= 0x1; /* Accessed */ 4006 4007 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 4008 4009out: 4010 vmx->emulation_required = emulation_required(vcpu); 4011} 4012 4013static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4014{ 4015 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 4016 4017 *db = (ar >> 14) & 1; 4018 *l = (ar >> 13) & 1; 4019} 4020 4021static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 4022{ 4023 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 4024 dt->address = vmcs_readl(GUEST_IDTR_BASE); 4025} 4026 4027static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 4028{ 4029 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 4030 vmcs_writel(GUEST_IDTR_BASE, dt->address); 4031} 4032 4033static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 4034{ 4035 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 4036 dt->address = vmcs_readl(GUEST_GDTR_BASE); 4037} 4038 4039static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 4040{ 4041 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 4042 vmcs_writel(GUEST_GDTR_BASE, dt->address); 4043} 4044 4045static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 4046{ 4047 struct kvm_segment var; 4048 u32 ar; 4049 4050 vmx_get_segment(vcpu, &var, seg); 4051 var.dpl = 0x3; 4052 if (seg == VCPU_SREG_CS) 4053 var.type = 0x3; 4054 ar = vmx_segment_access_rights(&var); 4055 4056 if (var.base != (var.selector << 4)) 4057 return false; 4058 if (var.limit != 0xffff) 4059 return false; 4060 if (ar != 0xf3) 4061 return false; 4062 4063 return true; 4064} 4065 4066static bool code_segment_valid(struct kvm_vcpu *vcpu) 4067{ 4068 struct kvm_segment cs; 4069 unsigned int cs_rpl; 4070 4071 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 4072 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 4073 4074 if (cs.unusable) 4075 return false; 4076 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 4077 return false; 4078 if (!cs.s) 4079 return false; 4080 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 4081 if (cs.dpl > cs_rpl) 4082 return false; 4083 } else { 4084 if (cs.dpl != cs_rpl) 4085 return false; 4086 } 4087 if (!cs.present) 4088 return false; 4089 4090 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 4091 return true; 4092} 4093 4094static bool stack_segment_valid(struct kvm_vcpu *vcpu) 4095{ 4096 struct kvm_segment ss; 4097 unsigned int ss_rpl; 4098 4099 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 4100 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 4101 4102 if (ss.unusable) 4103 return true; 4104 if (ss.type != 3 && ss.type != 7) 4105 return false; 4106 if (!ss.s) 4107 return false; 4108 if (ss.dpl != ss_rpl) /* DPL != RPL */ 4109 return false; 4110 if (!ss.present) 4111 return false; 4112 4113 return true; 4114} 4115 4116static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 4117{ 4118 struct kvm_segment var; 4119 unsigned int rpl; 4120 4121 vmx_get_segment(vcpu, &var, seg); 4122 rpl = var.selector & SEGMENT_RPL_MASK; 4123 4124 if (var.unusable) 4125 return true; 4126 if (!var.s) 4127 return false; 4128 if (!var.present) 4129 return false; 4130 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 4131 if (var.dpl < rpl) /* DPL < RPL */ 4132 return false; 4133 } 4134 4135 /* 
TODO: Add other members to kvm_segment_field to allow checking for other access 4136 * rights flags 4137 */ 4138 return true; 4139} 4140 4141static bool tr_valid(struct kvm_vcpu *vcpu) 4142{ 4143 struct kvm_segment tr; 4144 4145 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 4146 4147 if (tr.unusable) 4148 return false; 4149 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 4150 return false; 4151 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 4152 return false; 4153 if (!tr.present) 4154 return false; 4155 4156 return true; 4157} 4158 4159static bool ldtr_valid(struct kvm_vcpu *vcpu) 4160{ 4161 struct kvm_segment ldtr; 4162 4163 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 4164 4165 if (ldtr.unusable) 4166 return true; 4167 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 4168 return false; 4169 if (ldtr.type != 2) 4170 return false; 4171 if (!ldtr.present) 4172 return false; 4173 4174 return true; 4175} 4176 4177static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 4178{ 4179 struct kvm_segment cs, ss; 4180 4181 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 4182 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 4183 4184 return ((cs.selector & SEGMENT_RPL_MASK) == 4185 (ss.selector & SEGMENT_RPL_MASK)); 4186} 4187 4188/* 4189 * Check if guest state is valid. Returns true if valid, false if 4190 * not. 4191 * We assume that registers are always usable 4192 */ 4193static bool guest_state_valid(struct kvm_vcpu *vcpu) 4194{ 4195 if (enable_unrestricted_guest) 4196 return true; 4197 4198 /* real mode guest state checks */ 4199 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 4200 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 4201 return false; 4202 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 4203 return false; 4204 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 4205 return false; 4206 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 4207 return false; 4208 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 4209 return false; 4210 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 4211 return false; 4212 } else { 4213 /* protected mode guest state checks */ 4214 if (!cs_ss_rpl_check(vcpu)) 4215 return false; 4216 if (!code_segment_valid(vcpu)) 4217 return false; 4218 if (!stack_segment_valid(vcpu)) 4219 return false; 4220 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 4221 return false; 4222 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 4223 return false; 4224 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 4225 return false; 4226 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 4227 return false; 4228 if (!tr_valid(vcpu)) 4229 return false; 4230 if (!ldtr_valid(vcpu)) 4231 return false; 4232 } 4233 /* TODO: 4234 * - Add checks on RIP 4235 * - Add checks on RFLAGS 4236 */ 4237 4238 return true; 4239} 4240 4241static int init_rmode_tss(struct kvm *kvm) 4242{ 4243 gfn_t fn; 4244 u16 data = 0; 4245 int idx, r; 4246 4247 idx = srcu_read_lock(&kvm->srcu); 4248 fn = kvm->arch.tss_addr >> PAGE_SHIFT; 4249 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 4250 if (r < 0) 4251 goto out; 4252 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 4253 r = kvm_write_guest_page(kvm, fn++, &data, 4254 TSS_IOPB_BASE_OFFSET, sizeof(u16)); 4255 if (r < 0) 4256 goto out; 4257 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 4258 if (r < 0) 4259 goto out; 4260 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 4261 if (r < 0) 4262 goto out; 4263 data = ~0; 4264 r = kvm_write_guest_page(kvm, fn, &data, 4265 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, 4266 sizeof(u8)); 4267out: 4268 srcu_read_unlock(&kvm->srcu, idx); 4269 
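 /*
  * On success the guest pages backing the real-mode TSS now hold a
  * zeroed TSS whose I/O-bitmap base points past the interrupt
  * redirection map, terminated by a trailing 0xff byte; otherwise r
  * carries the first kvm_*_guest_page() error.
  */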
return r; 4270} 4271 4272static int init_rmode_identity_map(struct kvm *kvm) 4273{ 4274 int i, idx, r = 0; 4275 kvm_pfn_t identity_map_pfn; 4276 u32 tmp; 4277 4278 if (!enable_ept) 4279 return 0; 4280 4281 /* Protect kvm->arch.ept_identity_pagetable_done. */ 4282 mutex_lock(&kvm->slots_lock); 4283 4284 if (likely(kvm->arch.ept_identity_pagetable_done)) 4285 goto out2; 4286 4287 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 4288 4289 r = alloc_identity_pagetable(kvm); 4290 if (r < 0) 4291 goto out2; 4292 4293 idx = srcu_read_lock(&kvm->srcu); 4294 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 4295 if (r < 0) 4296 goto out; 4297 /* Set up identity-mapping pagetable for EPT in real mode */ 4298 for (i = 0; i < PT32_ENT_PER_PAGE; i++) { 4299 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 4300 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 4301 r = kvm_write_guest_page(kvm, identity_map_pfn, 4302 &tmp, i * sizeof(tmp), sizeof(tmp)); 4303 if (r < 0) 4304 goto out; 4305 } 4306 kvm->arch.ept_identity_pagetable_done = true; 4307 4308out: 4309 srcu_read_unlock(&kvm->srcu, idx); 4310 4311out2: 4312 mutex_unlock(&kvm->slots_lock); 4313 return r; 4314} 4315 4316static void seg_setup(int seg) 4317{ 4318 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 4319 unsigned int ar; 4320 4321 vmcs_write16(sf->selector, 0); 4322 vmcs_writel(sf->base, 0); 4323 vmcs_write32(sf->limit, 0xffff); 4324 ar = 0x93; 4325 if (seg == VCPU_SREG_CS) 4326 ar |= 0x08; /* code segment */ 4327 4328 vmcs_write32(sf->ar_bytes, ar); 4329} 4330 4331static int alloc_apic_access_page(struct kvm *kvm) 4332{ 4333 struct page *page; 4334 int r = 0; 4335 4336 mutex_lock(&kvm->slots_lock); 4337 if (kvm->arch.apic_access_page_done) 4338 goto out; 4339 r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 4340 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); 4341 if (r) 4342 goto out; 4343 4344 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 4345 if (is_error_page(page)) { 4346 r = -EFAULT; 4347 goto out; 4348 } 4349 4350 /* 4351 * Do not pin the page in memory, so that memory hot-unplug 4352 * is able to migrate it. 4353 */ 4354 put_page(page); 4355 kvm->arch.apic_access_page_done = true; 4356out: 4357 mutex_unlock(&kvm->slots_lock); 4358 return r; 4359} 4360 4361static int alloc_identity_pagetable(struct kvm *kvm) 4362{ 4363 /* Called with kvm->slots_lock held. */ 4364 4365 int r = 0; 4366 4367 BUG_ON(kvm->arch.ept_identity_pagetable_done); 4368 4369 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 4370 kvm->arch.ept_identity_map_addr, PAGE_SIZE); 4371 4372 return r; 4373} 4374 4375static int allocate_vpid(void) 4376{ 4377 int vpid; 4378 4379 if (!enable_vpid) 4380 return 0; 4381 spin_lock(&vmx_vpid_lock); 4382 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4383 if (vpid < VMX_NR_VPIDS) 4384 __set_bit(vpid, vmx_vpid_bitmap); 4385 else 4386 vpid = 0; 4387 spin_unlock(&vmx_vpid_lock); 4388 return vpid; 4389} 4390 4391static void free_vpid(int vpid) 4392{ 4393 if (!enable_vpid || vpid == 0) 4394 return; 4395 spin_lock(&vmx_vpid_lock); 4396 __clear_bit(vpid, vmx_vpid_bitmap); 4397 spin_unlock(&vmx_vpid_lock); 4398} 4399 4400#define MSR_TYPE_R 1 4401#define MSR_TYPE_W 2 4402static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, 4403 u32 msr, int type) 4404{ 4405 int f = sizeof(unsigned long); 4406 4407 if (!cpu_has_vmx_msr_bitmap()) 4408 return; 4409 4410 /* 4411 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). 
Early manuals 4412 * have the write-low and read-high bitmap offsets the wrong way round. 4413 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4414 */ 4415 if (msr <= 0x1fff) { 4416 if (type & MSR_TYPE_R) 4417 /* read-low */ 4418 __clear_bit(msr, msr_bitmap + 0x000 / f); 4419 4420 if (type & MSR_TYPE_W) 4421 /* write-low */ 4422 __clear_bit(msr, msr_bitmap + 0x800 / f); 4423 4424 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4425 msr &= 0x1fff; 4426 if (type & MSR_TYPE_R) 4427 /* read-high */ 4428 __clear_bit(msr, msr_bitmap + 0x400 / f); 4429 4430 if (type & MSR_TYPE_W) 4431 /* write-high */ 4432 __clear_bit(msr, msr_bitmap + 0xc00 / f); 4433 4434 } 4435} 4436 4437static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, 4438 u32 msr, int type) 4439{ 4440 int f = sizeof(unsigned long); 4441 4442 if (!cpu_has_vmx_msr_bitmap()) 4443 return; 4444 4445 /* 4446 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4447 * have the write-low and read-high bitmap offsets the wrong way round. 4448 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4449 */ 4450 if (msr <= 0x1fff) { 4451 if (type & MSR_TYPE_R) 4452 /* read-low */ 4453 __set_bit(msr, msr_bitmap + 0x000 / f); 4454 4455 if (type & MSR_TYPE_W) 4456 /* write-low */ 4457 __set_bit(msr, msr_bitmap + 0x800 / f); 4458 4459 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4460 msr &= 0x1fff; 4461 if (type & MSR_TYPE_R) 4462 /* read-high */ 4463 __set_bit(msr, msr_bitmap + 0x400 / f); 4464 4465 if (type & MSR_TYPE_W) 4466 /* write-high */ 4467 __set_bit(msr, msr_bitmap + 0xc00 / f); 4468 4469 } 4470} 4471 4472/* 4473 * If a msr is allowed by L0, we should check whether it is allowed by L1. 4474 * The corresponding bit will be cleared unless both of L0 and L1 allow it. 4475 */ 4476static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, 4477 unsigned long *msr_bitmap_nested, 4478 u32 msr, int type) 4479{ 4480 int f = sizeof(unsigned long); 4481 4482 if (!cpu_has_vmx_msr_bitmap()) { 4483 WARN_ON(1); 4484 return; 4485 } 4486 4487 /* 4488 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4489 * have the write-low and read-high bitmap offsets the wrong way round. 4490 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
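 * For illustration, the 4K bitmap page is laid out as four 1K regions:
 * 0x000 read-low, 0x400 read-high, 0x800 write-low, 0xc00 write-high.
 * E.g. a read of MSR 0xc0000080 (EFER) is governed by bit 0x80 of the
 * read-high region, i.e. bit 0x400 * 8 + 0x80 of the page.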
4491 */ 4492 if (msr <= 0x1fff) { 4493 if (type & MSR_TYPE_R && 4494 !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) 4495 /* read-low */ 4496 __clear_bit(msr, msr_bitmap_nested + 0x000 / f); 4497 4498 if (type & MSR_TYPE_W && 4499 !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) 4500 /* write-low */ 4501 __clear_bit(msr, msr_bitmap_nested + 0x800 / f); 4502 4503 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4504 msr &= 0x1fff; 4505 if (type & MSR_TYPE_R && 4506 !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) 4507 /* read-high */ 4508 __clear_bit(msr, msr_bitmap_nested + 0x400 / f); 4509 4510 if (type & MSR_TYPE_W && 4511 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) 4512 /* write-high */ 4513 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); 4514 4515 } 4516} 4517 4518static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 4519{ 4520 if (!longmode_only) 4521 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, 4522 msr, MSR_TYPE_R | MSR_TYPE_W); 4523 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, 4524 msr, MSR_TYPE_R | MSR_TYPE_W); 4525} 4526 4527static void vmx_enable_intercept_msr_read_x2apic(u32 msr) 4528{ 4529 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4530 msr, MSR_TYPE_R); 4531 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4532 msr, MSR_TYPE_R); 4533} 4534 4535static void vmx_disable_intercept_msr_read_x2apic(u32 msr) 4536{ 4537 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4538 msr, MSR_TYPE_R); 4539 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4540 msr, MSR_TYPE_R); 4541} 4542 4543static void vmx_disable_intercept_msr_write_x2apic(u32 msr) 4544{ 4545 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4546 msr, MSR_TYPE_W); 4547 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4548 msr, MSR_TYPE_W); 4549} 4550 4551static bool vmx_get_enable_apicv(void) 4552{ 4553 return enable_apicv; 4554} 4555 4556static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4557{ 4558 struct vcpu_vmx *vmx = to_vmx(vcpu); 4559 int max_irr; 4560 void *vapic_page; 4561 u16 status; 4562 4563 if (vmx->nested.pi_desc && 4564 vmx->nested.pi_pending) { 4565 vmx->nested.pi_pending = false; 4566 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4567 return 0; 4568 4569 max_irr = find_last_bit( 4570 (unsigned long *)vmx->nested.pi_desc->pir, 256); 4571 4572 if (max_irr == 256) 4573 return 0; 4574 4575 vapic_page = kmap(vmx->nested.virtual_apic_page); 4576 if (!vapic_page) { 4577 WARN_ON(1); 4578 return -ENOMEM; 4579 } 4580 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); 4581 kunmap(vmx->nested.virtual_apic_page); 4582 4583 status = vmcs_read16(GUEST_INTR_STATUS); 4584 if ((u8)max_irr > ((u8)status & 0xff)) { 4585 status &= ~0xff; 4586 status |= (u8)max_irr; 4587 vmcs_write16(GUEST_INTR_STATUS, status); 4588 } 4589 } 4590 return 0; 4591} 4592 4593static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) 4594{ 4595#ifdef CONFIG_SMP 4596 if (vcpu->mode == IN_GUEST_MODE) { 4597 struct vcpu_vmx *vmx = to_vmx(vcpu); 4598 4599 /* 4600 * Currently, we don't support urgent interrupt, 4601 * all interrupts are recognized as non-urgent 4602 * interrupt, so we cannot post interrupts when 4603 * 'SN' is set. 4604 * 4605 * If the vcpu is in guest mode, it means it is 4606 * running instead of being scheduled out and 4607 * waiting in the run queue, and that's the only 4608 * case when 'SN' is set currently, warning if 4609 * 'SN' is set. 
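 * This returns true only when the notification IPI was actually sent;
 * callers fall back to kvm_vcpu_kick() or to picking the interrupt up
 * from the PIR on the next vmentry when the vcpu is not in guest mode.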
4610 */ 4611 WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); 4612 4613 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), 4614 POSTED_INTR_VECTOR); 4615 return true; 4616 } 4617#endif 4618 return false; 4619} 4620 4621static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4622 int vector) 4623{ 4624 struct vcpu_vmx *vmx = to_vmx(vcpu); 4625 4626 if (is_guest_mode(vcpu) && 4627 vector == vmx->nested.posted_intr_nv) { 4628 /* the PIR and ON have been set by L1. */ 4629 kvm_vcpu_trigger_posted_interrupt(vcpu); 4630 /* 4631 * If a posted intr is not recognized by hardware, 4632 * we will accomplish it in the next vmentry. 4633 */ 4634 vmx->nested.pi_pending = true; 4635 kvm_make_request(KVM_REQ_EVENT, vcpu); 4636 return 0; 4637 } 4638 return -1; 4639} 4640/* 4641 * Send interrupt to vcpu via posted interrupt way. 4642 * 1. If target vcpu is running(non-root mode), send posted interrupt 4643 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4644 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4645 * interrupt from PIR in next vmentry. 4646 */ 4647static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4648{ 4649 struct vcpu_vmx *vmx = to_vmx(vcpu); 4650 int r; 4651 4652 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4653 if (!r) 4654 return; 4655 4656 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4657 return; 4658 4659 r = pi_test_and_set_on(&vmx->pi_desc); 4660 kvm_make_request(KVM_REQ_EVENT, vcpu); 4661 if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) 4662 kvm_vcpu_kick(vcpu); 4663} 4664 4665static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 4666{ 4667 struct vcpu_vmx *vmx = to_vmx(vcpu); 4668 4669 if (!pi_test_and_clear_on(&vmx->pi_desc)) 4670 return; 4671 4672 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); 4673} 4674 4675/* 4676 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4677 * will not change in the lifetime of the guest. 4678 * Note that host-state that does change is set elsewhere. E.g., host-state 4679 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4680 */ 4681static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4682{ 4683 u32 low32, high32; 4684 unsigned long tmpl; 4685 struct desc_ptr dt; 4686 unsigned long cr4; 4687 4688 vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ 4689 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 4690 4691 /* Save the most likely value for this task's CR4 in the VMCS. */ 4692 cr4 = cr4_read_shadow(); 4693 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4694 vmx->host_state.vmcs_host_cr4 = cr4; 4695 4696 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4697#ifdef CONFIG_X86_64 4698 /* 4699 * Load null selectors, so we can avoid reloading them in 4700 * __vmx_load_host_state(), in case userspace uses the null selectors 4701 * too (the expected case). 
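 * FS.base and GS.base are unaffected by this: they are reloaded from
 * HOST_FS_BASE/HOST_GS_BASE on every VM-exit regardless of the
 * selector values written here.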
4702 */ 4703 vmcs_write16(HOST_DS_SELECTOR, 0); 4704 vmcs_write16(HOST_ES_SELECTOR, 0); 4705#else 4706 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4707 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4708#endif 4709 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4710 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4711 4712 native_store_idt(&dt); 4713 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 4714 vmx->host_idt_base = dt.address; 4715 4716 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ 4717 4718 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4719 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4720 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4721 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4722 4723 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4724 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4725 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4726 } 4727} 4728 4729static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4730{ 4731 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 4732 if (enable_ept) 4733 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 4734 if (is_guest_mode(&vmx->vcpu)) 4735 vmx->vcpu.arch.cr4_guest_owned_bits &= 4736 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; 4737 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 4738} 4739 4740static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4741{ 4742 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4743 4744 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4745 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4746 return pin_based_exec_ctrl; 4747} 4748 4749static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4750{ 4751 struct vcpu_vmx *vmx = to_vmx(vcpu); 4752 4753 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); 4754} 4755 4756static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4757{ 4758 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4759 4760 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4761 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4762 4763 if (!cpu_need_tpr_shadow(&vmx->vcpu)) { 4764 exec_control &= ~CPU_BASED_TPR_SHADOW; 4765#ifdef CONFIG_X86_64 4766 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4767 CPU_BASED_CR8_LOAD_EXITING; 4768#endif 4769 } 4770 if (!enable_ept) 4771 exec_control |= CPU_BASED_CR3_STORE_EXITING | 4772 CPU_BASED_CR3_LOAD_EXITING | 4773 CPU_BASED_INVLPG_EXITING; 4774 return exec_control; 4775} 4776 4777static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4778{ 4779 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4780 if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) 4781 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4782 if (vmx->vpid == 0) 4783 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4784 if (!enable_ept) { 4785 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4786 enable_unrestricted_guest = 0; 4787 /* Enable INVPCID for non-ept guests may cause performance regression. */ 4788 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 4789 } 4790 if (!enable_unrestricted_guest) 4791 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4792 if (!ple_gap) 4793 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4794 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4795 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4796 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4797 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4798 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4799 (handle_vmptrld). 
4800 We can NOT enable shadow_vmcs here because we don't have yet 4801 a current VMCS12 4802 */ 4803 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4804 4805 if (!enable_pml) 4806 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4807 4808 /* Currently, we allow L1 guest to directly run pcommit instruction. */ 4809 exec_control &= ~SECONDARY_EXEC_PCOMMIT; 4810 4811 return exec_control; 4812} 4813 4814static void ept_set_mmio_spte_mask(void) 4815{ 4816 /* 4817 * EPT Misconfigurations can be generated if the value of bits 2:0 4818 * of an EPT paging-structure entry is 110b (write/execute). 4819 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio 4820 * spte. 4821 */ 4822 kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); 4823} 4824 4825#define VMX_XSS_EXIT_BITMAP 0 4826/* 4827 * Sets up the vmcs for emulated real mode. 4828 */ 4829static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 4830{ 4831#ifdef CONFIG_X86_64 4832 unsigned long a; 4833#endif 4834 int i; 4835 4836 /* I/O */ 4837 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 4838 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); 4839 4840 if (enable_shadow_vmcs) { 4841 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 4842 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 4843 } 4844 if (cpu_has_vmx_msr_bitmap()) 4845 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); 4846 4847 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 4848 4849 /* Control */ 4850 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); 4851 4852 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4853 4854 if (cpu_has_secondary_exec_ctrls()) 4855 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 4856 vmx_secondary_exec_control(vmx)); 4857 4858 if (kvm_vcpu_apicv_active(&vmx->vcpu)) { 4859 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4860 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4861 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4862 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4863 4864 vmcs_write16(GUEST_INTR_STATUS, 0); 4865 4866 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4867 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4868 } 4869 4870 if (ple_gap) { 4871 vmcs_write32(PLE_GAP, ple_gap); 4872 vmx->ple_window = ple_window; 4873 vmx->ple_window_dirty = true; 4874 } 4875 4876 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4877 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4878 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4879 4880 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4881 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4882 vmx_set_constant_host_state(vmx); 4883#ifdef CONFIG_X86_64 4884 rdmsrl(MSR_FS_BASE, a); 4885 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 4886 rdmsrl(MSR_GS_BASE, a); 4887 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ 4888#else 4889 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4890 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4891#endif 4892 4893 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4894 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4895 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); 4896 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4897 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); 4898 4899 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4900 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4901 4902 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { 4903 u32 index = vmx_msr_index[i]; 4904 u32 data_low, data_high; 4905 int j = vmx->nmsrs; 4906 4907 if (rdmsr_safe(index, &data_low, &data_high) < 0) 4908 continue; 4909 if (wrmsr_safe(index, data_low, data_high) < 0) 
4910 continue; 4911 vmx->guest_msrs[j].index = i; 4912 vmx->guest_msrs[j].data = 0; 4913 vmx->guest_msrs[j].mask = -1ull; 4914 ++vmx->nmsrs; 4915 } 4916 4917 4918 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); 4919 4920 /* 22.2.1, 20.8.1 */ 4921 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); 4922 4923 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 4924 set_cr4_guest_host_mask(vmx); 4925 4926 if (vmx_xsaves_supported()) 4927 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4928 4929 return 0; 4930} 4931 4932static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4933{ 4934 struct vcpu_vmx *vmx = to_vmx(vcpu); 4935 struct msr_data apic_base_msr; 4936 u64 cr0; 4937 4938 vmx->rmode.vm86_active = 0; 4939 4940 vmx->soft_vnmi_blocked = 0; 4941 4942 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4943 kvm_set_cr8(vcpu, 0); 4944 4945 if (!init_event) { 4946 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | 4947 MSR_IA32_APICBASE_ENABLE; 4948 if (kvm_vcpu_is_reset_bsp(vcpu)) 4949 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4950 apic_base_msr.host_initiated = true; 4951 kvm_set_apic_base(vcpu, &apic_base_msr); 4952 } 4953 4954 vmx_segment_cache_clear(vmx); 4955 4956 seg_setup(VCPU_SREG_CS); 4957 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4958 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4959 4960 seg_setup(VCPU_SREG_DS); 4961 seg_setup(VCPU_SREG_ES); 4962 seg_setup(VCPU_SREG_FS); 4963 seg_setup(VCPU_SREG_GS); 4964 seg_setup(VCPU_SREG_SS); 4965 4966 vmcs_write16(GUEST_TR_SELECTOR, 0); 4967 vmcs_writel(GUEST_TR_BASE, 0); 4968 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4969 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4970 4971 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4972 vmcs_writel(GUEST_LDTR_BASE, 0); 4973 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4974 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4975 4976 if (!init_event) { 4977 vmcs_write32(GUEST_SYSENTER_CS, 0); 4978 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4979 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4980 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4981 } 4982 4983 vmcs_writel(GUEST_RFLAGS, 0x02); 4984 kvm_rip_write(vcpu, 0xfff0); 4985 4986 vmcs_writel(GUEST_GDTR_BASE, 0); 4987 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4988 4989 vmcs_writel(GUEST_IDTR_BASE, 0); 4990 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4991 4992 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4993 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4994 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4995 4996 setup_msrs(vmx); 4997 4998 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4999 5000 if (cpu_has_vmx_tpr_shadow() && !init_event) { 5001 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 5002 if (cpu_need_tpr_shadow(vcpu)) 5003 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 5004 __pa(vcpu->arch.apic->regs)); 5005 vmcs_write32(TPR_THRESHOLD, 0); 5006 } 5007 5008 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5009 5010 if (kvm_vcpu_apicv_active(vcpu)) 5011 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); 5012 5013 if (vmx->vpid != 0) 5014 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 5015 5016 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 5017 vmx_set_cr0(vcpu, cr0); /* enter rmode */ 5018 vmx->vcpu.arch.cr0 = cr0; 5019 vmx_set_cr4(vcpu, 0); 5020 vmx_set_efer(vcpu, 0); 5021 vmx_fpu_activate(vcpu); 5022 update_exception_bitmap(vcpu); 5023 5024 vpid_sync_context(vmx->vpid); 5025} 5026 5027/* 5028 * In nested virtualization, check if L1 asked to exit on external interrupts. 5029 * For most existing hypervisors, this will always return true. 
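 * (KVM itself, when acting as such an L1, always sets
 * PIN_BASED_EXT_INTR_MASK, for example.)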
5030 */ 5031static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) 5032{ 5033 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 5034 PIN_BASED_EXT_INTR_MASK; 5035} 5036 5037/* 5038 * In nested virtualization, check if L1 has set 5039 * VM_EXIT_ACK_INTR_ON_EXIT 5040 */ 5041static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 5042{ 5043 return get_vmcs12(vcpu)->vm_exit_controls & 5044 VM_EXIT_ACK_INTR_ON_EXIT; 5045} 5046 5047static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 5048{ 5049 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 5050 PIN_BASED_NMI_EXITING; 5051} 5052 5053static void enable_irq_window(struct kvm_vcpu *vcpu) 5054{ 5055 u32 cpu_based_vm_exec_control; 5056 5057 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5058 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 5059 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5060} 5061 5062static void enable_nmi_window(struct kvm_vcpu *vcpu) 5063{ 5064 u32 cpu_based_vm_exec_control; 5065 5066 if (!cpu_has_virtual_nmis() || 5067 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 5068 enable_irq_window(vcpu); 5069 return; 5070 } 5071 5072 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5073 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 5074 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5075} 5076 5077static void vmx_inject_irq(struct kvm_vcpu *vcpu) 5078{ 5079 struct vcpu_vmx *vmx = to_vmx(vcpu); 5080 uint32_t intr; 5081 int irq = vcpu->arch.interrupt.nr; 5082 5083 trace_kvm_inj_virq(irq); 5084 5085 ++vcpu->stat.irq_injections; 5086 if (vmx->rmode.vm86_active) { 5087 int inc_eip = 0; 5088 if (vcpu->arch.interrupt.soft) 5089 inc_eip = vcpu->arch.event_exit_inst_len; 5090 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) 5091 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5092 return; 5093 } 5094 intr = irq | INTR_INFO_VALID_MASK; 5095 if (vcpu->arch.interrupt.soft) { 5096 intr |= INTR_TYPE_SOFT_INTR; 5097 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 5098 vmx->vcpu.arch.event_exit_inst_len); 5099 } else 5100 intr |= INTR_TYPE_EXT_INTR; 5101 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 5102} 5103 5104static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 5105{ 5106 struct vcpu_vmx *vmx = to_vmx(vcpu); 5107 5108 if (is_guest_mode(vcpu)) 5109 return; 5110 5111 if (!cpu_has_virtual_nmis()) { 5112 /* 5113 * Tracking the NMI-blocked state in software is built upon 5114 * finding the next open IRQ window. This, in turn, depends on 5115 * well-behaving guests: They have to keep IRQs disabled at 5116 * least as long as the NMI handler runs. Otherwise we may 5117 * cause NMI nesting, maybe breaking the guest. But as this is 5118 * highly unlikely, we can live with the residual risk. 
5119 */ 5120 vmx->soft_vnmi_blocked = 1; 5121 vmx->vnmi_blocked_time = 0; 5122 } 5123 5124 ++vcpu->stat.nmi_injections; 5125 vmx->nmi_known_unmasked = false; 5126 if (vmx->rmode.vm86_active) { 5127 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) 5128 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5129 return; 5130 } 5131 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 5132 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 5133} 5134 5135static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 5136{ 5137 if (!cpu_has_virtual_nmis()) 5138 return to_vmx(vcpu)->soft_vnmi_blocked; 5139 if (to_vmx(vcpu)->nmi_known_unmasked) 5140 return false; 5141 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 5142} 5143 5144static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5145{ 5146 struct vcpu_vmx *vmx = to_vmx(vcpu); 5147 5148 if (!cpu_has_virtual_nmis()) { 5149 if (vmx->soft_vnmi_blocked != masked) { 5150 vmx->soft_vnmi_blocked = masked; 5151 vmx->vnmi_blocked_time = 0; 5152 } 5153 } else { 5154 vmx->nmi_known_unmasked = !masked; 5155 if (masked) 5156 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5157 GUEST_INTR_STATE_NMI); 5158 else 5159 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5160 GUEST_INTR_STATE_NMI); 5161 } 5162} 5163 5164static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 5165{ 5166 if (to_vmx(vcpu)->nested.nested_run_pending) 5167 return 0; 5168 5169 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 5170 return 0; 5171 5172 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5173 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI 5174 | GUEST_INTR_STATE_NMI)); 5175} 5176 5177static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 5178{ 5179 return (!to_vmx(vcpu)->nested.nested_run_pending && 5180 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 5181 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5182 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5183} 5184 5185static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5186{ 5187 int ret; 5188 5189 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5190 PAGE_SIZE * 3); 5191 if (ret) 5192 return ret; 5193 kvm->arch.tss_addr = addr; 5194 return init_rmode_tss(kvm); 5195} 5196 5197static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5198{ 5199 switch (vec) { 5200 case BP_VECTOR: 5201 /* 5202 * Update instruction length as we may reinject the exception 5203 * from user space while in guest debugging mode. 5204 */ 5205 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5206 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5207 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5208 return false; 5209 /* fall through */ 5210 case DB_VECTOR: 5211 if (vcpu->guest_debug & 5212 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5213 return false; 5214 /* fall through */ 5215 case DE_VECTOR: 5216 case OF_VECTOR: 5217 case BR_VECTOR: 5218 case UD_VECTOR: 5219 case DF_VECTOR: 5220 case SS_VECTOR: 5221 case GP_VECTOR: 5222 case MF_VECTOR: 5223 return true; 5224 break; 5225 } 5226 return false; 5227} 5228 5229static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5230 int vec, u32 err_code) 5231{ 5232 /* 5233 * Instruction with address size override prefix opcode 0x67 5234 * Cause the #SS fault with 0 error code in VM86 mode. 
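 * Such a fault is an artifact of running the real-mode guest under
 * vm86, so the instruction is re-executed through the emulator below
 * instead of reflecting the exception into the guest.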
5235 */ 5236 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5237 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { 5238 if (vcpu->arch.halt_request) { 5239 vcpu->arch.halt_request = 0; 5240 return kvm_vcpu_halt(vcpu); 5241 } 5242 return 1; 5243 } 5244 return 0; 5245 } 5246 5247 /* 5248 * Forward all other exceptions that are valid in real mode. 5249 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5250 * the required debugging infrastructure rework. 5251 */ 5252 kvm_queue_exception(vcpu, vec); 5253 return 1; 5254} 5255 5256/* 5257 * Trigger machine check on the host. We assume all the MSRs are already set up 5258 * by the CPU and that we still run on the same CPU as the MCE occurred on. 5259 * We pass a fake environment to the machine check handler because we want 5260 * the guest to be always treated like user space, no matter what context 5261 * it used internally. 5262 */ 5263static void kvm_machine_check(void) 5264{ 5265#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) 5266 struct pt_regs regs = { 5267 .cs = 3, /* Fake ring 3 no matter what the guest ran on */ 5268 .flags = X86_EFLAGS_IF, 5269 }; 5270 5271 do_machine_check(&regs, 0); 5272#endif 5273} 5274 5275static int handle_machine_check(struct kvm_vcpu *vcpu) 5276{ 5277 /* already handled by vcpu_run */ 5278 return 1; 5279} 5280 5281static int handle_exception(struct kvm_vcpu *vcpu) 5282{ 5283 struct vcpu_vmx *vmx = to_vmx(vcpu); 5284 struct kvm_run *kvm_run = vcpu->run; 5285 u32 intr_info, ex_no, error_code; 5286 unsigned long cr2, rip, dr6; 5287 u32 vect_info; 5288 enum emulation_result er; 5289 5290 vect_info = vmx->idt_vectoring_info; 5291 intr_info = vmx->exit_intr_info; 5292 5293 if (is_machine_check(intr_info)) 5294 return handle_machine_check(vcpu); 5295 5296 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 5297 return 1; /* already handled by vmx_vcpu_run() */ 5298 5299 if (is_no_device(intr_info)) { 5300 vmx_fpu_activate(vcpu); 5301 return 1; 5302 } 5303 5304 if (is_invalid_opcode(intr_info)) { 5305 if (is_guest_mode(vcpu)) { 5306 kvm_queue_exception(vcpu, UD_VECTOR); 5307 return 1; 5308 } 5309 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); 5310 if (er != EMULATE_DONE) 5311 kvm_queue_exception(vcpu, UD_VECTOR); 5312 return 1; 5313 } 5314 5315 error_code = 0; 5316 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5317 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5318 5319 /* 5320 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5321 * MMIO, it is better to report an internal error. 5322 * See the comments in vmx_handle_exit. 
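 * The check below therefore only lets an ordinary #PF (PFEC.RSVD
 * clear) through while an event was being delivered; any other
 * simultaneous exception is reported to userspace as
 * KVM_INTERNAL_ERROR_SIMUL_EX.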
5323 */ 5324 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5325 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5326 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5327 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5328 vcpu->run->internal.ndata = 3; 5329 vcpu->run->internal.data[0] = vect_info; 5330 vcpu->run->internal.data[1] = intr_info; 5331 vcpu->run->internal.data[2] = error_code; 5332 return 0; 5333 } 5334 5335 if (is_page_fault(intr_info)) { 5336 /* EPT won't cause page fault directly */ 5337 BUG_ON(enable_ept); 5338 cr2 = vmcs_readl(EXIT_QUALIFICATION); 5339 trace_kvm_page_fault(cr2, error_code); 5340 5341 if (kvm_event_needs_reinjection(vcpu)) 5342 kvm_mmu_unprotect_page_virt(vcpu, cr2); 5343 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 5344 } 5345 5346 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5347 5348 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5349 return handle_rmode_exception(vcpu, ex_no, error_code); 5350 5351 switch (ex_no) { 5352 case AC_VECTOR: 5353 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5354 return 1; 5355 case DB_VECTOR: 5356 dr6 = vmcs_readl(EXIT_QUALIFICATION); 5357 if (!(vcpu->guest_debug & 5358 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5359 vcpu->arch.dr6 &= ~15; 5360 vcpu->arch.dr6 |= dr6 | DR6_RTM; 5361 if (!(dr6 & ~DR6_RESERVED)) /* icebp */ 5362 skip_emulated_instruction(vcpu); 5363 5364 kvm_queue_exception(vcpu, DB_VECTOR); 5365 return 1; 5366 } 5367 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; 5368 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5369 /* fall through */ 5370 case BP_VECTOR: 5371 /* 5372 * Update instruction length as we may reinject #BP from 5373 * user space while in guest debugging mode. Reading it for 5374 * #DB as well causes no harm, it is not used in that case. 
5375 */ 5376 vmx->vcpu.arch.event_exit_inst_len = 5377 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5378 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5379 rip = kvm_rip_read(vcpu); 5380 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 5381 kvm_run->debug.arch.exception = ex_no; 5382 break; 5383 default: 5384 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5385 kvm_run->ex.exception = ex_no; 5386 kvm_run->ex.error_code = error_code; 5387 break; 5388 } 5389 return 0; 5390} 5391 5392static int handle_external_interrupt(struct kvm_vcpu *vcpu) 5393{ 5394 ++vcpu->stat.irq_exits; 5395 return 1; 5396} 5397 5398static int handle_triple_fault(struct kvm_vcpu *vcpu) 5399{ 5400 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5401 return 0; 5402} 5403 5404static int handle_io(struct kvm_vcpu *vcpu) 5405{ 5406 unsigned long exit_qualification; 5407 int size, in, string; 5408 unsigned port; 5409 5410 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5411 string = (exit_qualification & 16) != 0; 5412 in = (exit_qualification & 8) != 0; 5413 5414 ++vcpu->stat.io_exits; 5415 5416 if (string || in) 5417 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5418 5419 port = exit_qualification >> 16; 5420 size = (exit_qualification & 7) + 1; 5421 skip_emulated_instruction(vcpu); 5422 5423 return kvm_fast_pio_out(vcpu, size, port); 5424} 5425 5426static void 5427vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5428{ 5429 /* 5430 * Patch in the VMCALL instruction: 5431 */ 5432 hypercall[0] = 0x0f; 5433 hypercall[1] = 0x01; 5434 hypercall[2] = 0xc1; 5435} 5436 5437static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) 5438{ 5439 unsigned long always_on = VMXON_CR0_ALWAYSON; 5440 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5441 5442 if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & 5443 SECONDARY_EXEC_UNRESTRICTED_GUEST && 5444 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 5445 always_on &= ~(X86_CR0_PE | X86_CR0_PG); 5446 return (val & always_on) == always_on; 5447} 5448 5449/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5450static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5451{ 5452 if (is_guest_mode(vcpu)) { 5453 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5454 unsigned long orig_val = val; 5455 5456 /* 5457 * We get here when L2 changed cr0 in a way that did not change 5458 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5459 * but did change L0 shadowed bits. So we first calculate the 5460 * effective cr0 value that L1 would like to write into the 5461 * hardware. It consists of the L2-owned bits from the new 5462 * value combined with the L1-owned bits from L1's guest_cr0. 
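 * For illustration: if L1 owns only CR0.TS (cr0_guest_host_mask ==
 * X86_CR0_TS), the value handed to kvm_set_cr0() below takes every bit
 * except TS from L2's new value and the TS bit from vmcs12->guest_cr0,
 * while CR0_READ_SHADOW keeps the value L2 actually wrote.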
5463 */ 5464 val = (val & ~vmcs12->cr0_guest_host_mask) | 5465 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5466 5467 if (!nested_cr0_valid(vcpu, val)) 5468 return 1; 5469 5470 if (kvm_set_cr0(vcpu, val)) 5471 return 1; 5472 vmcs_writel(CR0_READ_SHADOW, orig_val); 5473 return 0; 5474 } else { 5475 if (to_vmx(vcpu)->nested.vmxon && 5476 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) 5477 return 1; 5478 return kvm_set_cr0(vcpu, val); 5479 } 5480} 5481 5482static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5483{ 5484 if (is_guest_mode(vcpu)) { 5485 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5486 unsigned long orig_val = val; 5487 5488 /* analogously to handle_set_cr0 */ 5489 val = (val & ~vmcs12->cr4_guest_host_mask) | 5490 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5491 if (kvm_set_cr4(vcpu, val)) 5492 return 1; 5493 vmcs_writel(CR4_READ_SHADOW, orig_val); 5494 return 0; 5495 } else 5496 return kvm_set_cr4(vcpu, val); 5497} 5498 5499/* called to set cr0 as approriate for clts instruction exit. */ 5500static void handle_clts(struct kvm_vcpu *vcpu) 5501{ 5502 if (is_guest_mode(vcpu)) { 5503 /* 5504 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS 5505 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, 5506 * just pretend it's off (also in arch.cr0 for fpu_activate). 5507 */ 5508 vmcs_writel(CR0_READ_SHADOW, 5509 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); 5510 vcpu->arch.cr0 &= ~X86_CR0_TS; 5511 } else 5512 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 5513} 5514 5515static int handle_cr(struct kvm_vcpu *vcpu) 5516{ 5517 unsigned long exit_qualification, val; 5518 int cr; 5519 int reg; 5520 int err; 5521 5522 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5523 cr = exit_qualification & 15; 5524 reg = (exit_qualification >> 8) & 15; 5525 switch ((exit_qualification >> 4) & 3) { 5526 case 0: /* mov to cr */ 5527 val = kvm_register_readl(vcpu, reg); 5528 trace_kvm_cr_write(cr, val); 5529 switch (cr) { 5530 case 0: 5531 err = handle_set_cr0(vcpu, val); 5532 kvm_complete_insn_gp(vcpu, err); 5533 return 1; 5534 case 3: 5535 err = kvm_set_cr3(vcpu, val); 5536 kvm_complete_insn_gp(vcpu, err); 5537 return 1; 5538 case 4: 5539 err = handle_set_cr4(vcpu, val); 5540 kvm_complete_insn_gp(vcpu, err); 5541 return 1; 5542 case 8: { 5543 u8 cr8_prev = kvm_get_cr8(vcpu); 5544 u8 cr8 = (u8)val; 5545 err = kvm_set_cr8(vcpu, cr8); 5546 kvm_complete_insn_gp(vcpu, err); 5547 if (lapic_in_kernel(vcpu)) 5548 return 1; 5549 if (cr8_prev <= cr8) 5550 return 1; 5551 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5552 return 0; 5553 } 5554 } 5555 break; 5556 case 2: /* clts */ 5557 handle_clts(vcpu); 5558 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 5559 skip_emulated_instruction(vcpu); 5560 vmx_fpu_activate(vcpu); 5561 return 1; 5562 case 1: /*mov from cr*/ 5563 switch (cr) { 5564 case 3: 5565 val = kvm_read_cr3(vcpu); 5566 kvm_register_write(vcpu, reg, val); 5567 trace_kvm_cr_read(cr, val); 5568 skip_emulated_instruction(vcpu); 5569 return 1; 5570 case 8: 5571 val = kvm_get_cr8(vcpu); 5572 kvm_register_write(vcpu, reg, val); 5573 trace_kvm_cr_read(cr, val); 5574 skip_emulated_instruction(vcpu); 5575 return 1; 5576 } 5577 break; 5578 case 3: /* lmsw */ 5579 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5580 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); 5581 kvm_lmsw(vcpu, val); 5582 5583 skip_emulated_instruction(vcpu); 5584 return 1; 5585 default: 5586 break; 5587 } 5588 vcpu->run->exit_reason = 0; 5589 vcpu_unimpl(vcpu, 
"unhandled control register: op %d cr %d\n", 5590 (int)(exit_qualification >> 4) & 3, cr); 5591 return 0; 5592} 5593 5594static int handle_dr(struct kvm_vcpu *vcpu) 5595{ 5596 unsigned long exit_qualification; 5597 int dr, dr7, reg; 5598 5599 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5600 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5601 5602 /* First, if DR does not exist, trigger UD */ 5603 if (!kvm_require_dr(vcpu, dr)) 5604 return 1; 5605 5606 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 5607 if (!kvm_require_cpl(vcpu, 0)) 5608 return 1; 5609 dr7 = vmcs_readl(GUEST_DR7); 5610 if (dr7 & DR7_GD) { 5611 /* 5612 * As the vm-exit takes precedence over the debug trap, we 5613 * need to emulate the latter, either for the host or the 5614 * guest debugging itself. 5615 */ 5616 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5617 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; 5618 vcpu->run->debug.arch.dr7 = dr7; 5619 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5620 vcpu->run->debug.arch.exception = DB_VECTOR; 5621 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5622 return 0; 5623 } else { 5624 vcpu->arch.dr6 &= ~15; 5625 vcpu->arch.dr6 |= DR6_BD | DR6_RTM; 5626 kvm_queue_exception(vcpu, DB_VECTOR); 5627 return 1; 5628 } 5629 } 5630 5631 if (vcpu->guest_debug == 0) { 5632 u32 cpu_based_vm_exec_control; 5633 5634 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5635 cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; 5636 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5637 5638 /* 5639 * No more DR vmexits; force a reload of the debug registers 5640 * and reenter on this instruction. The next vmexit will 5641 * retrieve the full state of the debug registers. 5642 */ 5643 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5644 return 1; 5645 } 5646 5647 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5648 if (exit_qualification & TYPE_MOV_FROM_DR) { 5649 unsigned long val; 5650 5651 if (kvm_get_dr(vcpu, dr, &val)) 5652 return 1; 5653 kvm_register_write(vcpu, reg, val); 5654 } else 5655 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) 5656 return 1; 5657 5658 skip_emulated_instruction(vcpu); 5659 return 1; 5660} 5661 5662static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) 5663{ 5664 return vcpu->arch.dr6; 5665} 5666 5667static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5668{ 5669} 5670 5671static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5672{ 5673 u32 cpu_based_vm_exec_control; 5674 5675 get_debugreg(vcpu->arch.db[0], 0); 5676 get_debugreg(vcpu->arch.db[1], 1); 5677 get_debugreg(vcpu->arch.db[2], 2); 5678 get_debugreg(vcpu->arch.db[3], 3); 5679 get_debugreg(vcpu->arch.dr6, 6); 5680 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5681 5682 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5683 5684 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5685 cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; 5686 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5687} 5688 5689static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5690{ 5691 vmcs_writel(GUEST_DR7, val); 5692} 5693 5694static int handle_cpuid(struct kvm_vcpu *vcpu) 5695{ 5696 kvm_emulate_cpuid(vcpu); 5697 return 1; 5698} 5699 5700static int handle_rdmsr(struct kvm_vcpu *vcpu) 5701{ 5702 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5703 struct msr_data msr_info; 5704 5705 msr_info.index = ecx; 5706 msr_info.host_initiated = false; 5707 if (vmx_get_msr(vcpu, &msr_info)) { 5708 
trace_kvm_msr_read_ex(ecx); 5709 kvm_inject_gp(vcpu, 0); 5710 return 1; 5711 } 5712 5713 trace_kvm_msr_read(ecx, msr_info.data); 5714 5715 /* FIXME: handling of bits 32:63 of rax, rdx */ 5716 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; 5717 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; 5718 skip_emulated_instruction(vcpu); 5719 return 1; 5720} 5721 5722static int handle_wrmsr(struct kvm_vcpu *vcpu) 5723{ 5724 struct msr_data msr; 5725 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5726 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 5727 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 5728 5729 msr.data = data; 5730 msr.index = ecx; 5731 msr.host_initiated = false; 5732 if (kvm_set_msr(vcpu, &msr) != 0) { 5733 trace_kvm_msr_write_ex(ecx, data); 5734 kvm_inject_gp(vcpu, 0); 5735 return 1; 5736 } 5737 5738 trace_kvm_msr_write(ecx, data); 5739 skip_emulated_instruction(vcpu); 5740 return 1; 5741} 5742 5743static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5744{ 5745 kvm_make_request(KVM_REQ_EVENT, vcpu); 5746 return 1; 5747} 5748 5749static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5750{ 5751 u32 cpu_based_vm_exec_control; 5752 5753 /* clear pending irq */ 5754 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5755 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 5756 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5757 5758 kvm_make_request(KVM_REQ_EVENT, vcpu); 5759 5760 ++vcpu->stat.irq_window_exits; 5761 return 1; 5762} 5763 5764static int handle_halt(struct kvm_vcpu *vcpu) 5765{ 5766 return kvm_emulate_halt(vcpu); 5767} 5768 5769static int handle_vmcall(struct kvm_vcpu *vcpu) 5770{ 5771 kvm_emulate_hypercall(vcpu); 5772 return 1; 5773} 5774 5775static int handle_invd(struct kvm_vcpu *vcpu) 5776{ 5777 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5778} 5779 5780static int handle_invlpg(struct kvm_vcpu *vcpu) 5781{ 5782 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5783 5784 kvm_mmu_invlpg(vcpu, exit_qualification); 5785 skip_emulated_instruction(vcpu); 5786 return 1; 5787} 5788 5789static int handle_rdpmc(struct kvm_vcpu *vcpu) 5790{ 5791 int err; 5792 5793 err = kvm_rdpmc(vcpu); 5794 kvm_complete_insn_gp(vcpu, err); 5795 5796 return 1; 5797} 5798 5799static int handle_wbinvd(struct kvm_vcpu *vcpu) 5800{ 5801 kvm_emulate_wbinvd(vcpu); 5802 return 1; 5803} 5804 5805static int handle_xsetbv(struct kvm_vcpu *vcpu) 5806{ 5807 u64 new_bv = kvm_read_edx_eax(vcpu); 5808 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 5809 5810 if (kvm_set_xcr(vcpu, index, new_bv) == 0) 5811 skip_emulated_instruction(vcpu); 5812 return 1; 5813} 5814 5815static int handle_xsaves(struct kvm_vcpu *vcpu) 5816{ 5817 skip_emulated_instruction(vcpu); 5818 WARN(1, "this should never happen\n"); 5819 return 1; 5820} 5821 5822static int handle_xrstors(struct kvm_vcpu *vcpu) 5823{ 5824 skip_emulated_instruction(vcpu); 5825 WARN(1, "this should never happen\n"); 5826 return 1; 5827} 5828 5829static int handle_apic_access(struct kvm_vcpu *vcpu) 5830{ 5831 if (likely(fasteoi)) { 5832 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5833 int access_type, offset; 5834 5835 access_type = exit_qualification & APIC_ACCESS_TYPE; 5836 offset = exit_qualification & APIC_ACCESS_OFFSET; 5837 /* 5838 * Sane guest uses MOV to write EOI, with written value 5839 * not cared. So make a short-circuit here by avoiding 5840 * heavy instruction emulation. 
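 * Only a linear write to the EOI register (offset APIC_EOI) takes the
 * fast path below; any other access type or offset still goes through
 * emulate_instruction().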
5841 */ 5842 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5843 (offset == APIC_EOI)) { 5844 kvm_lapic_set_eoi(vcpu); 5845 skip_emulated_instruction(vcpu); 5846 return 1; 5847 } 5848 } 5849 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5850} 5851 5852static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5853{ 5854 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5855 int vector = exit_qualification & 0xff; 5856 5857 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5858 kvm_apic_set_eoi_accelerated(vcpu, vector); 5859 return 1; 5860} 5861 5862static int handle_apic_write(struct kvm_vcpu *vcpu) 5863{ 5864 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5865 u32 offset = exit_qualification & 0xfff; 5866 5867 /* APIC-write VM exit is trap-like and thus no need to adjust IP */ 5868 kvm_apic_write_nodecode(vcpu, offset); 5869 return 1; 5870} 5871 5872static int handle_task_switch(struct kvm_vcpu *vcpu) 5873{ 5874 struct vcpu_vmx *vmx = to_vmx(vcpu); 5875 unsigned long exit_qualification; 5876 bool has_error_code = false; 5877 u32 error_code = 0; 5878 u16 tss_selector; 5879 int reason, type, idt_v, idt_index; 5880 5881 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5882 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5883 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5884 5885 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5886 5887 reason = (u32)exit_qualification >> 30; 5888 if (reason == TASK_SWITCH_GATE && idt_v) { 5889 switch (type) { 5890 case INTR_TYPE_NMI_INTR: 5891 vcpu->arch.nmi_injected = false; 5892 vmx_set_nmi_mask(vcpu, true); 5893 break; 5894 case INTR_TYPE_EXT_INTR: 5895 case INTR_TYPE_SOFT_INTR: 5896 kvm_clear_interrupt_queue(vcpu); 5897 break; 5898 case INTR_TYPE_HARD_EXCEPTION: 5899 if (vmx->idt_vectoring_info & 5900 VECTORING_INFO_DELIVER_CODE_MASK) { 5901 has_error_code = true; 5902 error_code = 5903 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5904 } 5905 /* fall through */ 5906 case INTR_TYPE_SOFT_EXCEPTION: 5907 kvm_clear_exception_queue(vcpu); 5908 break; 5909 default: 5910 break; 5911 } 5912 } 5913 tss_selector = exit_qualification; 5914 5915 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5916 type != INTR_TYPE_EXT_INTR && 5917 type != INTR_TYPE_NMI_INTR)) 5918 skip_emulated_instruction(vcpu); 5919 5920 if (kvm_task_switch(vcpu, tss_selector, 5921 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, 5922 has_error_code, error_code) == EMULATE_FAIL) { 5923 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5924 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5925 vcpu->run->internal.ndata = 0; 5926 return 0; 5927 } 5928 5929 /* 5930 * TODO: What about debug traps on tss switch? 5931 * Are we supposed to inject them and update dr6? 
5932 */ 5933 5934 return 1; 5935} 5936 5937static int handle_ept_violation(struct kvm_vcpu *vcpu) 5938{ 5939 unsigned long exit_qualification; 5940 gpa_t gpa; 5941 u32 error_code; 5942 int gla_validity; 5943 5944 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5945 5946 gla_validity = (exit_qualification >> 7) & 0x3; 5947 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 5948 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 5949 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 5950 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 5951 vmcs_readl(GUEST_LINEAR_ADDRESS)); 5952 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 5953 (long unsigned int)exit_qualification); 5954 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 5955 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 5956 return 0; 5957 } 5958 5959 /* 5960 * EPT violation happened while executing iret from NMI, 5961 * "blocked by NMI" bit has to be set before next VM entry. 5962 * There are errata that may cause this bit to not be set: 5963 * AAK134, BY25. 5964 */ 5965 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5966 cpu_has_virtual_nmis() && 5967 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5968 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5969 5970 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5971 trace_kvm_page_fault(gpa, exit_qualification); 5972 5973 /* It is a write fault? */ 5974 error_code = exit_qualification & PFERR_WRITE_MASK; 5975 /* It is a fetch fault? */ 5976 error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; 5977 /* ept page table is present? */ 5978 error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; 5979 5980 vcpu->arch.exit_qualification = exit_qualification; 5981 5982 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5983} 5984 5985static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5986{ 5987 int ret; 5988 gpa_t gpa; 5989 5990 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5991 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5992 skip_emulated_instruction(vcpu); 5993 trace_kvm_fast_mmio(gpa); 5994 return 1; 5995 } 5996 5997 ret = handle_mmio_page_fault(vcpu, gpa, true); 5998 if (likely(ret == RET_MMIO_PF_EMULATE)) 5999 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 6000 EMULATE_DONE; 6001 6002 if (unlikely(ret == RET_MMIO_PF_INVALID)) 6003 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); 6004 6005 if (unlikely(ret == RET_MMIO_PF_RETRY)) 6006 return 1; 6007 6008 /* It is the real ept misconfig */ 6009 WARN_ON(1); 6010 6011 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 6012 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 6013 6014 return 0; 6015} 6016 6017static int handle_nmi_window(struct kvm_vcpu *vcpu) 6018{ 6019 u32 cpu_based_vm_exec_control; 6020 6021 /* clear pending NMI */ 6022 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6023 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 6024 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 6025 ++vcpu->stat.nmi_window_exits; 6026 kvm_make_request(KVM_REQ_EVENT, vcpu); 6027 6028 return 1; 6029} 6030 6031static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 6032{ 6033 struct vcpu_vmx *vmx = to_vmx(vcpu); 6034 enum emulation_result err = EMULATE_DONE; 6035 int ret = 1; 6036 u32 cpu_exec_ctrl; 6037 bool intr_window_requested; 6038 unsigned count = 130; 6039 6040 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6041 intr_window_requested = 
cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 6042 6043 while (vmx->emulation_required && count-- != 0) { 6044 if (intr_window_requested && vmx_interrupt_allowed(vcpu)) 6045 return handle_interrupt_window(&vmx->vcpu); 6046 6047 if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 6048 return 1; 6049 6050 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 6051 6052 if (err == EMULATE_USER_EXIT) { 6053 ++vcpu->stat.mmio_exits; 6054 ret = 0; 6055 goto out; 6056 } 6057 6058 if (err != EMULATE_DONE) { 6059 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6060 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 6061 vcpu->run->internal.ndata = 0; 6062 return 0; 6063 } 6064 6065 if (vcpu->arch.halt_request) { 6066 vcpu->arch.halt_request = 0; 6067 ret = kvm_vcpu_halt(vcpu); 6068 goto out; 6069 } 6070 6071 if (signal_pending(current)) 6072 goto out; 6073 if (need_resched()) 6074 schedule(); 6075 } 6076 6077out: 6078 return ret; 6079} 6080 6081static int __grow_ple_window(int val) 6082{ 6083 if (ple_window_grow < 1) 6084 return ple_window; 6085 6086 val = min(val, ple_window_actual_max); 6087 6088 if (ple_window_grow < ple_window) 6089 val *= ple_window_grow; 6090 else 6091 val += ple_window_grow; 6092 6093 return val; 6094} 6095 6096static int __shrink_ple_window(int val, int modifier, int minimum) 6097{ 6098 if (modifier < 1) 6099 return ple_window; 6100 6101 if (modifier < ple_window) 6102 val /= modifier; 6103 else 6104 val -= modifier; 6105 6106 return max(val, minimum); 6107} 6108 6109static void grow_ple_window(struct kvm_vcpu *vcpu) 6110{ 6111 struct vcpu_vmx *vmx = to_vmx(vcpu); 6112 int old = vmx->ple_window; 6113 6114 vmx->ple_window = __grow_ple_window(old); 6115 6116 if (vmx->ple_window != old) 6117 vmx->ple_window_dirty = true; 6118 6119 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); 6120} 6121 6122static void shrink_ple_window(struct kvm_vcpu *vcpu) 6123{ 6124 struct vcpu_vmx *vmx = to_vmx(vcpu); 6125 int old = vmx->ple_window; 6126 6127 vmx->ple_window = __shrink_ple_window(old, 6128 ple_window_shrink, ple_window); 6129 6130 if (vmx->ple_window != old) 6131 vmx->ple_window_dirty = true; 6132 6133 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); 6134} 6135 6136/* 6137 * ple_window_actual_max is computed to be one grow_ple_window() below 6138 * ple_window_max. (See __grow_ple_window for the reason.) 6139 * This prevents overflows, because ple_window_max is int. 6140 * ple_window_max effectively rounded down to a multiple of ple_window_grow in 6141 * this process. 6142 * ple_window_max is also prevented from setting vmx->ple_window < ple_window. 6143 */ 6144static void update_ple_window_actual_max(void) 6145{ 6146 ple_window_actual_max = 6147 __shrink_ple_window(max(ple_window_max, ple_window), 6148 ple_window_grow, INT_MIN); 6149} 6150 6151/* 6152 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
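 * It runs on the CPU that received the wakeup vector, walks that CPU's
 * blocked_vcpu_on_cpu list under blocked_vcpu_on_cpu_lock, and kicks
 * every vCPU whose posted-interrupt descriptor has the ON bit set.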
6153 */ 6154static void wakeup_handler(void) 6155{ 6156 struct kvm_vcpu *vcpu; 6157 int cpu = smp_processor_id(); 6158 6159 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 6160 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), 6161 blocked_vcpu_list) { 6162 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 6163 6164 if (pi_test_on(pi_desc) == 1) 6165 kvm_vcpu_kick(vcpu); 6166 } 6167 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 6168} 6169 6170static __init int hardware_setup(void) 6171{ 6172 int r = -ENOMEM, i, msr; 6173 6174 rdmsrl_safe(MSR_EFER, &host_efer); 6175 6176 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) 6177 kvm_define_shared_msr(i, vmx_msr_index[i]); 6178 6179 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 6180 if (!vmx_io_bitmap_a) 6181 return r; 6182 6183 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 6184 if (!vmx_io_bitmap_b) 6185 goto out; 6186 6187 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 6188 if (!vmx_msr_bitmap_legacy) 6189 goto out1; 6190 6191 vmx_msr_bitmap_legacy_x2apic = 6192 (unsigned long *)__get_free_page(GFP_KERNEL); 6193 if (!vmx_msr_bitmap_legacy_x2apic) 6194 goto out2; 6195 6196 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 6197 if (!vmx_msr_bitmap_longmode) 6198 goto out3; 6199 6200 vmx_msr_bitmap_longmode_x2apic = 6201 (unsigned long *)__get_free_page(GFP_KERNEL); 6202 if (!vmx_msr_bitmap_longmode_x2apic) 6203 goto out4; 6204 6205 if (nested) { 6206 vmx_msr_bitmap_nested = 6207 (unsigned long *)__get_free_page(GFP_KERNEL); 6208 if (!vmx_msr_bitmap_nested) 6209 goto out5; 6210 } 6211 6212 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6213 if (!vmx_vmread_bitmap) 6214 goto out6; 6215 6216 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6217 if (!vmx_vmwrite_bitmap) 6218 goto out7; 6219 6220 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 6221 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 6222 6223 /* 6224 * Allow direct access to the PC debug port (it is often used for I/O 6225 * delays, but the vmexits simply slow things down). 6226 */ 6227 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); 6228 clear_bit(0x80, vmx_io_bitmap_a); 6229 6230 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); 6231 6232 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); 6233 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); 6234 if (nested) 6235 memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); 6236 6237 if (setup_vmcs_config(&vmcs_config) < 0) { 6238 r = -EIO; 6239 goto out8; 6240 } 6241 6242 if (boot_cpu_has(X86_FEATURE_NX)) 6243 kvm_enable_efer_bits(EFER_NX); 6244 6245 if (!cpu_has_vmx_vpid()) 6246 enable_vpid = 0; 6247 if (!cpu_has_vmx_shadow_vmcs()) 6248 enable_shadow_vmcs = 0; 6249 if (enable_shadow_vmcs) 6250 init_vmcs_shadow_fields(); 6251 6252 if (!cpu_has_vmx_ept() || 6253 !cpu_has_vmx_ept_4levels()) { 6254 enable_ept = 0; 6255 enable_unrestricted_guest = 0; 6256 enable_ept_ad_bits = 0; 6257 } 6258 6259 if (!cpu_has_vmx_ept_ad_bits()) 6260 enable_ept_ad_bits = 0; 6261 6262 if (!cpu_has_vmx_unrestricted_guest()) 6263 enable_unrestricted_guest = 0; 6264 6265 if (!cpu_has_vmx_flexpriority()) 6266 flexpriority_enabled = 0; 6267 6268 /* 6269 * set_apic_access_page_addr() is used to reload apic access 6270 * page upon invalidation. No need to do anything if not 6271 * using the APIC_ACCESS_ADDR VMCS field. 
*/ 6273 if (!flexpriority_enabled) 6274 kvm_x86_ops->set_apic_access_page_addr = NULL; 6275 6276 if (!cpu_has_vmx_tpr_shadow()) 6277 kvm_x86_ops->update_cr8_intercept = NULL; 6278 6279 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 6280 kvm_disable_largepages(); 6281 6282 if (!cpu_has_vmx_ple()) 6283 ple_gap = 0; 6284 6285 if (!cpu_has_vmx_apicv()) 6286 enable_apicv = 0; 6287 6288 if (cpu_has_vmx_tsc_scaling()) { 6289 kvm_has_tsc_control = true; 6290 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 6291 kvm_tsc_scaling_ratio_frac_bits = 48; 6292 } 6293 6294 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 6295 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 6296 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 6297 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 6298 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 6299 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 6300 vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); 6301 6302 memcpy(vmx_msr_bitmap_legacy_x2apic, 6303 vmx_msr_bitmap_legacy, PAGE_SIZE); 6304 memcpy(vmx_msr_bitmap_longmode_x2apic, 6305 vmx_msr_bitmap_longmode, PAGE_SIZE); 6306 6307 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 6308 6309 if (enable_apicv) { 6310 for (msr = 0x800; msr <= 0x8ff; msr++) 6311 vmx_disable_intercept_msr_read_x2apic(msr); 6312 6313 /* According to the SDM, in x2apic mode the whole ID register is used, 6314 * but KVM only uses its highest eight bits, so reads of it need to 6315 * be intercepted. */ 6316 vmx_enable_intercept_msr_read_x2apic(0x802); 6317 /* TMCCT */ 6318 vmx_enable_intercept_msr_read_x2apic(0x839); 6319 /* TPR */ 6320 vmx_disable_intercept_msr_write_x2apic(0x808); 6321 /* EOI */ 6322 vmx_disable_intercept_msr_write_x2apic(0x80b); 6323 /* SELF-IPI */ 6324 vmx_disable_intercept_msr_write_x2apic(0x83f); 6325 } 6326 6327 if (enable_ept) { 6328 kvm_mmu_set_mask_ptes(0ull, 6329 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, 6330 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, 6331 0ull, VMX_EPT_EXECUTABLE_MASK); 6332 ept_set_mmio_spte_mask(); 6333 kvm_enable_tdp(); 6334 } else 6335 kvm_disable_tdp(); 6336 6337 update_ple_window_actual_max(); 6338 6339 /* 6340 * Only enable PML when the hardware supports the PML feature, and both EPT 6341 * and EPT A/D bit features are enabled -- PML depends on them to work.
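 * When PML is left disabled here, the dirty-logging callbacks below
 * are cleared so that dirty tracking falls back to the generic
 * write-protection based mechanism.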
6342 */ 6343 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 6344 enable_pml = 0; 6345 6346 if (!enable_pml) { 6347 kvm_x86_ops->slot_enable_log_dirty = NULL; 6348 kvm_x86_ops->slot_disable_log_dirty = NULL; 6349 kvm_x86_ops->flush_log_dirty = NULL; 6350 kvm_x86_ops->enable_log_dirty_pt_masked = NULL; 6351 } 6352 6353 kvm_set_posted_intr_wakeup_handler(wakeup_handler); 6354 6355 return alloc_kvm_area(); 6356 6357out8: 6358 free_page((unsigned long)vmx_vmwrite_bitmap); 6359out7: 6360 free_page((unsigned long)vmx_vmread_bitmap); 6361out6: 6362 if (nested) 6363 free_page((unsigned long)vmx_msr_bitmap_nested); 6364out5: 6365 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6366out4: 6367 free_page((unsigned long)vmx_msr_bitmap_longmode); 6368out3: 6369 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 6370out2: 6371 free_page((unsigned long)vmx_msr_bitmap_legacy); 6372out1: 6373 free_page((unsigned long)vmx_io_bitmap_b); 6374out: 6375 free_page((unsigned long)vmx_io_bitmap_a); 6376 6377 return r; 6378} 6379 6380static __exit void hardware_unsetup(void) 6381{ 6382 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 6383 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6384 free_page((unsigned long)vmx_msr_bitmap_legacy); 6385 free_page((unsigned long)vmx_msr_bitmap_longmode); 6386 free_page((unsigned long)vmx_io_bitmap_b); 6387 free_page((unsigned long)vmx_io_bitmap_a); 6388 free_page((unsigned long)vmx_vmwrite_bitmap); 6389 free_page((unsigned long)vmx_vmread_bitmap); 6390 if (nested) 6391 free_page((unsigned long)vmx_msr_bitmap_nested); 6392 6393 free_kvm_area(); 6394} 6395 6396/* 6397 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 6398 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 6399 */ 6400static int handle_pause(struct kvm_vcpu *vcpu) 6401{ 6402 if (ple_gap) 6403 grow_ple_window(vcpu); 6404 6405 skip_emulated_instruction(vcpu); 6406 kvm_vcpu_on_spin(vcpu); 6407 6408 return 1; 6409} 6410 6411static int handle_nop(struct kvm_vcpu *vcpu) 6412{ 6413 skip_emulated_instruction(vcpu); 6414 return 1; 6415} 6416 6417static int handle_mwait(struct kvm_vcpu *vcpu) 6418{ 6419 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 6420 return handle_nop(vcpu); 6421} 6422 6423static int handle_monitor_trap(struct kvm_vcpu *vcpu) 6424{ 6425 return 1; 6426} 6427 6428static int handle_monitor(struct kvm_vcpu *vcpu) 6429{ 6430 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 6431 return handle_nop(vcpu); 6432} 6433 6434/* 6435 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. 6436 * We could reuse a single VMCS for all the L2 guests, but we also want the 6437 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this 6438 * allows keeping them loaded on the processor, and in the future will allow 6439 * optimizations where prepare_vmcs02 doesn't need to set all the fields on 6440 * every entry if they never change. 6441 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE 6442 * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. 6443 * 6444 * The following functions allocate and free a vmcs02 in this pool. 6445 */ 6446 6447/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. 
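 * The pool is kept in most-recently-used order: a lookup hit is moved
 * to the head of the list, and once vmcs02_num reaches VMCS02_POOL_SIZE
 * the entry at the tail is recycled for the new vmptr.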
*/ 6448static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) 6449{ 6450 struct vmcs02_list *item; 6451 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 6452 if (item->vmptr == vmx->nested.current_vmptr) { 6453 list_move(&item->list, &vmx->nested.vmcs02_pool); 6454 return &item->vmcs02; 6455 } 6456 6457 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { 6458 /* Recycle the least recently used VMCS. */ 6459 item = list_entry(vmx->nested.vmcs02_pool.prev, 6460 struct vmcs02_list, list); 6461 item->vmptr = vmx->nested.current_vmptr; 6462 list_move(&item->list, &vmx->nested.vmcs02_pool); 6463 return &item->vmcs02; 6464 } 6465 6466 /* Create a new VMCS */ 6467 item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); 6468 if (!item) 6469 return NULL; 6470 item->vmcs02.vmcs = alloc_vmcs(); 6471 if (!item->vmcs02.vmcs) { 6472 kfree(item); 6473 return NULL; 6474 } 6475 loaded_vmcs_init(&item->vmcs02); 6476 item->vmptr = vmx->nested.current_vmptr; 6477 list_add(&(item->list), &(vmx->nested.vmcs02_pool)); 6478 vmx->nested.vmcs02_num++; 6479 return &item->vmcs02; 6480} 6481 6482/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ 6483static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) 6484{ 6485 struct vmcs02_list *item; 6486 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 6487 if (item->vmptr == vmptr) { 6488 free_loaded_vmcs(&item->vmcs02); 6489 list_del(&item->list); 6490 kfree(item); 6491 vmx->nested.vmcs02_num--; 6492 return; 6493 } 6494} 6495 6496/* 6497 * Free all VMCSs saved for this vcpu, except the one pointed by 6498 * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs 6499 * must be &vmx->vmcs01. 6500 */ 6501static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) 6502{ 6503 struct vmcs02_list *item, *n; 6504 6505 WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); 6506 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { 6507 /* 6508 * Something will leak if the above WARN triggers. Better than 6509 * a use-after-free. 6510 */ 6511 if (vmx->loaded_vmcs == &item->vmcs02) 6512 continue; 6513 6514 free_loaded_vmcs(&item->vmcs02); 6515 list_del(&item->list); 6516 kfree(item); 6517 vmx->nested.vmcs02_num--; 6518 } 6519} 6520 6521/* 6522 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 6523 * set the success or error code of an emulated VMX instruction, as specified 6524 * by Vol 2B, VMX Instruction Reference, "Conventions". 6525 */ 6526static void nested_vmx_succeed(struct kvm_vcpu *vcpu) 6527{ 6528 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 6529 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 6530 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 6531} 6532 6533static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 6534{ 6535 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 6536 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 6537 X86_EFLAGS_SF | X86_EFLAGS_OF)) 6538 | X86_EFLAGS_CF); 6539} 6540 6541static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 6542 u32 vm_instruction_error) 6543{ 6544 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { 6545 /* 6546 * failValid writes the error number to the current VMCS, which 6547 * can't be done there isn't a current VMCS. 
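 * Fall back to VMfailInvalid (CF set) in that case; it is the only
 * failure indication that can be given without a current VMCS.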
6548 */ 6549 nested_vmx_failInvalid(vcpu); 6550 return; 6551 } 6552 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 6553 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 6554 X86_EFLAGS_SF | X86_EFLAGS_OF)) 6555 | X86_EFLAGS_ZF); 6556 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 6557 /* 6558 * We don't need to force a shadow sync because 6559 * VM_INSTRUCTION_ERROR is not shadowed 6560 */ 6561} 6562 6563static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 6564{ 6565 /* TODO: not to reset guest simply here. */ 6566 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 6567 pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); 6568} 6569 6570static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 6571{ 6572 struct vcpu_vmx *vmx = 6573 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 6574 6575 vmx->nested.preemption_timer_expired = true; 6576 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 6577 kvm_vcpu_kick(&vmx->vcpu); 6578 6579 return HRTIMER_NORESTART; 6580} 6581 6582/* 6583 * Decode the memory-address operand of a vmx instruction, as recorded on an 6584 * exit caused by such an instruction (run by a guest hypervisor). 6585 * On success, returns 0. When the operand is invalid, returns 1 and throws 6586 * #UD or #GP. 6587 */ 6588static int get_vmx_mem_address(struct kvm_vcpu *vcpu, 6589 unsigned long exit_qualification, 6590 u32 vmx_instruction_info, bool wr, gva_t *ret) 6591{ 6592 gva_t off; 6593 bool exn; 6594 struct kvm_segment s; 6595 6596 /* 6597 * According to Vol. 3B, "Information for VM Exits Due to Instruction 6598 * Execution", on an exit, vmx_instruction_info holds most of the 6599 * addressing components of the operand. Only the displacement part 6600 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 6601 * For how an actual address is calculated from all these components, 6602 * refer to Vol. 1, "Operand Addressing". 6603 */ 6604 int scaling = vmx_instruction_info & 3; 6605 int addr_size = (vmx_instruction_info >> 7) & 7; 6606 bool is_reg = vmx_instruction_info & (1u << 10); 6607 int seg_reg = (vmx_instruction_info >> 15) & 7; 6608 int index_reg = (vmx_instruction_info >> 18) & 0xf; 6609 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 6610 int base_reg = (vmx_instruction_info >> 23) & 0xf; 6611 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 6612 6613 if (is_reg) { 6614 kvm_queue_exception(vcpu, UD_VECTOR); 6615 return 1; 6616 } 6617 6618 /* Addr = segment_base + offset */ 6619 /* offset = base + [index * scale] + displacement */ 6620 off = exit_qualification; /* holds the displacement */ 6621 if (base_is_valid) 6622 off += kvm_register_read(vcpu, base_reg); 6623 if (index_is_valid) 6624 off += kvm_register_read(vcpu, index_reg)<<scaling; 6625 vmx_get_segment(vcpu, &s, seg_reg); 6626 *ret = s.base + off; 6627 6628 if (addr_size == 1) /* 32 bit */ 6629 *ret &= 0xffffffff; 6630 6631 /* Checks for #GP/#SS exceptions. */ 6632 exn = false; 6633 if (is_protmode(vcpu)) { 6634 /* Protected mode: apply checks for segment validity in the 6635 * following order: 6636 * - segment type check (#GP(0) may be thrown) 6637 * - usability check (#GP(0)/#SS(0)) 6638 * - limit check (#GP(0)/#SS(0)) 6639 */ 6640 if (wr) 6641 /* #GP(0) if the destination operand is located in a 6642 * read-only data segment or any code segment. 
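 * In the check below, segment-type bit 3 (0x8) distinguishes code from
 * data, and bit 1 (0x2) is the write-enable bit of a data segment, so
 * (s.type & 0xa) == 0 means a read-only data segment.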
6643 */ 6644 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 6645 else 6646 /* #GP(0) if the source operand is located in an 6647 * execute-only code segment 6648 */ 6649 exn = ((s.type & 0xa) == 8); 6650 } 6651 if (exn) { 6652 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 6653 return 1; 6654 } 6655 if (is_long_mode(vcpu)) { 6656 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 6657 * non-canonical form. This is an only check for long mode. 6658 */ 6659 exn = is_noncanonical_address(*ret); 6660 } else if (is_protmode(vcpu)) { 6661 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 6662 */ 6663 exn = (s.unusable != 0); 6664 /* Protected mode: #GP(0)/#SS(0) if the memory 6665 * operand is outside the segment limit. 6666 */ 6667 exn = exn || (off + sizeof(u64) > s.limit); 6668 } 6669 if (exn) { 6670 kvm_queue_exception_e(vcpu, 6671 seg_reg == VCPU_SREG_SS ? 6672 SS_VECTOR : GP_VECTOR, 6673 0); 6674 return 1; 6675 } 6676 6677 return 0; 6678} 6679 6680/* 6681 * This function performs the various checks including 6682 * - if it's 4KB aligned 6683 * - No bits beyond the physical address width are set 6684 * - Returns 0 on success or else 1 6685 * (Intel SDM Section 30.3) 6686 */ 6687static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, 6688 gpa_t *vmpointer) 6689{ 6690 gva_t gva; 6691 gpa_t vmptr; 6692 struct x86_exception e; 6693 struct page *page; 6694 struct vcpu_vmx *vmx = to_vmx(vcpu); 6695 int maxphyaddr = cpuid_maxphyaddr(vcpu); 6696 6697 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6698 vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) 6699 return 1; 6700 6701 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, 6702 sizeof(vmptr), &e)) { 6703 kvm_inject_page_fault(vcpu, &e); 6704 return 1; 6705 } 6706 6707 switch (exit_reason) { 6708 case EXIT_REASON_VMON: 6709 /* 6710 * SDM 3: 24.11.5 6711 * The first 4 bytes of VMXON region contain the supported 6712 * VMCS revision identifier 6713 * 6714 * Note - IA32_VMX_BASIC[48] will never be 1 6715 * for the nested case; 6716 * which replaces physical address width with 32 6717 * 6718 */ 6719 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6720 nested_vmx_failInvalid(vcpu); 6721 skip_emulated_instruction(vcpu); 6722 return 1; 6723 } 6724 6725 page = nested_get_page(vcpu, vmptr); 6726 if (page == NULL || 6727 *(u32 *)kmap(page) != VMCS12_REVISION) { 6728 nested_vmx_failInvalid(vcpu); 6729 kunmap(page); 6730 skip_emulated_instruction(vcpu); 6731 return 1; 6732 } 6733 kunmap(page); 6734 vmx->nested.vmxon_ptr = vmptr; 6735 break; 6736 case EXIT_REASON_VMCLEAR: 6737 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6738 nested_vmx_failValid(vcpu, 6739 VMXERR_VMCLEAR_INVALID_ADDRESS); 6740 skip_emulated_instruction(vcpu); 6741 return 1; 6742 } 6743 6744 if (vmptr == vmx->nested.vmxon_ptr) { 6745 nested_vmx_failValid(vcpu, 6746 VMXERR_VMCLEAR_VMXON_POINTER); 6747 skip_emulated_instruction(vcpu); 6748 return 1; 6749 } 6750 break; 6751 case EXIT_REASON_VMPTRLD: 6752 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6753 nested_vmx_failValid(vcpu, 6754 VMXERR_VMPTRLD_INVALID_ADDRESS); 6755 skip_emulated_instruction(vcpu); 6756 return 1; 6757 } 6758 6759 if (vmptr == vmx->nested.vmxon_ptr) { 6760 nested_vmx_failValid(vcpu, 6761 VMXERR_VMCLEAR_VMXON_POINTER); 6762 skip_emulated_instruction(vcpu); 6763 return 1; 6764 } 6765 break; 6766 default: 6767 return 1; /* shouldn't happen */ 6768 } 6769 6770 if (vmpointer) 6771 *vmpointer = vmptr; 6772 return 0; 6773} 6774 6775/* 6776 * Emulate the 
VMXON instruction. 6777 * Currently, we just remember that VMX is active, and do not save or even 6778 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 6779 * do not currently need to store anything in that guest-allocated memory 6780 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their 6781 * argument is different from the VMXON pointer (which the spec says they do). 6782 */ 6783static int handle_vmon(struct kvm_vcpu *vcpu) 6784{ 6785 struct kvm_segment cs; 6786 struct vcpu_vmx *vmx = to_vmx(vcpu); 6787 struct vmcs *shadow_vmcs; 6788 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 6789 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 6790 6791 /* The Intel VMX Instruction Reference lists a bunch of bits that 6792 * are prerequisite to running VMXON, most notably cr4.VMXE must be 6793 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). 6794 * Otherwise, we should fail with #UD. We test these now: 6795 */ 6796 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || 6797 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || 6798 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 6799 kvm_queue_exception(vcpu, UD_VECTOR); 6800 return 1; 6801 } 6802 6803 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6804 if (is_long_mode(vcpu) && !cs.l) { 6805 kvm_queue_exception(vcpu, UD_VECTOR); 6806 return 1; 6807 } 6808 6809 if (vmx_get_cpl(vcpu)) { 6810 kvm_inject_gp(vcpu, 0); 6811 return 1; 6812 } 6813 6814 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) 6815 return 1; 6816 6817 if (vmx->nested.vmxon) { 6818 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 6819 skip_emulated_instruction(vcpu); 6820 return 1; 6821 } 6822 6823 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 6824 != VMXON_NEEDED_FEATURES) { 6825 kvm_inject_gp(vcpu, 0); 6826 return 1; 6827 } 6828 6829 if (enable_shadow_vmcs) { 6830 shadow_vmcs = alloc_vmcs(); 6831 if (!shadow_vmcs) 6832 return -ENOMEM; 6833 /* mark vmcs as shadow */ 6834 shadow_vmcs->revision_id |= (1u << 31); 6835 /* init shadow vmcs */ 6836 vmcs_clear(shadow_vmcs); 6837 vmx->nested.current_shadow_vmcs = shadow_vmcs; 6838 } 6839 6840 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); 6841 vmx->nested.vmcs02_num = 0; 6842 6843 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 6844 HRTIMER_MODE_REL); 6845 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 6846 6847 vmx->nested.vmxon = true; 6848 6849 skip_emulated_instruction(vcpu); 6850 nested_vmx_succeed(vcpu); 6851 return 1; 6852} 6853 6854/* 6855 * Intel's VMX Instruction Reference specifies a common set of prerequisites 6856 * for running VMX instructions (except VMXON, whose prerequisites are 6857 * slightly different). It also specifies what exception to inject otherwise.
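 * Concretely, the checks below raise #UD if the vCPU is not in VMX
 * operation, is in virtual-8086 mode, or is in compatibility mode, and
 * raise #GP(0) if CPL is not 0.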
6858 */ 6859static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 6860{ 6861 struct kvm_segment cs; 6862 struct vcpu_vmx *vmx = to_vmx(vcpu); 6863 6864 if (!vmx->nested.vmxon) { 6865 kvm_queue_exception(vcpu, UD_VECTOR); 6866 return 0; 6867 } 6868 6869 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6870 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || 6871 (is_long_mode(vcpu) && !cs.l)) { 6872 kvm_queue_exception(vcpu, UD_VECTOR); 6873 return 0; 6874 } 6875 6876 if (vmx_get_cpl(vcpu)) { 6877 kvm_inject_gp(vcpu, 0); 6878 return 0; 6879 } 6880 6881 return 1; 6882} 6883 6884static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) 6885{ 6886 if (vmx->nested.current_vmptr == -1ull) 6887 return; 6888 6889 /* current_vmptr and current_vmcs12 are always set/reset together */ 6890 if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) 6891 return; 6892 6893 if (enable_shadow_vmcs) { 6894 /* copy to memory all shadowed fields in case 6895 they were modified */ 6896 copy_shadow_to_vmcs12(vmx); 6897 vmx->nested.sync_shadow_vmcs = false; 6898 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, 6899 SECONDARY_EXEC_SHADOW_VMCS); 6900 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6901 } 6902 vmx->nested.posted_intr_nv = -1; 6903 kunmap(vmx->nested.current_vmcs12_page); 6904 nested_release_page(vmx->nested.current_vmcs12_page); 6905 vmx->nested.current_vmptr = -1ull; 6906 vmx->nested.current_vmcs12 = NULL; 6907} 6908 6909/* 6910 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 6911 * just stops using VMX. 6912 */ 6913static void free_nested(struct vcpu_vmx *vmx) 6914{ 6915 if (!vmx->nested.vmxon) 6916 return; 6917 6918 vmx->nested.vmxon = false; 6919 free_vpid(vmx->nested.vpid02); 6920 nested_release_vmcs12(vmx); 6921 if (enable_shadow_vmcs) 6922 free_vmcs(vmx->nested.current_shadow_vmcs); 6923 /* Unpin physical memory we referred to in current vmcs02 */ 6924 if (vmx->nested.apic_access_page) { 6925 nested_release_page(vmx->nested.apic_access_page); 6926 vmx->nested.apic_access_page = NULL; 6927 } 6928 if (vmx->nested.virtual_apic_page) { 6929 nested_release_page(vmx->nested.virtual_apic_page); 6930 vmx->nested.virtual_apic_page = NULL; 6931 } 6932 if (vmx->nested.pi_desc_page) { 6933 kunmap(vmx->nested.pi_desc_page); 6934 nested_release_page(vmx->nested.pi_desc_page); 6935 vmx->nested.pi_desc_page = NULL; 6936 vmx->nested.pi_desc = NULL; 6937 } 6938 6939 nested_free_all_saved_vmcss(vmx); 6940} 6941 6942/* Emulate the VMXOFF instruction */ 6943static int handle_vmoff(struct kvm_vcpu *vcpu) 6944{ 6945 if (!nested_vmx_check_permission(vcpu)) 6946 return 1; 6947 free_nested(to_vmx(vcpu)); 6948 skip_emulated_instruction(vcpu); 6949 nested_vmx_succeed(vcpu); 6950 return 1; 6951} 6952 6953/* Emulate the VMCLEAR instruction */ 6954static int handle_vmclear(struct kvm_vcpu *vcpu) 6955{ 6956 struct vcpu_vmx *vmx = to_vmx(vcpu); 6957 gpa_t vmptr; 6958 struct vmcs12 *vmcs12; 6959 struct page *page; 6960 6961 if (!nested_vmx_check_permission(vcpu)) 6962 return 1; 6963 6964 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) 6965 return 1; 6966 6967 if (vmptr == vmx->nested.current_vmptr) 6968 nested_release_vmcs12(vmx); 6969 6970 page = nested_get_page(vcpu, vmptr); 6971 if (page == NULL) { 6972 /* 6973 * For accurate processor emulation, VMCLEAR beyond available 6974 * physical memory should do nothing at all. 
However, it is 6975 * possible that a nested vmx bug, not a guest hypervisor bug, 6976 * resulted in this case, so let's shut down before doing any 6977 * more damage: 6978 */ 6979 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 6980 return 1; 6981 } 6982 vmcs12 = kmap(page); 6983 vmcs12->launch_state = 0; 6984 kunmap(page); 6985 nested_release_page(page); 6986 6987 nested_free_vmcs02(vmx, vmptr); 6988 6989 skip_emulated_instruction(vcpu); 6990 nested_vmx_succeed(vcpu); 6991 return 1; 6992} 6993 6994static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 6995 6996/* Emulate the VMLAUNCH instruction */ 6997static int handle_vmlaunch(struct kvm_vcpu *vcpu) 6998{ 6999 return nested_vmx_run(vcpu, true); 7000} 7001 7002/* Emulate the VMRESUME instruction */ 7003static int handle_vmresume(struct kvm_vcpu *vcpu) 7004{ 7005 7006 return nested_vmx_run(vcpu, false); 7007} 7008 7009enum vmcs_field_type { 7010 VMCS_FIELD_TYPE_U16 = 0, 7011 VMCS_FIELD_TYPE_U64 = 1, 7012 VMCS_FIELD_TYPE_U32 = 2, 7013 VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 7014}; 7015 7016static inline int vmcs_field_type(unsigned long field) 7017{ 7018 if (0x1 & field) /* the *_HIGH fields are all 32 bit */ 7019 return VMCS_FIELD_TYPE_U32; 7020 return (field >> 13) & 0x3 ; 7021} 7022 7023static inline int vmcs_field_readonly(unsigned long field) 7024{ 7025 return (((field >> 10) & 0x3) == 1); 7026} 7027 7028/* 7029 * Read a vmcs12 field. Since these can have varying lengths and we return 7030 * one type, we chose the biggest type (u64) and zero-extend the return value 7031 * to that size. Note that the caller, handle_vmread, might need to use only 7032 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of 7033 * 64-bit fields are to be returned). 7034 */ 7035static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, 7036 unsigned long field, u64 *ret) 7037{ 7038 short offset = vmcs_field_to_offset(field); 7039 char *p; 7040 7041 if (offset < 0) 7042 return offset; 7043 7044 p = ((char *)(get_vmcs12(vcpu))) + offset; 7045 7046 switch (vmcs_field_type(field)) { 7047 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 7048 *ret = *((natural_width *)p); 7049 return 0; 7050 case VMCS_FIELD_TYPE_U16: 7051 *ret = *((u16 *)p); 7052 return 0; 7053 case VMCS_FIELD_TYPE_U32: 7054 *ret = *((u32 *)p); 7055 return 0; 7056 case VMCS_FIELD_TYPE_U64: 7057 *ret = *((u64 *)p); 7058 return 0; 7059 default: 7060 WARN_ON(1); 7061 return -ENOENT; 7062 } 7063} 7064 7065 7066static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, 7067 unsigned long field, u64 field_value){ 7068 short offset = vmcs_field_to_offset(field); 7069 char *p = ((char *) get_vmcs12(vcpu)) + offset; 7070 if (offset < 0) 7071 return offset; 7072 7073 switch (vmcs_field_type(field)) { 7074 case VMCS_FIELD_TYPE_U16: 7075 *(u16 *)p = field_value; 7076 return 0; 7077 case VMCS_FIELD_TYPE_U32: 7078 *(u32 *)p = field_value; 7079 return 0; 7080 case VMCS_FIELD_TYPE_U64: 7081 *(u64 *)p = field_value; 7082 return 0; 7083 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 7084 *(natural_width *)p = field_value; 7085 return 0; 7086 default: 7087 WARN_ON(1); 7088 return -ENOENT; 7089 } 7090 7091} 7092 7093static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 7094{ 7095 int i; 7096 unsigned long field; 7097 u64 field_value; 7098 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 7099 const unsigned long *fields = shadow_read_write_fields; 7100 const int num_fields = max_shadow_read_write_fields; 7101 7102 preempt_disable(); 7103 7104 vmcs_load(shadow_vmcs); 7105 7106 for (i = 0; i < num_fields; i++) 
{ 7107 field = fields[i]; 7108 switch (vmcs_field_type(field)) { 7109 case VMCS_FIELD_TYPE_U16: 7110 field_value = vmcs_read16(field); 7111 break; 7112 case VMCS_FIELD_TYPE_U32: 7113 field_value = vmcs_read32(field); 7114 break; 7115 case VMCS_FIELD_TYPE_U64: 7116 field_value = vmcs_read64(field); 7117 break; 7118 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 7119 field_value = vmcs_readl(field); 7120 break; 7121 default: 7122 WARN_ON(1); 7123 continue; 7124 } 7125 vmcs12_write_any(&vmx->vcpu, field, field_value); 7126 } 7127 7128 vmcs_clear(shadow_vmcs); 7129 vmcs_load(vmx->loaded_vmcs->vmcs); 7130 7131 preempt_enable(); 7132} 7133 7134static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 7135{ 7136 const unsigned long *fields[] = { 7137 shadow_read_write_fields, 7138 shadow_read_only_fields 7139 }; 7140 const int max_fields[] = { 7141 max_shadow_read_write_fields, 7142 max_shadow_read_only_fields 7143 }; 7144 int i, q; 7145 unsigned long field; 7146 u64 field_value = 0; 7147 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 7148 7149 vmcs_load(shadow_vmcs); 7150 7151 for (q = 0; q < ARRAY_SIZE(fields); q++) { 7152 for (i = 0; i < max_fields[q]; i++) { 7153 field = fields[q][i]; 7154 vmcs12_read_any(&vmx->vcpu, field, &field_value); 7155 7156 switch (vmcs_field_type(field)) { 7157 case VMCS_FIELD_TYPE_U16: 7158 vmcs_write16(field, (u16)field_value); 7159 break; 7160 case VMCS_FIELD_TYPE_U32: 7161 vmcs_write32(field, (u32)field_value); 7162 break; 7163 case VMCS_FIELD_TYPE_U64: 7164 vmcs_write64(field, (u64)field_value); 7165 break; 7166 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 7167 vmcs_writel(field, (long)field_value); 7168 break; 7169 default: 7170 WARN_ON(1); 7171 break; 7172 } 7173 } 7174 } 7175 7176 vmcs_clear(shadow_vmcs); 7177 vmcs_load(vmx->loaded_vmcs->vmcs); 7178} 7179 7180/* 7181 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was 7182 * used before) all generate the same failure when it is missing. 7183 */ 7184static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) 7185{ 7186 struct vcpu_vmx *vmx = to_vmx(vcpu); 7187 if (vmx->nested.current_vmptr == -1ull) { 7188 nested_vmx_failInvalid(vcpu); 7189 skip_emulated_instruction(vcpu); 7190 return 0; 7191 } 7192 return 1; 7193} 7194 7195static int handle_vmread(struct kvm_vcpu *vcpu) 7196{ 7197 unsigned long field; 7198 u64 field_value; 7199 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7200 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7201 gva_t gva = 0; 7202 7203 if (!nested_vmx_check_permission(vcpu) || 7204 !nested_vmx_check_vmcs12(vcpu)) 7205 return 1; 7206 7207 /* Decode instruction info and find the field to read */ 7208 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 7209 /* Read the field, zero-extended to a u64 field_value */ 7210 if (vmcs12_read_any(vcpu, field, &field_value) < 0) { 7211 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 7212 skip_emulated_instruction(vcpu); 7213 return 1; 7214 } 7215 /* 7216 * Now copy part of this value to register or memory, as requested. 7217 * Note that the number of bits actually copied is 32 or 64 depending 7218 * on the guest's mode (32 or 64 bit), not on the given field's length. 
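 * Bit 10 of the VM-exit instruction info distinguishes a register
 * destination from a memory destination, just as in
 * get_vmx_mem_address().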
7219 */ 7220 if (vmx_instruction_info & (1u << 10)) { 7221 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 7222 field_value); 7223 } else { 7224 if (get_vmx_mem_address(vcpu, exit_qualification, 7225 vmx_instruction_info, true, &gva)) 7226 return 1; 7227 /* _system ok, as nested_vmx_check_permission verified cpl=0 */ 7228 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, 7229 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); 7230 } 7231 7232 nested_vmx_succeed(vcpu); 7233 skip_emulated_instruction(vcpu); 7234 return 1; 7235} 7236 7237 7238static int handle_vmwrite(struct kvm_vcpu *vcpu) 7239{ 7240 unsigned long field; 7241 gva_t gva; 7242 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7243 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7244 /* The value to write might be 32 or 64 bits, depending on L1's long 7245 * mode, and eventually we need to write that into a field of several 7246 * possible lengths. The code below first zero-extends the value to 64 7247 * bit (field_value), and then copies only the approriate number of 7248 * bits into the vmcs12 field. 7249 */ 7250 u64 field_value = 0; 7251 struct x86_exception e; 7252 7253 if (!nested_vmx_check_permission(vcpu) || 7254 !nested_vmx_check_vmcs12(vcpu)) 7255 return 1; 7256 7257 if (vmx_instruction_info & (1u << 10)) 7258 field_value = kvm_register_readl(vcpu, 7259 (((vmx_instruction_info) >> 3) & 0xf)); 7260 else { 7261 if (get_vmx_mem_address(vcpu, exit_qualification, 7262 vmx_instruction_info, false, &gva)) 7263 return 1; 7264 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, 7265 &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { 7266 kvm_inject_page_fault(vcpu, &e); 7267 return 1; 7268 } 7269 } 7270 7271 7272 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 7273 if (vmcs_field_readonly(field)) { 7274 nested_vmx_failValid(vcpu, 7275 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 7276 skip_emulated_instruction(vcpu); 7277 return 1; 7278 } 7279 7280 if (vmcs12_write_any(vcpu, field, field_value) < 0) { 7281 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 7282 skip_emulated_instruction(vcpu); 7283 return 1; 7284 } 7285 7286 nested_vmx_succeed(vcpu); 7287 skip_emulated_instruction(vcpu); 7288 return 1; 7289} 7290 7291/* Emulate the VMPTRLD instruction */ 7292static int handle_vmptrld(struct kvm_vcpu *vcpu) 7293{ 7294 struct vcpu_vmx *vmx = to_vmx(vcpu); 7295 gpa_t vmptr; 7296 7297 if (!nested_vmx_check_permission(vcpu)) 7298 return 1; 7299 7300 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) 7301 return 1; 7302 7303 if (vmx->nested.current_vmptr != vmptr) { 7304 struct vmcs12 *new_vmcs12; 7305 struct page *page; 7306 page = nested_get_page(vcpu, vmptr); 7307 if (page == NULL) { 7308 nested_vmx_failInvalid(vcpu); 7309 skip_emulated_instruction(vcpu); 7310 return 1; 7311 } 7312 new_vmcs12 = kmap(page); 7313 if (new_vmcs12->revision_id != VMCS12_REVISION) { 7314 kunmap(page); 7315 nested_release_page_clean(page); 7316 nested_vmx_failValid(vcpu, 7317 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 7318 skip_emulated_instruction(vcpu); 7319 return 1; 7320 } 7321 7322 nested_release_vmcs12(vmx); 7323 vmx->nested.current_vmptr = vmptr; 7324 vmx->nested.current_vmcs12 = new_vmcs12; 7325 vmx->nested.current_vmcs12_page = page; 7326 if (enable_shadow_vmcs) { 7327 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, 7328 SECONDARY_EXEC_SHADOW_VMCS); 7329 vmcs_write64(VMCS_LINK_POINTER, 7330 __pa(vmx->nested.current_shadow_vmcs)); 7331 
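 /* Request that the shadow VMCS be refreshed from the newly loaded
  * vmcs12 before the next VM entry. */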
vmx->nested.sync_shadow_vmcs = true; 7332 } 7333 } 7334 7335 nested_vmx_succeed(vcpu); 7336 skip_emulated_instruction(vcpu); 7337 return 1; 7338} 7339 7340/* Emulate the VMPTRST instruction */ 7341static int handle_vmptrst(struct kvm_vcpu *vcpu) 7342{ 7343 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7344 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7345 gva_t vmcs_gva; 7346 struct x86_exception e; 7347 7348 if (!nested_vmx_check_permission(vcpu)) 7349 return 1; 7350 7351 if (get_vmx_mem_address(vcpu, exit_qualification, 7352 vmx_instruction_info, true, &vmcs_gva)) 7353 return 1; 7354 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ 7355 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, 7356 (void *)&to_vmx(vcpu)->nested.current_vmptr, 7357 sizeof(u64), &e)) { 7358 kvm_inject_page_fault(vcpu, &e); 7359 return 1; 7360 } 7361 nested_vmx_succeed(vcpu); 7362 skip_emulated_instruction(vcpu); 7363 return 1; 7364} 7365 7366/* Emulate the INVEPT instruction */ 7367static int handle_invept(struct kvm_vcpu *vcpu) 7368{ 7369 struct vcpu_vmx *vmx = to_vmx(vcpu); 7370 u32 vmx_instruction_info, types; 7371 unsigned long type; 7372 gva_t gva; 7373 struct x86_exception e; 7374 struct { 7375 u64 eptp, gpa; 7376 } operand; 7377 7378 if (!(vmx->nested.nested_vmx_secondary_ctls_high & 7379 SECONDARY_EXEC_ENABLE_EPT) || 7380 !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 7381 kvm_queue_exception(vcpu, UD_VECTOR); 7382 return 1; 7383 } 7384 7385 if (!nested_vmx_check_permission(vcpu)) 7386 return 1; 7387 7388 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { 7389 kvm_queue_exception(vcpu, UD_VECTOR); 7390 return 1; 7391 } 7392 7393 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7394 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 7395 7396 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 7397 7398 if (!(types & (1UL << type))) { 7399 nested_vmx_failValid(vcpu, 7400 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7401 return 1; 7402 } 7403 7404 /* According to the Intel VMX instruction reference, the memory 7405 * operand is read even if it isn't needed (e.g., for type==global) 7406 */ 7407 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 7408 vmx_instruction_info, false, &gva)) 7409 return 1; 7410 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, 7411 sizeof(operand), &e)) { 7412 kvm_inject_page_fault(vcpu, &e); 7413 return 1; 7414 } 7415 7416 switch (type) { 7417 case VMX_EPT_EXTENT_GLOBAL: 7418 kvm_mmu_sync_roots(vcpu); 7419 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 7420 nested_vmx_succeed(vcpu); 7421 break; 7422 default: 7423 /* Trap single context invalidation invept calls */ 7424 BUG_ON(1); 7425 break; 7426 } 7427 7428 skip_emulated_instruction(vcpu); 7429 return 1; 7430} 7431 7432static int handle_invvpid(struct kvm_vcpu *vcpu) 7433{ 7434 struct vcpu_vmx *vmx = to_vmx(vcpu); 7435 u32 vmx_instruction_info; 7436 unsigned long type, types; 7437 gva_t gva; 7438 struct x86_exception e; 7439 int vpid; 7440 7441 if (!(vmx->nested.nested_vmx_secondary_ctls_high & 7442 SECONDARY_EXEC_ENABLE_VPID) || 7443 !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { 7444 kvm_queue_exception(vcpu, UD_VECTOR); 7445 return 1; 7446 } 7447 7448 if (!nested_vmx_check_permission(vcpu)) 7449 return 1; 7450 7451 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7452 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 7453 7454 types = 
(vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; 7455 7456 if (!(types & (1UL << type))) { 7457 nested_vmx_failValid(vcpu, 7458 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7459 return 1; 7460 } 7461 7462 /* according to the intel vmx instruction reference, the memory 7463 * operand is read even if it isn't needed (e.g., for type==global) 7464 */ 7465 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 7466 vmx_instruction_info, false, &gva)) 7467 return 1; 7468 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, 7469 sizeof(u32), &e)) { 7470 kvm_inject_page_fault(vcpu, &e); 7471 return 1; 7472 } 7473 7474 switch (type) { 7475 case VMX_VPID_EXTENT_ALL_CONTEXT: 7476 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); 7477 nested_vmx_succeed(vcpu); 7478 break; 7479 default: 7480 /* Trap single context invalidation invvpid calls */ 7481 BUG_ON(1); 7482 break; 7483 } 7484 7485 skip_emulated_instruction(vcpu); 7486 return 1; 7487} 7488 7489static int handle_pml_full(struct kvm_vcpu *vcpu) 7490{ 7491 unsigned long exit_qualification; 7492 7493 trace_kvm_pml_full(vcpu->vcpu_id); 7494 7495 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7496 7497 /* 7498 * PML buffer FULL happened while executing iret from NMI, 7499 * "blocked by NMI" bit has to be set before next VM entry. 7500 */ 7501 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 7502 cpu_has_virtual_nmis() && 7503 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 7504 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7505 GUEST_INTR_STATE_NMI); 7506 7507 /* 7508 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 7509 * here.., and there's no userspace involvement needed for PML. 7510 */ 7511 return 1; 7512} 7513 7514static int handle_pcommit(struct kvm_vcpu *vcpu) 7515{ 7516 /* we never catch pcommit instruct for L1 guest. */ 7517 WARN_ON(1); 7518 return 1; 7519} 7520 7521/* 7522 * The exit handlers return 1 if the exit was handled fully and guest execution 7523 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7524 * to be done to userspace and return 0. 
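 * The table below is indexed by the basic exit reason; its size is
 * recorded in kvm_vmx_max_exit_handlers right after it.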
7525 */ 7526static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 7527 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 7528 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 7529 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 7530 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 7531 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 7532 [EXIT_REASON_CR_ACCESS] = handle_cr, 7533 [EXIT_REASON_DR_ACCESS] = handle_dr, 7534 [EXIT_REASON_CPUID] = handle_cpuid, 7535 [EXIT_REASON_MSR_READ] = handle_rdmsr, 7536 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 7537 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 7538 [EXIT_REASON_HLT] = handle_halt, 7539 [EXIT_REASON_INVD] = handle_invd, 7540 [EXIT_REASON_INVLPG] = handle_invlpg, 7541 [EXIT_REASON_RDPMC] = handle_rdpmc, 7542 [EXIT_REASON_VMCALL] = handle_vmcall, 7543 [EXIT_REASON_VMCLEAR] = handle_vmclear, 7544 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, 7545 [EXIT_REASON_VMPTRLD] = handle_vmptrld, 7546 [EXIT_REASON_VMPTRST] = handle_vmptrst, 7547 [EXIT_REASON_VMREAD] = handle_vmread, 7548 [EXIT_REASON_VMRESUME] = handle_vmresume, 7549 [EXIT_REASON_VMWRITE] = handle_vmwrite, 7550 [EXIT_REASON_VMOFF] = handle_vmoff, 7551 [EXIT_REASON_VMON] = handle_vmon, 7552 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 7553 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 7554 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 7555 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 7556 [EXIT_REASON_WBINVD] = handle_wbinvd, 7557 [EXIT_REASON_XSETBV] = handle_xsetbv, 7558 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 7559 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 7560 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 7561 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 7562 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 7563 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 7564 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 7565 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 7566 [EXIT_REASON_INVEPT] = handle_invept, 7567 [EXIT_REASON_INVVPID] = handle_invvpid, 7568 [EXIT_REASON_XSAVES] = handle_xsaves, 7569 [EXIT_REASON_XRSTORS] = handle_xrstors, 7570 [EXIT_REASON_PML_FULL] = handle_pml_full, 7571 [EXIT_REASON_PCOMMIT] = handle_pcommit, 7572}; 7573 7574static const int kvm_vmx_max_exit_handlers = 7575 ARRAY_SIZE(kvm_vmx_exit_handlers); 7576 7577static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 7578 struct vmcs12 *vmcs12) 7579{ 7580 unsigned long exit_qualification; 7581 gpa_t bitmap, last_bitmap; 7582 unsigned int port; 7583 int size; 7584 u8 b; 7585 7586 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 7587 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 7588 7589 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7590 7591 port = exit_qualification >> 16; 7592 size = (exit_qualification & 7) + 1; 7593 7594 last_bitmap = (gpa_t)-1; 7595 b = -1; 7596 7597 while (size > 0) { 7598 if (port < 0x8000) 7599 bitmap = vmcs12->io_bitmap_a; 7600 else if (port < 0x10000) 7601 bitmap = vmcs12->io_bitmap_b; 7602 else 7603 return true; 7604 bitmap += (port & 0x7fff) / 8; 7605 7606 if (last_bitmap != bitmap) 7607 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 7608 return true; 7609 if (b & (1 << (port & 7))) 7610 return true; 7611 7612 port++; 7613 size--; 7614 last_bitmap = bitmap; 7615 } 7616 7617 return false; 7618} 7619 7620/* 7621 * Return 1 if we should exit from L2 to L1 to handle an MSR access access, 7622 * rather than handle it ourselves in L0. 
I.e., check whether L1 expressed 7623 * disinterest in the current event (read or write a specific MSR) by using an 7624 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 7625 */ 7626static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 7627 struct vmcs12 *vmcs12, u32 exit_reason) 7628{ 7629 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 7630 gpa_t bitmap; 7631 7632 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 7633 return true; 7634 7635 /* 7636 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 7637 * for the four combinations of read/write and low/high MSR numbers. 7638 * First we need to figure out which of the four to use: 7639 */ 7640 bitmap = vmcs12->msr_bitmap; 7641 if (exit_reason == EXIT_REASON_MSR_WRITE) 7642 bitmap += 2048; 7643 if (msr_index >= 0xc0000000) { 7644 msr_index -= 0xc0000000; 7645 bitmap += 1024; 7646 } 7647 7648 /* Then read the msr_index'th bit from this bitmap: */ 7649 if (msr_index < 1024*8) { 7650 unsigned char b; 7651 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 7652 return true; 7653 return 1 & (b >> (msr_index & 7)); 7654 } else 7655 return true; /* let L1 handle the wrong parameter */ 7656} 7657 7658/* 7659 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 7660 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 7661 * intercept (via guest_host_mask etc.) the current event. 7662 */ 7663static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 7664 struct vmcs12 *vmcs12) 7665{ 7666 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7667 int cr = exit_qualification & 15; 7668 int reg = (exit_qualification >> 8) & 15; 7669 unsigned long val = kvm_register_readl(vcpu, reg); 7670 7671 switch ((exit_qualification >> 4) & 3) { 7672 case 0: /* mov to cr */ 7673 switch (cr) { 7674 case 0: 7675 if (vmcs12->cr0_guest_host_mask & 7676 (val ^ vmcs12->cr0_read_shadow)) 7677 return true; 7678 break; 7679 case 3: 7680 if ((vmcs12->cr3_target_count >= 1 && 7681 vmcs12->cr3_target_value0 == val) || 7682 (vmcs12->cr3_target_count >= 2 && 7683 vmcs12->cr3_target_value1 == val) || 7684 (vmcs12->cr3_target_count >= 3 && 7685 vmcs12->cr3_target_value2 == val) || 7686 (vmcs12->cr3_target_count >= 4 && 7687 vmcs12->cr3_target_value3 == val)) 7688 return false; 7689 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 7690 return true; 7691 break; 7692 case 4: 7693 if (vmcs12->cr4_guest_host_mask & 7694 (vmcs12->cr4_read_shadow ^ val)) 7695 return true; 7696 break; 7697 case 8: 7698 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 7699 return true; 7700 break; 7701 } 7702 break; 7703 case 2: /* clts */ 7704 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 7705 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 7706 return true; 7707 break; 7708 case 1: /* mov from cr */ 7709 switch (cr) { 7710 case 3: 7711 if (vmcs12->cpu_based_vm_exec_control & 7712 CPU_BASED_CR3_STORE_EXITING) 7713 return true; 7714 break; 7715 case 8: 7716 if (vmcs12->cpu_based_vm_exec_control & 7717 CPU_BASED_CR8_STORE_EXITING) 7718 return true; 7719 break; 7720 } 7721 break; 7722 case 3: /* lmsw */ 7723 /* 7724 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 7725 * cr0. Other attempted changes are ignored, with no exit. 
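 * For illustration (hypothetical values): if L1 owns TS, i.e. bit 3 of
 * cr0_guest_host_mask is set, the read shadow has TS=0 and the lmsw
 * source operand has TS=1, then 0xe & (val ^ read_shadow) has bit 3 set
 * and the first check below reflects the exit to L1.  The second check
 * reflects an attempt to set PE (bit 0) while the shadow shows PE=0,
 * but only when L1 owns PE.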
7726 */ 7727 if (vmcs12->cr0_guest_host_mask & 0xe & 7728 (val ^ vmcs12->cr0_read_shadow)) 7729 return true; 7730 if ((vmcs12->cr0_guest_host_mask & 0x1) && 7731 !(vmcs12->cr0_read_shadow & 0x1) && 7732 (val & 0x1)) 7733 return true; 7734 break; 7735 } 7736 return false; 7737} 7738 7739/* 7740 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 7741 * should handle it ourselves in L0 (and then continue L2). Only call this 7742 * when in is_guest_mode (L2). 7743 */ 7744static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) 7745{ 7746 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7747 struct vcpu_vmx *vmx = to_vmx(vcpu); 7748 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7749 u32 exit_reason = vmx->exit_reason; 7750 7751 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, 7752 vmcs_readl(EXIT_QUALIFICATION), 7753 vmx->idt_vectoring_info, 7754 intr_info, 7755 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 7756 KVM_ISA_VMX); 7757 7758 if (vmx->nested.nested_run_pending) 7759 return false; 7760 7761 if (unlikely(vmx->fail)) { 7762 pr_info_ratelimited("%s failed vm entry %x\n", __func__, 7763 vmcs_read32(VM_INSTRUCTION_ERROR)); 7764 return true; 7765 } 7766 7767 switch (exit_reason) { 7768 case EXIT_REASON_EXCEPTION_NMI: 7769 if (!is_exception(intr_info)) 7770 return false; 7771 else if (is_page_fault(intr_info)) 7772 return enable_ept; 7773 else if (is_no_device(intr_info) && 7774 !(vmcs12->guest_cr0 & X86_CR0_TS)) 7775 return false; 7776 return vmcs12->exception_bitmap & 7777 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 7778 case EXIT_REASON_EXTERNAL_INTERRUPT: 7779 return false; 7780 case EXIT_REASON_TRIPLE_FAULT: 7781 return true; 7782 case EXIT_REASON_PENDING_INTERRUPT: 7783 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 7784 case EXIT_REASON_NMI_WINDOW: 7785 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 7786 case EXIT_REASON_TASK_SWITCH: 7787 return true; 7788 case EXIT_REASON_CPUID: 7789 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) 7790 return false; 7791 return true; 7792 case EXIT_REASON_HLT: 7793 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 7794 case EXIT_REASON_INVD: 7795 return true; 7796 case EXIT_REASON_INVLPG: 7797 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 7798 case EXIT_REASON_RDPMC: 7799 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 7800 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 7801 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 7802 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 7803 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 7804 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 7805 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 7806 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 7807 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 7808 /* 7809 * VMX instructions trap unconditionally. This allows L1 to 7810 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
7811 */ 7812 return true; 7813 case EXIT_REASON_CR_ACCESS: 7814 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 7815 case EXIT_REASON_DR_ACCESS: 7816 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 7817 case EXIT_REASON_IO_INSTRUCTION: 7818 return nested_vmx_exit_handled_io(vcpu, vmcs12); 7819 case EXIT_REASON_MSR_READ: 7820 case EXIT_REASON_MSR_WRITE: 7821 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 7822 case EXIT_REASON_INVALID_STATE: 7823 return true; 7824 case EXIT_REASON_MWAIT_INSTRUCTION: 7825 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 7826 case EXIT_REASON_MONITOR_TRAP_FLAG: 7827 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); 7828 case EXIT_REASON_MONITOR_INSTRUCTION: 7829 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 7830 case EXIT_REASON_PAUSE_INSTRUCTION: 7831 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 7832 nested_cpu_has2(vmcs12, 7833 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 7834 case EXIT_REASON_MCE_DURING_VMENTRY: 7835 return false; 7836 case EXIT_REASON_TPR_BELOW_THRESHOLD: 7837 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 7838 case EXIT_REASON_APIC_ACCESS: 7839 return nested_cpu_has2(vmcs12, 7840 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 7841 case EXIT_REASON_APIC_WRITE: 7842 case EXIT_REASON_EOI_INDUCED: 7843 /* apic_write and eoi_induced should exit unconditionally. */ 7844 return true; 7845 case EXIT_REASON_EPT_VIOLATION: 7846 /* 7847 * L0 always deals with the EPT violation. If nested EPT is 7848 * used, and the nested mmu code discovers that the address is 7849 * missing in the guest EPT table (EPT12), the EPT violation 7850 * will be injected with nested_ept_inject_page_fault() 7851 */ 7852 return false; 7853 case EXIT_REASON_EPT_MISCONFIG: 7854 /* 7855 * L2 never uses L1's EPT directly, but rather L0's own EPT 7856 * table (shadow on EPT) or a merged EPT table that L0 built 7857 * (EPT on EPT). So any problem with the structure of the 7858 * table is L0's fault. 7859 */ 7860 return false; 7861 case EXIT_REASON_WBINVD: 7862 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 7863 case EXIT_REASON_XSETBV: 7864 return true; 7865 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 7866 /* 7867 * This should never happen, since it is not possible to 7868 * set XSS to a non-zero value---neither in L1 nor in L2. 7869 * If it were, XSS would have to be checked against 7870 * the XSS exit bitmap in vmcs12.
7871 */ 7872 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 7873 case EXIT_REASON_PCOMMIT: 7874 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); 7875 default: 7876 return true; 7877 } 7878} 7879 7880static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 7881{ 7882 *info1 = vmcs_readl(EXIT_QUALIFICATION); 7883 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 7884} 7885 7886static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) 7887{ 7888 struct page *pml_pg; 7889 7890 pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); 7891 if (!pml_pg) 7892 return -ENOMEM; 7893 7894 vmx->pml_pg = pml_pg; 7895 7896 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 7897 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 7898 7899 return 0; 7900} 7901 7902static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 7903{ 7904 if (vmx->pml_pg) { 7905 __free_page(vmx->pml_pg); 7906 vmx->pml_pg = NULL; 7907 } 7908} 7909 7910static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 7911{ 7912 struct vcpu_vmx *vmx = to_vmx(vcpu); 7913 u64 *pml_buf; 7914 u16 pml_idx; 7915 7916 pml_idx = vmcs_read16(GUEST_PML_INDEX); 7917 7918 /* Do nothing if PML buffer is empty */ 7919 if (pml_idx == (PML_ENTITY_NUM - 1)) 7920 return; 7921 7922 /* PML index always points to next available PML buffer entity */ 7923 if (pml_idx >= PML_ENTITY_NUM) 7924 pml_idx = 0; 7925 else 7926 pml_idx++; 7927 7928 pml_buf = page_address(vmx->pml_pg); 7929 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 7930 u64 gpa; 7931 7932 gpa = pml_buf[pml_idx]; 7933 WARN_ON(gpa & (PAGE_SIZE - 1)); 7934 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 7935 } 7936 7937 /* reset PML index */ 7938 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 7939} 7940 7941/* 7942 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. 7943 * Called before reporting dirty_bitmap to userspace. 7944 */ 7945static void kvm_flush_pml_buffers(struct kvm *kvm) 7946{ 7947 int i; 7948 struct kvm_vcpu *vcpu; 7949 /* 7950 * We only need to kick vcpu out of guest mode here, as PML buffer 7951 * is flushed at beginning of all VMEXITs, and it's obvious that only 7952 * vcpus running in guest are possible to have unflushed GPAs in PML 7953 * buffer. 
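 * (The kick forces a VM exit, and vmx_handle_exit() calls
 * vmx_flush_pml_buffer() right at the start when enable_pml is set, so
 * once the kick has been processed every vcpu's PML buffer has been
 * drained into the dirty bitmap.)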
7954 */ 7955 kvm_for_each_vcpu(i, vcpu, kvm) 7956 kvm_vcpu_kick(vcpu); 7957} 7958 7959static void vmx_dump_sel(char *name, uint32_t sel) 7960{ 7961 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 7962 name, vmcs_read32(sel), 7963 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 7964 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 7965 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 7966} 7967 7968static void vmx_dump_dtsel(char *name, uint32_t limit) 7969{ 7970 pr_err("%s limit=0x%08x, base=0x%016lx\n", 7971 name, vmcs_read32(limit), 7972 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 7973} 7974 7975static void dump_vmcs(void) 7976{ 7977 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 7978 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 7979 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 7980 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 7981 u32 secondary_exec_control = 0; 7982 unsigned long cr4 = vmcs_readl(GUEST_CR4); 7983 u64 efer = vmcs_read64(GUEST_IA32_EFER); 7984 int i, n; 7985 7986 if (cpu_has_secondary_exec_ctrls()) 7987 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7988 7989 pr_err("*** Guest State ***\n"); 7990 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 7991 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 7992 vmcs_readl(CR0_GUEST_HOST_MASK)); 7993 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 7994 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 7995 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 7996 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 7997 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) 7998 { 7999 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 8000 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 8001 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 8002 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 8003 } 8004 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 8005 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 8006 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 8007 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 8008 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 8009 vmcs_readl(GUEST_SYSENTER_ESP), 8010 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 8011 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 8012 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 8013 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 8014 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 8015 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 8016 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 8017 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 8018 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 8019 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 8020 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 8021 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || 8022 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) 8023 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", 8024 efer, vmcs_read64(GUEST_IA32_PAT)); 8025 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 8026 vmcs_read64(GUEST_IA32_DEBUGCTL), 8027 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 8028 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 8029 pr_err("PerfGlobCtl = 0x%016llx\n", 8030 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 8031 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 8032 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 8033 pr_err("Interruptibility = %08x ActivityState = %08x\n", 8034 
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 8035 vmcs_read32(GUEST_ACTIVITY_STATE)); 8036 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 8037 pr_err("InterruptStatus = %04x\n", 8038 vmcs_read16(GUEST_INTR_STATUS)); 8039 8040 pr_err("*** Host State ***\n"); 8041 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 8042 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 8043 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 8044 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 8045 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 8046 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 8047 vmcs_read16(HOST_TR_SELECTOR)); 8048 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 8049 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 8050 vmcs_readl(HOST_TR_BASE)); 8051 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 8052 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 8053 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 8054 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 8055 vmcs_readl(HOST_CR4)); 8056 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 8057 vmcs_readl(HOST_IA32_SYSENTER_ESP), 8058 vmcs_read32(HOST_IA32_SYSENTER_CS), 8059 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 8060 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) 8061 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", 8062 vmcs_read64(HOST_IA32_EFER), 8063 vmcs_read64(HOST_IA32_PAT)); 8064 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 8065 pr_err("PerfGlobCtl = 0x%016llx\n", 8066 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 8067 8068 pr_err("*** Control State ***\n"); 8069 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", 8070 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); 8071 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); 8072 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 8073 vmcs_read32(EXCEPTION_BITMAP), 8074 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 8075 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 8076 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 8077 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 8078 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 8079 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 8080 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 8081 vmcs_read32(VM_EXIT_INTR_INFO), 8082 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 8083 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 8084 pr_err(" reason=%08x qualification=%016lx\n", 8085 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 8086 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 8087 vmcs_read32(IDT_VECTORING_INFO_FIELD), 8088 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 8089 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 8090 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 8091 pr_err("TSC Multiplier = 0x%016llx\n", 8092 vmcs_read64(TSC_MULTIPLIER)); 8093 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) 8094 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 8095 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 8096 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 8097 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 8098 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 8099 n = vmcs_read32(CR3_TARGET_COUNT); 8100 for (i = 0; i + 1 < n; i += 4) 8101 pr_err("CR3 target%u=%016lx target%u=%016lx\n", 8102 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), 8103 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); 8104 if (i < n) 8105 pr_err("CR3 
target%u=%016lx\n", 8106 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); 8107 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 8108 pr_err("PLE Gap=%08x Window=%08x\n", 8109 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 8110 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 8111 pr_err("Virtual processor ID = 0x%04x\n", 8112 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 8113} 8114 8115/* 8116 * The guest has exited. See if we can fix it or if we need userspace 8117 * assistance. 8118 */ 8119static int vmx_handle_exit(struct kvm_vcpu *vcpu) 8120{ 8121 struct vcpu_vmx *vmx = to_vmx(vcpu); 8122 u32 exit_reason = vmx->exit_reason; 8123 u32 vectoring_info = vmx->idt_vectoring_info; 8124 8125 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); 8126 8127 /* 8128 * Flush the PML buffer of logged GPAs so that dirty_bitmap is more 8129 * up to date. Another benefit: in kvm_vm_ioctl_get_dirty_log, before 8130 * querying dirty_bitmap we only need to kick all vcpus out of guest 8131 * mode, since once a vcpu is in root mode its PML buffer must already 8132 * have been flushed. 8133 */ 8134 if (enable_pml) 8135 vmx_flush_pml_buffer(vcpu); 8136 8137 /* If guest state is invalid, start emulating */ 8138 if (vmx->emulation_required) 8139 return handle_invalid_guest_state(vcpu); 8140 8141 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { 8142 nested_vmx_vmexit(vcpu, exit_reason, 8143 vmcs_read32(VM_EXIT_INTR_INFO), 8144 vmcs_readl(EXIT_QUALIFICATION)); 8145 return 1; 8146 } 8147 8148 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 8149 dump_vmcs(); 8150 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 8151 vcpu->run->fail_entry.hardware_entry_failure_reason 8152 = exit_reason; 8153 return 0; 8154 } 8155 8156 if (unlikely(vmx->fail)) { 8157 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 8158 vcpu->run->fail_entry.hardware_entry_failure_reason 8159 = vmcs_read32(VM_INSTRUCTION_ERROR); 8160 return 0; 8161 } 8162 8163 /* 8164 * Note: 8165 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by 8166 * event delivery, since that indicates the guest is accessing MMIO. 8167 * The vm-exit can be triggered again after returning to the guest, 8168 * which would cause an infinite loop. 8169 */ 8170 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 8171 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 8172 exit_reason != EXIT_REASON_EPT_VIOLATION && 8173 exit_reason != EXIT_REASON_TASK_SWITCH)) { 8174 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 8175 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 8176 vcpu->run->internal.ndata = 2; 8177 vcpu->run->internal.data[0] = vectoring_info; 8178 vcpu->run->internal.data[1] = exit_reason; 8179 return 0; 8180 } 8181 8182 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 8183 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 8184 get_vmcs12(vcpu))))) { 8185 if (vmx_interrupt_allowed(vcpu)) { 8186 vmx->soft_vnmi_blocked = 0; 8187 } else if (vmx->vnmi_blocked_time > 1000000000LL && 8188 vcpu->arch.nmi_pending) { 8189 /* 8190 * This CPU doesn't help us detect the end of an 8191 * NMI-blocked window if the guest runs with IRQs 8192 * disabled. So we pull the trigger after 1 s of 8193 * futile waiting, but inform the user about this.
8194 */ 8195 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 8196 "state on VCPU %d after 1 s timeout\n", 8197 __func__, vcpu->vcpu_id); 8198 vmx->soft_vnmi_blocked = 0; 8199 } 8200 } 8201 8202 if (exit_reason < kvm_vmx_max_exit_handlers 8203 && kvm_vmx_exit_handlers[exit_reason]) 8204 return kvm_vmx_exit_handlers[exit_reason](vcpu); 8205 else { 8206 WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); 8207 kvm_queue_exception(vcpu, UD_VECTOR); 8208 return 1; 8209 } 8210} 8211 8212static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 8213{ 8214 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8215 8216 if (is_guest_mode(vcpu) && 8217 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 8218 return; 8219 8220 if (irr == -1 || tpr < irr) { 8221 vmcs_write32(TPR_THRESHOLD, 0); 8222 return; 8223 } 8224 8225 vmcs_write32(TPR_THRESHOLD, irr); 8226} 8227 8228static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) 8229{ 8230 u32 sec_exec_control; 8231 8232 /* 8233 * There is no point in enabling virtualize x2apic mode without 8234 * enabling apicv. 8235 */ 8236 if (!cpu_has_vmx_virtualize_x2apic_mode() || 8237 !kvm_vcpu_apicv_active(vcpu)) 8238 return; 8239 8240 if (!cpu_need_tpr_shadow(vcpu)) 8241 return; 8242 8243 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8244 8245 if (set) { 8246 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 8247 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 8248 } else { 8249 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 8250 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 8251 } 8252 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); 8253 8254 vmx_set_msr_bitmap(vcpu); 8255} 8256 8257static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) 8258{ 8259 struct vcpu_vmx *vmx = to_vmx(vcpu); 8260 8261 /* 8262 * Currently we do not handle the nested case where L2 has an 8263 * APIC access page of its own; that page is still pinned. 8264 * Hence, we skip the case where the VCPU is in guest mode _and_ 8265 * L1 prepared an APIC access page for L2. 8266 * 8267 * For the case where L1 and L2 share the same APIC access page 8268 * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear 8269 * in the vmcs12), this function will only update either the vmcs01 8270 * or the vmcs02. If the former, the vmcs02 will be updated by 8271 * prepare_vmcs02. If the latter, the vmcs01 will be updated in 8272 * the next L2->L1 exit.
8273 */ 8274 if (!is_guest_mode(vcpu) || 8275 !nested_cpu_has2(vmx->nested.current_vmcs12, 8276 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 8277 vmcs_write64(APIC_ACCESS_ADDR, hpa); 8278} 8279 8280static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) 8281{ 8282 u16 status; 8283 u8 old; 8284 8285 if (isr == -1) 8286 isr = 0; 8287 8288 status = vmcs_read16(GUEST_INTR_STATUS); 8289 old = status >> 8; 8290 if (isr != old) { 8291 status &= 0xff; 8292 status |= isr << 8; 8293 vmcs_write16(GUEST_INTR_STATUS, status); 8294 } 8295} 8296 8297static void vmx_set_rvi(int vector) 8298{ 8299 u16 status; 8300 u8 old; 8301 8302 if (vector == -1) 8303 vector = 0; 8304 8305 status = vmcs_read16(GUEST_INTR_STATUS); 8306 old = (u8)status & 0xff; 8307 if ((u8)vector != old) { 8308 status &= ~0xff; 8309 status |= (u8)vector; 8310 vmcs_write16(GUEST_INTR_STATUS, status); 8311 } 8312} 8313 8314static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 8315{ 8316 if (!is_guest_mode(vcpu)) { 8317 vmx_set_rvi(max_irr); 8318 return; 8319 } 8320 8321 if (max_irr == -1) 8322 return; 8323 8324 /* 8325 * In guest mode. If a vmexit is needed, vmx_check_nested_events 8326 * handles it. 8327 */ 8328 if (nested_exit_on_intr(vcpu)) 8329 return; 8330 8331 /* 8332 * Else, fall back to pre-APICv interrupt injection since L2 8333 * is run without virtual interrupt delivery. 8334 */ 8335 if (!kvm_event_needs_reinjection(vcpu) && 8336 vmx_interrupt_allowed(vcpu)) { 8337 kvm_queue_interrupt(vcpu, max_irr, false); 8338 vmx_inject_irq(vcpu); 8339 } 8340} 8341 8342static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 8343{ 8344 if (!kvm_vcpu_apicv_active(vcpu)) 8345 return; 8346 8347 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 8348 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 8349 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 8350 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 8351} 8352 8353static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 8354{ 8355 u32 exit_intr_info; 8356 8357 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY 8358 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) 8359 return; 8360 8361 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8362 exit_intr_info = vmx->exit_intr_info; 8363 8364 /* Handle machine checks before interrupts are enabled */ 8365 if (is_machine_check(exit_intr_info)) 8366 kvm_machine_check(); 8367 8368 /* We need to handle NMIs before interrupts are enabled */ 8369 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 8370 (exit_intr_info & INTR_INFO_VALID_MASK)) { 8371 kvm_before_handle_nmi(&vmx->vcpu); 8372 asm("int $2"); 8373 kvm_after_handle_nmi(&vmx->vcpu); 8374 } 8375} 8376 8377static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) 8378{ 8379 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8380 8381 /* 8382 * If external interrupt exists, IF bit is set in rflags/eflags on the 8383 * interrupt stack frame, and interrupt will be enabled on a return 8384 * from interrupt handler. 
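 * As a reading aid for the asm below: on x86_64 it first aligns the
 * stack and pushes SS and the saved RSP; it then pushes RFLAGS with IF
 * (0x200) set, pushes CS, and calls the host IDT entry for the vector,
 * so the handler's iret unwinds that synthetic frame and re-enables
 * interrupts.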
8385 */ 8386 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) 8387 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { 8388 unsigned int vector; 8389 unsigned long entry; 8390 gate_desc *desc; 8391 struct vcpu_vmx *vmx = to_vmx(vcpu); 8392#ifdef CONFIG_X86_64 8393 unsigned long tmp; 8394#endif 8395 8396 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 8397 desc = (gate_desc *)vmx->host_idt_base + vector; 8398 entry = gate_offset(*desc); 8399 asm volatile( 8400#ifdef CONFIG_X86_64 8401 "mov %%" _ASM_SP ", %[sp]\n\t" 8402 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" 8403 "push $%c[ss]\n\t" 8404 "push %[sp]\n\t" 8405#endif 8406 "pushf\n\t" 8407 "orl $0x200, (%%" _ASM_SP ")\n\t" 8408 __ASM_SIZE(push) " $%c[cs]\n\t" 8409 "call *%[entry]\n\t" 8410 : 8411#ifdef CONFIG_X86_64 8412 [sp]"=&r"(tmp) 8413#endif 8414 : 8415 [entry]"r"(entry), 8416 [ss]"i"(__KERNEL_DS), 8417 [cs]"i"(__KERNEL_CS) 8418 ); 8419 } else 8420 local_irq_enable(); 8421} 8422 8423static bool vmx_has_high_real_mode_segbase(void) 8424{ 8425 return enable_unrestricted_guest || emulate_invalid_guest_state; 8426} 8427 8428static bool vmx_mpx_supported(void) 8429{ 8430 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && 8431 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); 8432} 8433 8434static bool vmx_xsaves_supported(void) 8435{ 8436 return vmcs_config.cpu_based_2nd_exec_ctrl & 8437 SECONDARY_EXEC_XSAVES; 8438} 8439 8440static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 8441{ 8442 u32 exit_intr_info; 8443 bool unblock_nmi; 8444 u8 vector; 8445 bool idtv_info_valid; 8446 8447 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 8448 8449 if (cpu_has_virtual_nmis()) { 8450 if (vmx->nmi_known_unmasked) 8451 return; 8452 /* 8453 * Can't use vmx->exit_intr_info since we're not sure what 8454 * the exit reason is. 8455 */ 8456 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8457 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 8458 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 8459 /* 8460 * SDM 3: 27.7.1.2 (September 2008) 8461 * Re-set bit "block by NMI" before VM entry if vmexit caused by 8462 * a guest IRET fault. 8463 * SDM 3: 23.2.2 (September 2008) 8464 * Bit 12 is undefined in any of the following cases: 8465 * If the VM exit sets the valid bit in the IDT-vectoring 8466 * information field. 8467 * If the VM exit is due to a double fault. 
8468 */ 8469 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 8470 vector != DF_VECTOR && !idtv_info_valid) 8471 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 8472 GUEST_INTR_STATE_NMI); 8473 else 8474 vmx->nmi_known_unmasked = 8475 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 8476 & GUEST_INTR_STATE_NMI); 8477 } else if (unlikely(vmx->soft_vnmi_blocked)) 8478 vmx->vnmi_blocked_time += 8479 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 8480} 8481 8482static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 8483 u32 idt_vectoring_info, 8484 int instr_len_field, 8485 int error_code_field) 8486{ 8487 u8 vector; 8488 int type; 8489 bool idtv_info_valid; 8490 8491 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 8492 8493 vcpu->arch.nmi_injected = false; 8494 kvm_clear_exception_queue(vcpu); 8495 kvm_clear_interrupt_queue(vcpu); 8496 8497 if (!idtv_info_valid) 8498 return; 8499 8500 kvm_make_request(KVM_REQ_EVENT, vcpu); 8501 8502 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 8503 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 8504 8505 switch (type) { 8506 case INTR_TYPE_NMI_INTR: 8507 vcpu->arch.nmi_injected = true; 8508 /* 8509 * SDM 3: 27.7.1.2 (September 2008) 8510 * Clear bit "block by NMI" before VM entry if a NMI 8511 * delivery faulted. 8512 */ 8513 vmx_set_nmi_mask(vcpu, false); 8514 break; 8515 case INTR_TYPE_SOFT_EXCEPTION: 8516 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 8517 /* fall through */ 8518 case INTR_TYPE_HARD_EXCEPTION: 8519 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 8520 u32 err = vmcs_read32(error_code_field); 8521 kvm_requeue_exception_e(vcpu, vector, err); 8522 } else 8523 kvm_requeue_exception(vcpu, vector); 8524 break; 8525 case INTR_TYPE_SOFT_INTR: 8526 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 8527 /* fall through */ 8528 case INTR_TYPE_EXT_INTR: 8529 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 8530 break; 8531 default: 8532 break; 8533 } 8534} 8535 8536static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 8537{ 8538 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 8539 VM_EXIT_INSTRUCTION_LEN, 8540 IDT_VECTORING_ERROR_CODE); 8541} 8542 8543static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 8544{ 8545 __vmx_complete_interrupts(vcpu, 8546 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 8547 VM_ENTRY_INSTRUCTION_LEN, 8548 VM_ENTRY_EXCEPTION_ERROR_CODE); 8549 8550 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 8551} 8552 8553static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 8554{ 8555 int i, nr_msrs; 8556 struct perf_guest_switch_msr *msrs; 8557 8558 msrs = perf_guest_get_msrs(&nr_msrs); 8559 8560 if (!msrs) 8561 return; 8562 8563 for (i = 0; i < nr_msrs; i++) 8564 if (msrs[i].host == msrs[i].guest) 8565 clear_atomic_switch_msr(vmx, msrs[i].msr); 8566 else 8567 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 8568 msrs[i].host); 8569} 8570 8571static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 8572{ 8573 struct vcpu_vmx *vmx = to_vmx(vcpu); 8574 unsigned long debugctlmsr, cr4; 8575 8576 /* Record the guest's net vcpu time for enforced NMI injections. 
*/ 8577 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 8578 vmx->entry_time = ktime_get(); 8579 8580 /* Don't enter VMX if guest state is invalid, let the exit handler 8581 start emulation until we arrive back to a valid state */ 8582 if (vmx->emulation_required) 8583 return; 8584 8585 if (vmx->ple_window_dirty) { 8586 vmx->ple_window_dirty = false; 8587 vmcs_write32(PLE_WINDOW, vmx->ple_window); 8588 } 8589 8590 if (vmx->nested.sync_shadow_vmcs) { 8591 copy_vmcs12_to_shadow(vmx); 8592 vmx->nested.sync_shadow_vmcs = false; 8593 } 8594 8595 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 8596 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 8597 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 8598 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 8599 8600 cr4 = cr4_read_shadow(); 8601 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { 8602 vmcs_writel(HOST_CR4, cr4); 8603 vmx->host_state.vmcs_host_cr4 = cr4; 8604 } 8605 8606 /* When single-stepping over STI and MOV SS, we must clear the 8607 * corresponding interruptibility bits in the guest state. Otherwise 8608 * vmentry fails as it then expects bit 14 (BS) in pending debug 8609 * exceptions being set, but that's not correct for the guest debugging 8610 * case. */ 8611 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 8612 vmx_set_interrupt_shadow(vcpu, 0); 8613 8614 atomic_switch_perf_msrs(vmx); 8615 debugctlmsr = get_debugctlmsr(); 8616 8617 vmx->__launched = vmx->loaded_vmcs->launched; 8618 asm( 8619 /* Store host registers */ 8620 "push %%" _ASM_DX "; push %%" _ASM_BP ";" 8621 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ 8622 "push %%" _ASM_CX " \n\t" 8623 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 8624 "je 1f \n\t" 8625 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 8626 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 8627 "1: \n\t" 8628 /* Reload cr2 if changed */ 8629 "mov %c[cr2](%0), %%" _ASM_AX " \n\t" 8630 "mov %%cr2, %%" _ASM_DX " \n\t" 8631 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" 8632 "je 2f \n\t" 8633 "mov %%" _ASM_AX", %%cr2 \n\t" 8634 "2: \n\t" 8635 /* Check if vmlaunch of vmresume is needed */ 8636 "cmpl $0, %c[launched](%0) \n\t" 8637 /* Load guest registers. Don't clobber flags. 
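 * (%0 is the struct vcpu_vmx pointer, kept in %ecx/%rcx by the "c"(vmx)
 * constraint, and each %c[reg] operand is the compile-time offset of the
 * corresponding vcpu.arch.regs[] slot; RCX itself is reloaded last,
 * below, because doing so destroys %0.  The flags produced by the cmpl
 * above must survive these loads so that the jne afterwards can pick
 * VMRESUME over VMLAUNCH.)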
*/ 8638 "mov %c[rax](%0), %%" _ASM_AX " \n\t" 8639 "mov %c[rbx](%0), %%" _ASM_BX " \n\t" 8640 "mov %c[rdx](%0), %%" _ASM_DX " \n\t" 8641 "mov %c[rsi](%0), %%" _ASM_SI " \n\t" 8642 "mov %c[rdi](%0), %%" _ASM_DI " \n\t" 8643 "mov %c[rbp](%0), %%" _ASM_BP " \n\t" 8644#ifdef CONFIG_X86_64 8645 "mov %c[r8](%0), %%r8 \n\t" 8646 "mov %c[r9](%0), %%r9 \n\t" 8647 "mov %c[r10](%0), %%r10 \n\t" 8648 "mov %c[r11](%0), %%r11 \n\t" 8649 "mov %c[r12](%0), %%r12 \n\t" 8650 "mov %c[r13](%0), %%r13 \n\t" 8651 "mov %c[r14](%0), %%r14 \n\t" 8652 "mov %c[r15](%0), %%r15 \n\t" 8653#endif 8654 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ 8655 8656 /* Enter guest mode */ 8657 "jne 1f \n\t" 8658 __ex(ASM_VMX_VMLAUNCH) "\n\t" 8659 "jmp 2f \n\t" 8660 "1: " __ex(ASM_VMX_VMRESUME) "\n\t" 8661 "2: " 8662 /* Save guest registers, load host registers, keep flags */ 8663 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" 8664 "pop %0 \n\t" 8665 "mov %%" _ASM_AX ", %c[rax](%0) \n\t" 8666 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" 8667 __ASM_SIZE(pop) " %c[rcx](%0) \n\t" 8668 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" 8669 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" 8670 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" 8671 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" 8672#ifdef CONFIG_X86_64 8673 "mov %%r8, %c[r8](%0) \n\t" 8674 "mov %%r9, %c[r9](%0) \n\t" 8675 "mov %%r10, %c[r10](%0) \n\t" 8676 "mov %%r11, %c[r11](%0) \n\t" 8677 "mov %%r12, %c[r12](%0) \n\t" 8678 "mov %%r13, %c[r13](%0) \n\t" 8679 "mov %%r14, %c[r14](%0) \n\t" 8680 "mov %%r15, %c[r15](%0) \n\t" 8681#endif 8682 "mov %%cr2, %%" _ASM_AX " \n\t" 8683 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" 8684 8685 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" 8686 "setbe %c[fail](%0) \n\t" 8687 ".pushsection .rodata \n\t" 8688 ".global vmx_return \n\t" 8689 "vmx_return: " _ASM_PTR " 2b \n\t" 8690 ".popsection" 8691 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 8692 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 8693 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 8694 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 8695 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 8696 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 8697 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 8698 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), 8699 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), 8700 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), 8701 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), 8702#ifdef CONFIG_X86_64 8703 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), 8704 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), 8705 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), 8706 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), 8707 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), 8708 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), 8709 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 8710 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 8711#endif 8712 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), 8713 [wordsize]"i"(sizeof(ulong)) 8714 : "cc", "memory" 8715#ifdef CONFIG_X86_64 8716 , "rax", "rbx", "rdi", "rsi" 8717 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 8718#else 8719 , "eax", "ebx", "edi", "esi" 8720#endif 8721 ); 8722 8723 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. 
Restore it if needed */ 8724 if (debugctlmsr) 8725 update_debugctlmsr(debugctlmsr); 8726 8727#ifndef CONFIG_X86_64 8728 /* 8729 * The sysexit path does not restore ds/es, so we must set them to 8730 * a reasonable value ourselves. 8731 * 8732 * We can't defer this to vmx_load_host_state() since that function 8733 * may be executed in interrupt context, which saves and restore segments 8734 * around it, nullifying its effect. 8735 */ 8736 loadsegment(ds, __USER_DS); 8737 loadsegment(es, __USER_DS); 8738#endif 8739 8740 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 8741 | (1 << VCPU_EXREG_RFLAGS) 8742 | (1 << VCPU_EXREG_PDPTR) 8743 | (1 << VCPU_EXREG_SEGMENTS) 8744 | (1 << VCPU_EXREG_CR3)); 8745 vcpu->arch.regs_dirty = 0; 8746 8747 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 8748 8749 vmx->loaded_vmcs->launched = 1; 8750 8751 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 8752 8753 /* 8754 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if 8755 * we did not inject a still-pending event to L1 now because of 8756 * nested_run_pending, we need to re-enable this bit. 8757 */ 8758 if (vmx->nested.nested_run_pending) 8759 kvm_make_request(KVM_REQ_EVENT, vcpu); 8760 8761 vmx->nested.nested_run_pending = 0; 8762 8763 vmx_complete_atomic_exit(vmx); 8764 vmx_recover_nmi_blocking(vmx); 8765 vmx_complete_interrupts(vmx); 8766} 8767 8768static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) 8769{ 8770 struct vcpu_vmx *vmx = to_vmx(vcpu); 8771 int cpu; 8772 8773 if (vmx->loaded_vmcs == &vmx->vmcs01) 8774 return; 8775 8776 cpu = get_cpu(); 8777 vmx->loaded_vmcs = &vmx->vmcs01; 8778 vmx_vcpu_put(vcpu); 8779 vmx_vcpu_load(vcpu, cpu); 8780 vcpu->cpu = cpu; 8781 put_cpu(); 8782} 8783 8784static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 8785{ 8786 struct vcpu_vmx *vmx = to_vmx(vcpu); 8787 8788 if (enable_pml) 8789 vmx_destroy_pml_buffer(vmx); 8790 free_vpid(vmx->vpid); 8791 leave_guest_mode(vcpu); 8792 vmx_load_vmcs01(vcpu); 8793 free_nested(vmx); 8794 free_loaded_vmcs(vmx->loaded_vmcs); 8795 kfree(vmx->guest_msrs); 8796 kvm_vcpu_uninit(vcpu); 8797 kmem_cache_free(kvm_vcpu_cache, vmx); 8798} 8799 8800static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 8801{ 8802 int err; 8803 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 8804 int cpu; 8805 8806 if (!vmx) 8807 return ERR_PTR(-ENOMEM); 8808 8809 vmx->vpid = allocate_vpid(); 8810 8811 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 8812 if (err) 8813 goto free_vcpu; 8814 8815 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 8816 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) 8817 > PAGE_SIZE); 8818 8819 err = -ENOMEM; 8820 if (!vmx->guest_msrs) { 8821 goto uninit_vcpu; 8822 } 8823 8824 vmx->loaded_vmcs = &vmx->vmcs01; 8825 vmx->loaded_vmcs->vmcs = alloc_vmcs(); 8826 if (!vmx->loaded_vmcs->vmcs) 8827 goto free_msrs; 8828 if (!vmm_exclusive) 8829 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); 8830 loaded_vmcs_init(vmx->loaded_vmcs); 8831 if (!vmm_exclusive) 8832 kvm_cpu_vmxoff(); 8833 8834 cpu = get_cpu(); 8835 vmx_vcpu_load(&vmx->vcpu, cpu); 8836 vmx->vcpu.cpu = cpu; 8837 err = vmx_vcpu_setup(vmx); 8838 vmx_vcpu_put(&vmx->vcpu); 8839 put_cpu(); 8840 if (err) 8841 goto free_vmcs; 8842 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { 8843 err = alloc_apic_access_page(kvm); 8844 if (err) 8845 goto free_vmcs; 8846 } 8847 8848 if (enable_ept) { 8849 if (!kvm->arch.ept_identity_map_addr) 8850 kvm->arch.ept_identity_map_addr = 8851 
VMX_EPT_IDENTITY_PAGETABLE_ADDR; 8852 err = init_rmode_identity_map(kvm); 8853 if (err) 8854 goto free_vmcs; 8855 } 8856 8857 if (nested) { 8858 nested_vmx_setup_ctls_msrs(vmx); 8859 vmx->nested.vpid02 = allocate_vpid(); 8860 } 8861 8862 vmx->nested.posted_intr_nv = -1; 8863 vmx->nested.current_vmptr = -1ull; 8864 vmx->nested.current_vmcs12 = NULL; 8865 8866 /* 8867 * If PML is turned on, failure on enabling PML just results in failure 8868 * of creating the vcpu, therefore we can simplify PML logic (by 8869 * avoiding dealing with cases, such as enabling PML partially on vcpus 8870 * for the guest, etc. 8871 */ 8872 if (enable_pml) { 8873 err = vmx_create_pml_buffer(vmx); 8874 if (err) 8875 goto free_vmcs; 8876 } 8877 8878 return &vmx->vcpu; 8879 8880free_vmcs: 8881 free_vpid(vmx->nested.vpid02); 8882 free_loaded_vmcs(vmx->loaded_vmcs); 8883free_msrs: 8884 kfree(vmx->guest_msrs); 8885uninit_vcpu: 8886 kvm_vcpu_uninit(&vmx->vcpu); 8887free_vcpu: 8888 free_vpid(vmx->vpid); 8889 kmem_cache_free(kvm_vcpu_cache, vmx); 8890 return ERR_PTR(err); 8891} 8892 8893static void __init vmx_check_processor_compat(void *rtn) 8894{ 8895 struct vmcs_config vmcs_conf; 8896 8897 *(int *)rtn = 0; 8898 if (setup_vmcs_config(&vmcs_conf) < 0) 8899 *(int *)rtn = -EIO; 8900 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 8901 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 8902 smp_processor_id()); 8903 *(int *)rtn = -EIO; 8904 } 8905} 8906 8907static int get_ept_level(void) 8908{ 8909 return VMX_EPT_DEFAULT_GAW + 1; 8910} 8911 8912static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 8913{ 8914 u8 cache; 8915 u64 ipat = 0; 8916 8917 /* For VT-d and EPT combination 8918 * 1. MMIO: always map as UC 8919 * 2. EPT with VT-d: 8920 * a. VT-d without snooping control feature: can't guarantee the 8921 * result, try to trust guest. 8922 * b. VT-d with snooping control feature: snooping control feature of 8923 * VT-d engine can guarantee the cache correctness. Just set it 8924 * to WB to keep consistent with host. So the same as item 3. 8925 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep 8926 * consistent with host MTRR 8927 */ 8928 if (is_mmio) { 8929 cache = MTRR_TYPE_UNCACHABLE; 8930 goto exit; 8931 } 8932 8933 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { 8934 ipat = VMX_EPT_IPAT_BIT; 8935 cache = MTRR_TYPE_WRBACK; 8936 goto exit; 8937 } 8938 8939 if (kvm_read_cr0(vcpu) & X86_CR0_CD) { 8940 ipat = VMX_EPT_IPAT_BIT; 8941 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 8942 cache = MTRR_TYPE_WRBACK; 8943 else 8944 cache = MTRR_TYPE_UNCACHABLE; 8945 goto exit; 8946 } 8947 8948 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); 8949 8950exit: 8951 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; 8952} 8953 8954static int vmx_get_lpage_level(void) 8955{ 8956 if (enable_ept && !cpu_has_vmx_ept_1g_page()) 8957 return PT_DIRECTORY_LEVEL; 8958 else 8959 /* For shadow and EPT supported 1GB page */ 8960 return PT_PDPE_LEVEL; 8961} 8962 8963static void vmcs_set_secondary_exec_control(u32 new_ctl) 8964{ 8965 /* 8966 * These bits in the secondary execution controls field 8967 * are dynamic, the others are mostly based on the hypervisor 8968 * architecture and the guest's CPUID. Do not touch the 8969 * dynamic bits. 
8970 */ 8971 u32 mask = 8972 SECONDARY_EXEC_SHADOW_VMCS | 8973 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 8974 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 8975 8976 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8977 8978 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8979 (new_ctl & ~mask) | (cur_ctl & mask)); 8980} 8981 8982static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 8983{ 8984 struct kvm_cpuid_entry2 *best; 8985 struct vcpu_vmx *vmx = to_vmx(vcpu); 8986 u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); 8987 8988 if (vmx_rdtscp_supported()) { 8989 bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); 8990 if (!rdtscp_enabled) 8991 secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; 8992 8993 if (nested) { 8994 if (rdtscp_enabled) 8995 vmx->nested.nested_vmx_secondary_ctls_high |= 8996 SECONDARY_EXEC_RDTSCP; 8997 else 8998 vmx->nested.nested_vmx_secondary_ctls_high &= 8999 ~SECONDARY_EXEC_RDTSCP; 9000 } 9001 } 9002 9003 /* Exposing INVPCID only when PCID is exposed */ 9004 best = kvm_find_cpuid_entry(vcpu, 0x7, 0); 9005 if (vmx_invpcid_supported() && 9006 (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || 9007 !guest_cpuid_has_pcid(vcpu))) { 9008 secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; 9009 9010 if (best) 9011 best->ebx &= ~bit(X86_FEATURE_INVPCID); 9012 } 9013 9014 if (cpu_has_secondary_exec_ctrls()) 9015 vmcs_set_secondary_exec_control(secondary_exec_ctl); 9016 9017 if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { 9018 if (guest_cpuid_has_pcommit(vcpu)) 9019 vmx->nested.nested_vmx_secondary_ctls_high |= 9020 SECONDARY_EXEC_PCOMMIT; 9021 else 9022 vmx->nested.nested_vmx_secondary_ctls_high &= 9023 ~SECONDARY_EXEC_PCOMMIT; 9024 } 9025} 9026 9027static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 9028{ 9029 if (func == 1 && nested) 9030 entry->ecx |= bit(X86_FEATURE_VMX); 9031} 9032 9033static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 9034 struct x86_exception *fault) 9035{ 9036 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 9037 u32 exit_reason; 9038 9039 if (fault->error_code & PFERR_RSVD_MASK) 9040 exit_reason = EXIT_REASON_EPT_MISCONFIG; 9041 else 9042 exit_reason = EXIT_REASON_EPT_VIOLATION; 9043 nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); 9044 vmcs12->guest_physical_address = fault->address; 9045} 9046 9047/* Callbacks for nested_ept_init_mmu_context: */ 9048 9049static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) 9050{ 9051 /* return the page table to be shadowed - in our case, EPT12 */ 9052 return get_vmcs12(vcpu)->ept_pointer; 9053} 9054 9055static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 9056{ 9057 WARN_ON(mmu_is_nested(vcpu)); 9058 kvm_init_shadow_ept_mmu(vcpu, 9059 to_vmx(vcpu)->nested.nested_vmx_ept_caps & 9060 VMX_EPT_EXECUTE_ONLY_BIT); 9061 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 9062 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 9063 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 9064 9065 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 9066} 9067 9068static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 9069{ 9070 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 9071} 9072 9073static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 9074 u16 error_code) 9075{ 9076 bool inequality, bit; 9077 9078 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 9079 inequality = 9080 (error_code & vmcs12->page_fault_error_code_mask) != 9081 vmcs12->page_fault_error_code_match; 9082 return inequality ^ bit; 9083} 9084 9085static void 
vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 9086 struct x86_exception *fault) 9087{ 9088 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 9089 9090 WARN_ON(!is_guest_mode(vcpu)); 9091 9092 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) 9093 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 9094 vmcs_read32(VM_EXIT_INTR_INFO), 9095 vmcs_readl(EXIT_QUALIFICATION)); 9096 else 9097 kvm_inject_page_fault(vcpu, fault); 9098} 9099 9100static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, 9101 struct vmcs12 *vmcs12) 9102{ 9103 struct vcpu_vmx *vmx = to_vmx(vcpu); 9104 int maxphyaddr = cpuid_maxphyaddr(vcpu); 9105 9106 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 9107 if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || 9108 vmcs12->apic_access_addr >> maxphyaddr) 9109 return false; 9110 9111 /* 9112 * Translate L1 physical address to host physical 9113 * address for vmcs02. Keep the page pinned, so this 9114 * physical address remains valid. We keep a reference 9115 * to it so we can release it later. 9116 */ 9117 if (vmx->nested.apic_access_page) /* shouldn't happen */ 9118 nested_release_page(vmx->nested.apic_access_page); 9119 vmx->nested.apic_access_page = 9120 nested_get_page(vcpu, vmcs12->apic_access_addr); 9121 } 9122 9123 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 9124 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || 9125 vmcs12->virtual_apic_page_addr >> maxphyaddr) 9126 return false; 9127 9128 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 9129 nested_release_page(vmx->nested.virtual_apic_page); 9130 vmx->nested.virtual_apic_page = 9131 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); 9132 9133 /* 9134 * Failing the vm entry is _not_ what the processor does 9135 * but it's basically the only possibility we have. 9136 * We could still enter the guest if CR8 load exits are 9137 * enabled, CR8 store exits are enabled, and virtualize APIC 9138 * access is disabled; in this case the processor would never 9139 * use the TPR shadow and we could simply clear the bit from 9140 * the execution control. But such a configuration is useless, 9141 * so let's keep the code simple. 9142 */ 9143 if (!vmx->nested.virtual_apic_page) 9144 return false; 9145 } 9146 9147 if (nested_cpu_has_posted_intr(vmcs12)) { 9148 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || 9149 vmcs12->posted_intr_desc_addr >> maxphyaddr) 9150 return false; 9151 9152 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 9153 kunmap(vmx->nested.pi_desc_page); 9154 nested_release_page(vmx->nested.pi_desc_page); 9155 } 9156 vmx->nested.pi_desc_page = 9157 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); 9158 if (!vmx->nested.pi_desc_page) 9159 return false; 9160 9161 vmx->nested.pi_desc = 9162 (struct pi_desc *)kmap(vmx->nested.pi_desc_page); 9163 if (!vmx->nested.pi_desc) { 9164 nested_release_page_clean(vmx->nested.pi_desc_page); 9165 return false; 9166 } 9167 vmx->nested.pi_desc = 9168 (struct pi_desc *)((void *)vmx->nested.pi_desc + 9169 (unsigned long)(vmcs12->posted_intr_desc_addr & 9170 (PAGE_SIZE - 1))); 9171 } 9172 9173 return true; 9174} 9175 9176static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 9177{ 9178 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 9179 struct vcpu_vmx *vmx = to_vmx(vcpu); 9180 9181 if (vcpu->arch.virtual_tsc_khz == 0) 9182 return; 9183 9184 /* Make sure short timeouts reliably trigger an immediate vmexit. 9185 * hrtimer_start does not guarantee this. 
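 * As a worked example of the conversion below (hypothetical numbers):
 * the code treats the vmcs12 value as units of
 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE (i.e. 32) TSC cycles, so
 * ns = (value << 5) * 1000000 / virtual_tsc_khz; for value = 1000 on a
 * 2 GHz virtual TSC (virtual_tsc_khz = 2000000) that is
 * 32000 * 1000000 / 2000000 = 16000 ns.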
*/ 9186 if (preemption_timeout <= 1) { 9187 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 9188 return; 9189 } 9190 9191 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 9192 preemption_timeout *= 1000000; 9193 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 9194 hrtimer_start(&vmx->nested.preemption_timer, 9195 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 9196} 9197 9198static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 9199 struct vmcs12 *vmcs12) 9200{ 9201 int maxphyaddr; 9202 u64 addr; 9203 9204 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 9205 return 0; 9206 9207 if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { 9208 WARN_ON(1); 9209 return -EINVAL; 9210 } 9211 maxphyaddr = cpuid_maxphyaddr(vcpu); 9212 9213 if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || 9214 ((addr + PAGE_SIZE) >> maxphyaddr)) 9215 return -EINVAL; 9216 9217 return 0; 9218} 9219 9220/* 9221 * Merge L0's and L1's MSR bitmap, return false to indicate that 9222 * we do not use the hardware. 9223 */ 9224static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, 9225 struct vmcs12 *vmcs12) 9226{ 9227 int msr; 9228 struct page *page; 9229 unsigned long *msr_bitmap; 9230 9231 if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) 9232 return false; 9233 9234 page = nested_get_page(vcpu, vmcs12->msr_bitmap); 9235 if (!page) { 9236 WARN_ON(1); 9237 return false; 9238 } 9239 msr_bitmap = (unsigned long *)kmap(page); 9240 if (!msr_bitmap) { 9241 nested_release_page_clean(page); 9242 WARN_ON(1); 9243 return false; 9244 } 9245 9246 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 9247 if (nested_cpu_has_apic_reg_virt(vmcs12)) 9248 for (msr = 0x800; msr <= 0x8ff; msr++) 9249 nested_vmx_disable_intercept_for_msr( 9250 msr_bitmap, 9251 vmx_msr_bitmap_nested, 9252 msr, MSR_TYPE_R); 9253 /* TPR is allowed */ 9254 nested_vmx_disable_intercept_for_msr(msr_bitmap, 9255 vmx_msr_bitmap_nested, 9256 APIC_BASE_MSR + (APIC_TASKPRI >> 4), 9257 MSR_TYPE_R | MSR_TYPE_W); 9258 if (nested_cpu_has_vid(vmcs12)) { 9259 /* EOI and self-IPI are allowed */ 9260 nested_vmx_disable_intercept_for_msr( 9261 msr_bitmap, 9262 vmx_msr_bitmap_nested, 9263 APIC_BASE_MSR + (APIC_EOI >> 4), 9264 MSR_TYPE_W); 9265 nested_vmx_disable_intercept_for_msr( 9266 msr_bitmap, 9267 vmx_msr_bitmap_nested, 9268 APIC_BASE_MSR + (APIC_SELF_IPI >> 4), 9269 MSR_TYPE_W); 9270 } 9271 } else { 9272 /* 9273 * Enable reading intercept of all the x2apic 9274 * MSRs. We should not rely on vmcs12 to do any 9275 * optimizations here, it may have been modified 9276 * by L1. 
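 * (Background for the 0x800-0x8ff loops here: each x2APIC register at
 * APIC offset "off" is exposed as MSR 0x800 + (off >> 4), which is why
 * APIC_BASE_MSR + (APIC_TASKPRI >> 4) above denotes the TPR MSR 0x808,
 * and similarly for EOI and SELF_IPI.)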
9277 */ 9278 for (msr = 0x800; msr <= 0x8ff; msr++) 9279 __vmx_enable_intercept_for_msr( 9280 vmx_msr_bitmap_nested, 9281 msr, 9282 MSR_TYPE_R); 9283 9284 __vmx_enable_intercept_for_msr( 9285 vmx_msr_bitmap_nested, 9286 APIC_BASE_MSR + (APIC_TASKPRI >> 4), 9287 MSR_TYPE_W); 9288 __vmx_enable_intercept_for_msr( 9289 vmx_msr_bitmap_nested, 9290 APIC_BASE_MSR + (APIC_EOI >> 4), 9291 MSR_TYPE_W); 9292 __vmx_enable_intercept_for_msr( 9293 vmx_msr_bitmap_nested, 9294 APIC_BASE_MSR + (APIC_SELF_IPI >> 4), 9295 MSR_TYPE_W); 9296 } 9297 kunmap(page); 9298 nested_release_page_clean(page); 9299 9300 return true; 9301} 9302 9303static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 9304 struct vmcs12 *vmcs12) 9305{ 9306 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 9307 !nested_cpu_has_apic_reg_virt(vmcs12) && 9308 !nested_cpu_has_vid(vmcs12) && 9309 !nested_cpu_has_posted_intr(vmcs12)) 9310 return 0; 9311 9312 /* 9313 * If virtualize x2apic mode is enabled, 9314 * virtualize apic access must be disabled. 9315 */ 9316 if (nested_cpu_has_virt_x2apic_mode(vmcs12) && 9317 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 9318 return -EINVAL; 9319 9320 /* 9321 * If virtual interrupt delivery is enabled, 9322 * we must exit on external interrupts. 9323 */ 9324 if (nested_cpu_has_vid(vmcs12) && 9325 !nested_exit_on_intr(vcpu)) 9326 return -EINVAL; 9327 9328 /* 9329 * bits 15:8 should be zero in posted_intr_nv, 9330 * the descriptor address has been already checked 9331 * in nested_get_vmcs12_pages. 9332 */ 9333 if (nested_cpu_has_posted_intr(vmcs12) && 9334 (!nested_cpu_has_vid(vmcs12) || 9335 !nested_exit_intr_ack_set(vcpu) || 9336 vmcs12->posted_intr_nv & 0xff00)) 9337 return -EINVAL; 9338 9339 /* tpr shadow is needed by all apicv features. 
*/ 9340 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 9341 return -EINVAL; 9342 9343 return 0; 9344} 9345 9346static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 9347 unsigned long count_field, 9348 unsigned long addr_field) 9349{ 9350 int maxphyaddr; 9351 u64 count, addr; 9352 9353 if (vmcs12_read_any(vcpu, count_field, &count) || 9354 vmcs12_read_any(vcpu, addr_field, &addr)) { 9355 WARN_ON(1); 9356 return -EINVAL; 9357 } 9358 if (count == 0) 9359 return 0; 9360 maxphyaddr = cpuid_maxphyaddr(vcpu); 9361 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || 9362 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { 9363 pr_warn_ratelimited( 9364 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", 9365 addr_field, maxphyaddr, count, addr); 9366 return -EINVAL; 9367 } 9368 return 0; 9369} 9370 9371static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, 9372 struct vmcs12 *vmcs12) 9373{ 9374 if (vmcs12->vm_exit_msr_load_count == 0 && 9375 vmcs12->vm_exit_msr_store_count == 0 && 9376 vmcs12->vm_entry_msr_load_count == 0) 9377 return 0; /* Fast path */ 9378 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, 9379 VM_EXIT_MSR_LOAD_ADDR) || 9380 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, 9381 VM_EXIT_MSR_STORE_ADDR) || 9382 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, 9383 VM_ENTRY_MSR_LOAD_ADDR)) 9384 return -EINVAL; 9385 return 0; 9386} 9387 9388static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 9389 struct vmx_msr_entry *e) 9390{ 9391 /* x2APIC MSR accesses are not allowed */ 9392 if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) 9393 return -EINVAL; 9394 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ 9395 e->index == MSR_IA32_UCODE_REV) 9396 return -EINVAL; 9397 if (e->reserved != 0) 9398 return -EINVAL; 9399 return 0; 9400} 9401 9402static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 9403 struct vmx_msr_entry *e) 9404{ 9405 if (e->index == MSR_FS_BASE || 9406 e->index == MSR_GS_BASE || 9407 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ 9408 nested_vmx_msr_check_common(vcpu, e)) 9409 return -EINVAL; 9410 return 0; 9411} 9412 9413static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 9414 struct vmx_msr_entry *e) 9415{ 9416 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ 9417 nested_vmx_msr_check_common(vcpu, e)) 9418 return -EINVAL; 9419 return 0; 9420} 9421 9422/* 9423 * Load guest's/host's msr at nested entry/exit. 9424 * return 0 for success, entry index for failure. 
9425 */ 9426static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 9427{ 9428 u32 i; 9429 struct vmx_msr_entry e; 9430 struct msr_data msr; 9431 9432 msr.host_initiated = false; 9433 for (i = 0; i < count; i++) { 9434 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 9435 &e, sizeof(e))) { 9436 pr_warn_ratelimited( 9437 "%s cannot read MSR entry (%u, 0x%08llx)\n", 9438 __func__, i, gpa + i * sizeof(e)); 9439 goto fail; 9440 } 9441 if (nested_vmx_load_msr_check(vcpu, &e)) { 9442 pr_warn_ratelimited( 9443 "%s check failed (%u, 0x%x, 0x%x)\n", 9444 __func__, i, e.index, e.reserved); 9445 goto fail; 9446 } 9447 msr.index = e.index; 9448 msr.data = e.value; 9449 if (kvm_set_msr(vcpu, &msr)) { 9450 pr_warn_ratelimited( 9451 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 9452 __func__, i, e.index, e.value); 9453 goto fail; 9454 } 9455 } 9456 return 0; 9457fail: 9458 return i + 1; 9459} 9460 9461static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 9462{ 9463 u32 i; 9464 struct vmx_msr_entry e; 9465 9466 for (i = 0; i < count; i++) { 9467 struct msr_data msr_info; 9468 if (kvm_vcpu_read_guest(vcpu, 9469 gpa + i * sizeof(e), 9470 &e, 2 * sizeof(u32))) { 9471 pr_warn_ratelimited( 9472 "%s cannot read MSR entry (%u, 0x%08llx)\n", 9473 __func__, i, gpa + i * sizeof(e)); 9474 return -EINVAL; 9475 } 9476 if (nested_vmx_store_msr_check(vcpu, &e)) { 9477 pr_warn_ratelimited( 9478 "%s check failed (%u, 0x%x, 0x%x)\n", 9479 __func__, i, e.index, e.reserved); 9480 return -EINVAL; 9481 } 9482 msr_info.host_initiated = false; 9483 msr_info.index = e.index; 9484 if (kvm_get_msr(vcpu, &msr_info)) { 9485 pr_warn_ratelimited( 9486 "%s cannot read MSR (%u, 0x%x)\n", 9487 __func__, i, e.index); 9488 return -EINVAL; 9489 } 9490 if (kvm_vcpu_write_guest(vcpu, 9491 gpa + i * sizeof(e) + 9492 offsetof(struct vmx_msr_entry, value), 9493 &msr_info.data, sizeof(msr_info.data))) { 9494 pr_warn_ratelimited( 9495 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 9496 __func__, i, e.index, msr_info.data); 9497 return -EINVAL; 9498 } 9499 } 9500 return 0; 9501} 9502 9503/* 9504 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 9505 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 9506 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 9507 * guest in a way that will both be appropriate to L1's requests, and our 9508 * needs. In addition to modifying the active vmcs (which is vmcs02), this 9509 * function also has additional necessary side-effects, like setting various 9510 * vcpu->arch fields. 
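 * In rough order, the function below copies the vmcs12 guest segment and
 * descriptor-table state, the debug and entry event-injection fields, then
 * merges the pin-based, secondary and primary execution controls with L0's
 * own requirements, sets up the exit/entry controls, EFER, CR0/CR4/CR3 and
 * the nested MMU context, and finally loads the guest RSP and RIP from
 * vmcs12.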
9511 */ 9512static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 9513{ 9514 struct vcpu_vmx *vmx = to_vmx(vcpu); 9515 u32 exec_control; 9516 9517 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 9518 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 9519 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 9520 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 9521 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 9522 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 9523 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 9524 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 9525 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 9526 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 9527 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 9528 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 9529 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 9530 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 9531 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 9532 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 9533 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 9534 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 9535 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 9536 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 9537 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 9538 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 9539 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 9540 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 9541 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 9542 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 9543 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 9544 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 9545 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 9546 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 9547 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 9548 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 9549 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 9550 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 9551 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 9552 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 9553 9554 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 9555 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 9556 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 9557 } else { 9558 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 9559 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 9560 } 9561 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 9562 vmcs12->vm_entry_intr_info_field); 9563 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 9564 vmcs12->vm_entry_exception_error_code); 9565 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 9566 vmcs12->vm_entry_instruction_len); 9567 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 9568 vmcs12->guest_interruptibility_info); 9569 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 9570 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 9571 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 9572 vmcs12->guest_pending_dbg_exceptions); 9573 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 9574 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 9575 9576 if (nested_cpu_has_xsaves(vmcs12)) 9577 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 9578 vmcs_write64(VMCS_LINK_POINTER, -1ull); 9579 9580 
exec_control = vmcs12->pin_based_vm_exec_control; 9581 exec_control |= vmcs_config.pin_based_exec_ctrl; 9582 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 9583 9584 if (nested_cpu_has_posted_intr(vmcs12)) { 9585 /* 9586 * Note that we use L0's vector here and in 9587 * vmx_deliver_nested_posted_interrupt. 9588 */ 9589 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 9590 vmx->nested.pi_pending = false; 9591 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 9592 vmcs_write64(POSTED_INTR_DESC_ADDR, 9593 page_to_phys(vmx->nested.pi_desc_page) + 9594 (unsigned long)(vmcs12->posted_intr_desc_addr & 9595 (PAGE_SIZE - 1))); 9596 } else 9597 exec_control &= ~PIN_BASED_POSTED_INTR; 9598 9599 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 9600 9601 vmx->nested.preemption_timer_expired = false; 9602 if (nested_cpu_has_preemption_timer(vmcs12)) 9603 vmx_start_preemption_timer(vcpu); 9604 9605 /* 9606 * Whether page-faults are trapped is determined by a combination of 9607 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 9608 * If enable_ept, L0 doesn't care about page faults and we should 9609 * set all of these to L1's desires. However, if !enable_ept, L0 does 9610 * care about (at least some) page faults, and because it is not easy 9611 * (if at all possible?) to merge L0 and L1's desires, we simply ask 9612 * to exit on each and every L2 page fault. This is done by setting 9613 * MASK=MATCH=0 and (see below) EB.PF=1. 9614 * Note that below we don't need special code to set EB.PF beyond the 9615 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 9616 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 9617 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 9618 * 9619 * A problem with this approach (when !enable_ept) is that L1 may be 9620 * injected with more page faults than it asked for. This could have 9621 * caused problems, but in practice existing hypervisors don't care. 9622 * To fix this, we will need to emulate the PFEC checking (on the L1 9623 * page tables), using walk_addr(), when injecting PFs to L1. 9624 */ 9625 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 9626 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 9627 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 9628 enable_ept ? vmcs12->page_fault_error_code_match : 0); 9629 9630 if (cpu_has_secondary_exec_ctrls()) { 9631 exec_control = vmx_secondary_exec_control(vmx); 9632 9633 /* Take the following fields only from vmcs12 */ 9634 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 9635 SECONDARY_EXEC_RDTSCP | 9636 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 9637 SECONDARY_EXEC_APIC_REGISTER_VIRT | 9638 SECONDARY_EXEC_PCOMMIT); 9639 if (nested_cpu_has(vmcs12, 9640 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 9641 exec_control |= vmcs12->secondary_vm_exec_control; 9642 9643 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { 9644 /* 9645 * If translation failed, no matter: This feature asks 9646 * to exit when accessing the given address, and if it 9647 * can never be accessed, this feature won't do 9648 * anything anyway. 
9649 */ 9650 if (!vmx->nested.apic_access_page) 9651 exec_control &= 9652 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9653 else 9654 vmcs_write64(APIC_ACCESS_ADDR, 9655 page_to_phys(vmx->nested.apic_access_page)); 9656 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && 9657 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { 9658 exec_control |= 9659 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9660 kvm_vcpu_reload_apic_access_page(vcpu); 9661 } 9662 9663 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 9664 vmcs_write64(EOI_EXIT_BITMAP0, 9665 vmcs12->eoi_exit_bitmap0); 9666 vmcs_write64(EOI_EXIT_BITMAP1, 9667 vmcs12->eoi_exit_bitmap1); 9668 vmcs_write64(EOI_EXIT_BITMAP2, 9669 vmcs12->eoi_exit_bitmap2); 9670 vmcs_write64(EOI_EXIT_BITMAP3, 9671 vmcs12->eoi_exit_bitmap3); 9672 vmcs_write16(GUEST_INTR_STATUS, 9673 vmcs12->guest_intr_status); 9674 } 9675 9676 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 9677 } 9678 9679 9680 /* 9681 * Set host-state according to L0's settings (vmcs12 is irrelevant here) 9682 * Some constant fields are set here by vmx_set_constant_host_state(). 9683 * Other fields are different per CPU, and will be set later when 9684 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 9685 */ 9686 vmx_set_constant_host_state(vmx); 9687 9688 /* 9689 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before 9690 * entry, but only if the current (host) sp changed from the value 9691 * we wrote last (vmx->host_rsp). This cache is no longer relevant 9692 * if we switch vmcs, and rather than hold a separate cache per vmcs, 9693 * here we just force the write to happen on entry. 9694 */ 9695 vmx->host_rsp = 0; 9696 9697 exec_control = vmx_exec_control(vmx); /* L0's desires */ 9698 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 9699 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 9700 exec_control &= ~CPU_BASED_TPR_SHADOW; 9701 exec_control |= vmcs12->cpu_based_vm_exec_control; 9702 9703 if (exec_control & CPU_BASED_TPR_SHADOW) { 9704 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 9705 page_to_phys(vmx->nested.virtual_apic_page)); 9706 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 9707 } 9708 9709 if (cpu_has_vmx_msr_bitmap() && 9710 exec_control & CPU_BASED_USE_MSR_BITMAPS) { 9711 nested_vmx_merge_msr_bitmap(vcpu, vmcs12); 9712 /* MSR_BITMAP will be set by following vmx_set_efer. */ 9713 } else 9714 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 9715 9716 /* 9717 * Merging of IO bitmap not currently supported. 9718 * Rather, exit every time. 9719 */ 9720 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 9721 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 9722 9723 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 9724 9725 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 9726 * bitwise-or of what L1 wants to trap for L2, and what we want to 9727 * trap. Note that CR0.TS also needs updating - we do this later. 9728 */ 9729 update_exception_bitmap(vcpu); 9730 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 9731 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 9732 9733 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so 9734 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 9735 * bits are further modified by vmx_set_efer() below. 9736 */ 9737 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 9738 9739 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are 9740 * emulated by vmx_set_efer(), below. 
*/ 9742 vm_entry_controls_init(vmx, 9743 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & 9744 ~VM_ENTRY_IA32E_MODE) | 9745 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 9746 9747 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { 9748 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 9749 vcpu->arch.pat = vmcs12->guest_ia32_pat; 9750 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 9751 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 9752 9753 9754 set_cr4_guest_host_mask(vmx); 9755 9756 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) 9757 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 9758 9759 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 9760 vmcs_write64(TSC_OFFSET, 9761 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); 9762 else 9763 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 9764 9765 if (enable_vpid) { 9766 /* 9767 * There is no direct mapping between vpid02 and vpid12; the 9768 * vpid02 is per-vCPU for L0 and reused while the value of 9769 * vpid12 is changed w/ one invvpid during nested vmentry. 9770 * The vpid12 is allocated by L1 for L2, so it will not 9771 * influence global bitmap (for vpid01 and vpid02 allocation) 9772 * even if we spawn a lot of nested vCPUs. 9773 */ 9774 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { 9775 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 9776 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 9777 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 9778 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); 9779 } 9780 } else { 9781 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 9782 vmx_flush_tlb(vcpu); 9783 } 9784 9785 } 9786 9787 if (nested_cpu_has_ept(vmcs12)) { 9788 kvm_mmu_unload(vcpu); 9789 nested_ept_init_mmu_context(vcpu); 9790 } 9791 9792 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 9793 vcpu->arch.efer = vmcs12->guest_ia32_efer; 9794 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 9795 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 9796 else 9797 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 9798 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 9799 vmx_set_efer(vcpu, vcpu->arch.efer); 9800 9801 /* 9802 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified 9803 * TS bit (for lazy fpu) and bits which we consider mandatory enabled. 9804 * The CR0_READ_SHADOW is what L2 should have expected to read given 9805 * the specifications by L1; it's not enough to take 9806 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we 9807 * have more bits than L1 expected. 
9808 */ 9809 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 9810 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 9811 9812 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 9813 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 9814 9815 /* shadow page tables on either EPT or shadow page tables */ 9816 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 9817 kvm_mmu_reset_context(vcpu); 9818 9819 if (!enable_ept) 9820 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 9821 9822 /* 9823 * L1 may access the L2's PDPTR, so save them to construct vmcs12 9824 */ 9825 if (enable_ept) { 9826 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 9827 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 9828 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 9829 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 9830 } 9831 9832 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 9833 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 9834} 9835 9836/* 9837 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 9838 * for running an L2 nested guest. 9839 */ 9840static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 9841{ 9842 struct vmcs12 *vmcs12; 9843 struct vcpu_vmx *vmx = to_vmx(vcpu); 9844 int cpu; 9845 struct loaded_vmcs *vmcs02; 9846 bool ia32e; 9847 u32 msr_entry_idx; 9848 9849 if (!nested_vmx_check_permission(vcpu) || 9850 !nested_vmx_check_vmcs12(vcpu)) 9851 return 1; 9852 9853 skip_emulated_instruction(vcpu); 9854 vmcs12 = get_vmcs12(vcpu); 9855 9856 if (enable_shadow_vmcs) 9857 copy_shadow_to_vmcs12(vmx); 9858 9859 /* 9860 * The nested entry process starts with enforcing various prerequisites 9861 * on vmcs12 as required by the Intel SDM, and act appropriately when 9862 * they fail: As the SDM explains, some conditions should cause the 9863 * instruction to fail, while others will cause the instruction to seem 9864 * to succeed, but return an EXIT_REASON_INVALID_STATE. 9865 * To speed up the normal (success) code path, we should avoid checking 9866 * for misconfigurations which will anyway be caught by the processor 9867 * when using the merged vmcs02. 9868 */ 9869 if (vmcs12->launch_state == launch) { 9870 nested_vmx_failValid(vcpu, 9871 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 9872 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 9873 return 1; 9874 } 9875 9876 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 9877 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { 9878 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9879 return 1; 9880 } 9881 9882 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9883 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9884 return 1; 9885 } 9886 9887 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { 9888 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9889 return 1; 9890 } 9891 9892 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { 9893 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9894 return 1; 9895 } 9896 9897 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { 9898 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9899 return 1; 9900 } 9901 9902 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 9903 vmx->nested.nested_vmx_true_procbased_ctls_low, 9904 vmx->nested.nested_vmx_procbased_ctls_high) || 9905 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 9906 vmx->nested.nested_vmx_secondary_ctls_low, 9907 vmx->nested.nested_vmx_secondary_ctls_high) || 9908 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 9909 vmx->nested.nested_vmx_pinbased_ctls_low, 9910 vmx->nested.nested_vmx_pinbased_ctls_high) || 9911 !vmx_control_verify(vmcs12->vm_exit_controls, 9912 vmx->nested.nested_vmx_true_exit_ctls_low, 9913 vmx->nested.nested_vmx_exit_ctls_high) || 9914 !vmx_control_verify(vmcs12->vm_entry_controls, 9915 vmx->nested.nested_vmx_true_entry_ctls_low, 9916 vmx->nested.nested_vmx_entry_ctls_high)) 9917 { 9918 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9919 return 1; 9920 } 9921 9922 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || 9923 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9924 nested_vmx_failValid(vcpu, 9925 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 9926 return 1; 9927 } 9928 9929 if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || 9930 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9931 nested_vmx_entry_failure(vcpu, vmcs12, 9932 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9933 return 1; 9934 } 9935 if (vmcs12->vmcs_link_pointer != -1ull) { 9936 nested_vmx_entry_failure(vcpu, vmcs12, 9937 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); 9938 return 1; 9939 } 9940 9941 /* 9942 * If the load IA32_EFER VM-entry control is 1, the following checks 9943 * are performed on the field for the IA32_EFER MSR: 9944 * - Bits reserved in the IA32_EFER MSR must be 0. 9945 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 9946 * the IA-32e mode guest VM-exit control. It must also be identical 9947 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 9948 * CR0.PG) is 1. 9949 */ 9950 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { 9951 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 9952 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 9953 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 9954 ((vmcs12->guest_cr0 & X86_CR0_PG) && 9955 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { 9956 nested_vmx_entry_failure(vcpu, vmcs12, 9957 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9958 return 1; 9959 } 9960 } 9961 9962 /* 9963 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 9964 * IA32_EFER MSR must be 0 in the field for that register. 
In addition, 9965 * the values of the LMA and LME bits in the field must each be that of 9966 * the host address-space size VM-exit control. 9967 */ 9968 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 9969 ia32e = (vmcs12->vm_exit_controls & 9970 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 9971 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 9972 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 9973 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { 9974 nested_vmx_entry_failure(vcpu, vmcs12, 9975 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9976 return 1; 9977 } 9978 } 9979 9980 /* 9981 * We're finally done with prerequisite checking, and can start with 9982 * the nested entry. 9983 */ 9984 9985 vmcs02 = nested_get_current_vmcs02(vmx); 9986 if (!vmcs02) 9987 return -ENOMEM; 9988 9989 enter_guest_mode(vcpu); 9990 9991 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); 9992 9993 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 9994 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 9995 9996 cpu = get_cpu(); 9997 vmx->loaded_vmcs = vmcs02; 9998 vmx_vcpu_put(vcpu); 9999 vmx_vcpu_load(vcpu, cpu); 10000 vcpu->cpu = cpu; 10001 put_cpu(); 10002 10003 vmx_segment_cache_clear(vmx); 10004 10005 prepare_vmcs02(vcpu, vmcs12); 10006 10007 msr_entry_idx = nested_vmx_load_msr(vcpu, 10008 vmcs12->vm_entry_msr_load_addr, 10009 vmcs12->vm_entry_msr_load_count); 10010 if (msr_entry_idx) { 10011 leave_guest_mode(vcpu); 10012 vmx_load_vmcs01(vcpu); 10013 nested_vmx_entry_failure(vcpu, vmcs12, 10014 EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); 10015 return 1; 10016 } 10017 10018 vmcs12->launch_state = 1; 10019 10020 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) 10021 return kvm_vcpu_halt(vcpu); 10022 10023 vmx->nested.nested_run_pending = 1; 10024 10025 /* 10026 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 10027 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 10028 * returned as far as L1 is concerned. It will only return (and set 10029 * the success flag) when L2 exits (see nested_vmx_vmexit()). 10030 */ 10031 return 1; 10032} 10033 10034/* 10035 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 10036 * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). 10037 * This function returns the new value we should put in vmcs12.guest_cr0. 10038 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 10039 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 10040 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 10041 * didn't trap the bit, because if L1 did, so would L0). 10042 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 10043 * been modified by L2, and L1 knows it. So just leave the old value of 10044 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 10045 * isn't relevant, because if L0 traps this bit it can set it to anything. 10046 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 10047 * changed these bits, and therefore they need to be updated, but L0 10048 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 10049 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 
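 * As a concrete (hypothetical) example, suppose L1's cr0_guest_host_mask
 * traps CR0.CD but not CR0.TS, while L0 traps CR0.CD and CR0.WP but leaves
 * CR0.TS guest-owned. Then TS is taken from vmcs02 GUEST_CR0 (case 1), CD
 * keeps whatever vmcs12->guest_cr0 already holds (case 2), and WP is taken
 * from vmcs02 CR0_READ_SHADOW, where L0 keeps the value L2 believes it wrote
 * (case 3).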
10050 */ 10051static inline unsigned long 10052vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 10053{ 10054 return 10055 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 10056 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 10057 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 10058 vcpu->arch.cr0_guest_owned_bits)); 10059} 10060 10061static inline unsigned long 10062vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 10063{ 10064 return 10065 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 10066 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 10067 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 10068 vcpu->arch.cr4_guest_owned_bits)); 10069} 10070 10071static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 10072 struct vmcs12 *vmcs12) 10073{ 10074 u32 idt_vectoring; 10075 unsigned int nr; 10076 10077 if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { 10078 nr = vcpu->arch.exception.nr; 10079 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 10080 10081 if (kvm_exception_is_soft(nr)) { 10082 vmcs12->vm_exit_instruction_len = 10083 vcpu->arch.event_exit_inst_len; 10084 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 10085 } else 10086 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 10087 10088 if (vcpu->arch.exception.has_error_code) { 10089 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 10090 vmcs12->idt_vectoring_error_code = 10091 vcpu->arch.exception.error_code; 10092 } 10093 10094 vmcs12->idt_vectoring_info_field = idt_vectoring; 10095 } else if (vcpu->arch.nmi_injected) { 10096 vmcs12->idt_vectoring_info_field = 10097 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 10098 } else if (vcpu->arch.interrupt.pending) { 10099 nr = vcpu->arch.interrupt.nr; 10100 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 10101 10102 if (vcpu->arch.interrupt.soft) { 10103 idt_vectoring |= INTR_TYPE_SOFT_INTR; 10104 vmcs12->vm_entry_instruction_len = 10105 vcpu->arch.event_exit_inst_len; 10106 } else 10107 idt_vectoring |= INTR_TYPE_EXT_INTR; 10108 10109 vmcs12->idt_vectoring_info_field = idt_vectoring; 10110 } 10111} 10112 10113static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 10114{ 10115 struct vcpu_vmx *vmx = to_vmx(vcpu); 10116 10117 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 10118 vmx->nested.preemption_timer_expired) { 10119 if (vmx->nested.nested_run_pending) 10120 return -EBUSY; 10121 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 10122 return 0; 10123 } 10124 10125 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 10126 if (vmx->nested.nested_run_pending || 10127 vcpu->arch.interrupt.pending) 10128 return -EBUSY; 10129 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 10130 NMI_VECTOR | INTR_TYPE_NMI_INTR | 10131 INTR_INFO_VALID_MASK, 0); 10132 /* 10133 * The NMI-triggered VM exit counts as injection: 10134 * clear this one and block further NMIs. 
10135 */ 10136 vcpu->arch.nmi_pending = 0; 10137 vmx_set_nmi_mask(vcpu, true); 10138 return 0; 10139 } 10140 10141 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 10142 nested_exit_on_intr(vcpu)) { 10143 if (vmx->nested.nested_run_pending) 10144 return -EBUSY; 10145 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 10146 return 0; 10147 } 10148 10149 return vmx_complete_nested_posted_interrupt(vcpu); 10150} 10151 10152static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 10153{ 10154 ktime_t remaining = 10155 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 10156 u64 value; 10157 10158 if (ktime_to_ns(remaining) <= 0) 10159 return 0; 10160 10161 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 10162 do_div(value, 1000000); 10163 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 10164} 10165 10166/* 10167 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 10168 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 10169 * and this function updates it to reflect the changes to the guest state while 10170 * L2 was running (and perhaps made some exits which were handled directly by L0 10171 * without going back to L1), and to reflect the exit reason. 10172 * Note that we do not have to copy here all VMCS fields, just those that 10173 * could have changed by the L2 guest or the exit - i.e., the guest-state and 10174 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 10175 * which already writes to vmcs12 directly. 10176 */ 10177static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 10178 u32 exit_reason, u32 exit_intr_info, 10179 unsigned long exit_qualification) 10180{ 10181 /* update guest state fields: */ 10182 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 10183 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 10184 10185 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 10186 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); 10187 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 10188 10189 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 10190 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 10191 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 10192 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 10193 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 10194 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 10195 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 10196 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 10197 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 10198 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 10199 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 10200 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 10201 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 10202 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 10203 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 10204 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 10205 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 10206 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 10207 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 10208 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 10209 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 10210 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 10211 vmcs12->guest_fs_ar_bytes = 
vmcs_read32(GUEST_FS_AR_BYTES); 10212 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 10213 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 10214 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 10215 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 10216 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 10217 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 10218 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 10219 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 10220 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 10221 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 10222 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 10223 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 10224 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 10225 10226 vmcs12->guest_interruptibility_info = 10227 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 10228 vmcs12->guest_pending_dbg_exceptions = 10229 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 10230 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 10231 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 10232 else 10233 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 10234 10235 if (nested_cpu_has_preemption_timer(vmcs12)) { 10236 if (vmcs12->vm_exit_controls & 10237 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 10238 vmcs12->vmx_preemption_timer_value = 10239 vmx_get_preemption_timer_value(vcpu); 10240 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 10241 } 10242 10243 /* 10244 * In some cases (usually, nested EPT), L2 is allowed to change its 10245 * own CR3 without exiting. If it has changed it, we must keep it. 10246 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 10247 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 10248 * 10249 * Additionally, restore L2's PDPTR to vmcs12. 
10250 */ 10251 if (enable_ept) { 10252 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 10253 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 10254 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 10255 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 10256 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 10257 } 10258 10259 if (nested_cpu_has_vid(vmcs12)) 10260 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 10261 10262 vmcs12->vm_entry_controls = 10263 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 10264 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 10265 10266 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { 10267 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 10268 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 10269 } 10270 10271 /* TODO: These cannot have changed unless we have MSR bitmaps and 10272 * the relevant bit asks not to trap the change */ 10273 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 10274 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); 10275 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 10276 vmcs12->guest_ia32_efer = vcpu->arch.efer; 10277 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 10278 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 10279 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 10280 if (vmx_mpx_supported()) 10281 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 10282 if (nested_cpu_has_xsaves(vmcs12)) 10283 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); 10284 10285 /* update exit information fields: */ 10286 10287 vmcs12->vm_exit_reason = exit_reason; 10288 vmcs12->exit_qualification = exit_qualification; 10289 10290 vmcs12->vm_exit_intr_info = exit_intr_info; 10291 if ((vmcs12->vm_exit_intr_info & 10292 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == 10293 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) 10294 vmcs12->vm_exit_intr_error_code = 10295 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 10296 vmcs12->idt_vectoring_info_field = 0; 10297 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 10298 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 10299 10300 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 10301 /* vm_entry_intr_info_field is cleared on exit. Emulate this 10302 * instead of reading the real value. */ 10303 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 10304 10305 /* 10306 * Transfer the event that L0 or L1 may have wanted to inject into 10307 * L2 to IDT_VECTORING_INFO_FIELD. 10308 */ 10309 vmcs12_save_pending_event(vcpu, vmcs12); 10310 } 10311 10312 /* 10313 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 10314 * preserved above and would only end up incorrectly in L1. 10315 */ 10316 vcpu->arch.nmi_injected = false; 10317 kvm_clear_exception_queue(vcpu); 10318 kvm_clear_interrupt_queue(vcpu); 10319} 10320 10321/* 10322 * A part of what we need to do when the nested L2 guest exits and we want to 10323 * run its L1 parent is to reset L1's guest state to the host state specified 10324 * in vmcs12. 10325 * This function is to be called not only on normal nested exit, but also on 10326 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 10327 * Failures During or After Loading Guest State"). 10328 * This function should be called when the active VMCS is L1's (vmcs01). 
10329 */ 10330static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 10331 struct vmcs12 *vmcs12) 10332{ 10333 struct kvm_segment seg; 10334 10335 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 10336 vcpu->arch.efer = vmcs12->host_ia32_efer; 10337 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 10338 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 10339 else 10340 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 10341 vmx_set_efer(vcpu, vcpu->arch.efer); 10342 10343 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 10344 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 10345 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 10346 /* 10347 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 10348 * actually changed, because it depends on the current state of 10349 * fpu_active (which may have changed). 10350 * Note that vmx_set_cr0 refers to efer set above. 10351 */ 10352 vmx_set_cr0(vcpu, vmcs12->host_cr0); 10353 /* 10354 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need 10355 * to apply the same changes to L1's vmcs. We just set cr0 correctly, 10356 * but we also need to update cr0_guest_host_mask and exception_bitmap. 10357 */ 10358 update_exception_bitmap(vcpu); 10359 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); 10360 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 10361 10362 /* 10363 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 10364 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); 10365 */ 10366 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 10367 kvm_set_cr4(vcpu, vmcs12->host_cr4); 10368 10369 nested_ept_uninit_mmu_context(vcpu); 10370 10371 kvm_set_cr3(vcpu, vmcs12->host_cr3); 10372 kvm_mmu_reset_context(vcpu); 10373 10374 if (!enable_ept) 10375 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 10376 10377 if (enable_vpid) { 10378 /* 10379 * Trivially support vpid by letting L2s share their parent 10380 * L1's vpid. TODO: move to a more elaborate solution, giving 10381 * each L2 its own vpid and exposing the vpid feature to L1. 10382 */ 10383 vmx_flush_tlb(vcpu); 10384 } 10385 10386 10387 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 10388 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 10389 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 10390 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 10391 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 10392 10393 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
*/ 10394 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 10395 vmcs_write64(GUEST_BNDCFGS, 0); 10396 10397 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 10398 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 10399 vcpu->arch.pat = vmcs12->host_ia32_pat; 10400 } 10401 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 10402 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 10403 vmcs12->host_ia32_perf_global_ctrl); 10404 10405 /* Set L1 segment info according to Intel SDM 10406 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 10407 seg = (struct kvm_segment) { 10408 .base = 0, 10409 .limit = 0xFFFFFFFF, 10410 .selector = vmcs12->host_cs_selector, 10411 .type = 11, 10412 .present = 1, 10413 .s = 1, 10414 .g = 1 10415 }; 10416 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 10417 seg.l = 1; 10418 else 10419 seg.db = 1; 10420 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 10421 seg = (struct kvm_segment) { 10422 .base = 0, 10423 .limit = 0xFFFFFFFF, 10424 .type = 3, 10425 .present = 1, 10426 .s = 1, 10427 .db = 1, 10428 .g = 1 10429 }; 10430 seg.selector = vmcs12->host_ds_selector; 10431 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 10432 seg.selector = vmcs12->host_es_selector; 10433 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 10434 seg.selector = vmcs12->host_ss_selector; 10435 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 10436 seg.selector = vmcs12->host_fs_selector; 10437 seg.base = vmcs12->host_fs_base; 10438 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 10439 seg.selector = vmcs12->host_gs_selector; 10440 seg.base = vmcs12->host_gs_base; 10441 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 10442 seg = (struct kvm_segment) { 10443 .base = vmcs12->host_tr_base, 10444 .limit = 0x67, 10445 .selector = vmcs12->host_tr_selector, 10446 .type = 11, 10447 .present = 1 10448 }; 10449 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 10450 10451 kvm_set_dr(vcpu, 7, 0x400); 10452 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 10453 10454 if (cpu_has_vmx_msr_bitmap()) 10455 vmx_set_msr_bitmap(vcpu); 10456 10457 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 10458 vmcs12->vm_exit_msr_load_count)) 10459 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 10460} 10461 10462/* 10463 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 10464 * and modify vmcs12 to make it see what it would expect to see there if 10465 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 10466 */ 10467static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 10468 u32 exit_intr_info, 10469 unsigned long exit_qualification) 10470{ 10471 struct vcpu_vmx *vmx = to_vmx(vcpu); 10472 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 10473 10474 /* trying to cancel vmlaunch/vmresume is a bug */ 10475 WARN_ON_ONCE(vmx->nested.nested_run_pending); 10476 10477 leave_guest_mode(vcpu); 10478 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 10479 exit_qualification); 10480 10481 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, 10482 vmcs12->vm_exit_msr_store_count)) 10483 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); 10484 10485 vmx_load_vmcs01(vcpu); 10486 10487 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 10488 && nested_exit_intr_ack_set(vcpu)) { 10489 int irq = kvm_cpu_get_interrupt(vcpu); 10490 WARN_ON(irq < 0); 10491 vmcs12->vm_exit_intr_info = irq | 10492 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 10493 } 10494 10495 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 10496 vmcs12->exit_qualification, 10497 vmcs12->idt_vectoring_info_field, 10498 vmcs12->vm_exit_intr_info, 10499 vmcs12->vm_exit_intr_error_code, 10500 KVM_ISA_VMX); 10501 10502 vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); 10503 vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); 10504 vmx_segment_cache_clear(vmx); 10505 10506 /* if no vmcs02 cache requested, remove the one we used */ 10507 if (VMCS02_POOL_SIZE == 0) 10508 nested_free_vmcs02(vmx, vmx->nested.current_vmptr); 10509 10510 load_vmcs12_host_state(vcpu, vmcs12); 10511 10512 /* Update TSC_OFFSET if TSC was changed while L2 ran */ 10513 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 10514 10515 /* This is needed for same reason as it was needed in prepare_vmcs02 */ 10516 vmx->host_rsp = 0; 10517 10518 /* Unpin physical memory we referred to in vmcs02 */ 10519 if (vmx->nested.apic_access_page) { 10520 nested_release_page(vmx->nested.apic_access_page); 10521 vmx->nested.apic_access_page = NULL; 10522 } 10523 if (vmx->nested.virtual_apic_page) { 10524 nested_release_page(vmx->nested.virtual_apic_page); 10525 vmx->nested.virtual_apic_page = NULL; 10526 } 10527 if (vmx->nested.pi_desc_page) { 10528 kunmap(vmx->nested.pi_desc_page); 10529 nested_release_page(vmx->nested.pi_desc_page); 10530 vmx->nested.pi_desc_page = NULL; 10531 vmx->nested.pi_desc = NULL; 10532 } 10533 10534 /* 10535 * We are now running in L2, mmu_notifier will force to reload the 10536 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 10537 */ 10538 kvm_vcpu_reload_apic_access_page(vcpu); 10539 10540 /* 10541 * Exiting from L2 to L1, we're now back to L1 which thinks it just 10542 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the 10543 * success or failure flag accordingly. 10544 */ 10545 if (unlikely(vmx->fail)) { 10546 vmx->fail = 0; 10547 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); 10548 } else 10549 nested_vmx_succeed(vcpu); 10550 if (enable_shadow_vmcs) 10551 vmx->nested.sync_shadow_vmcs = true; 10552 10553 /* in case we halted in L2 */ 10554 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 10555} 10556 10557/* 10558 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 
10559 */ 10560static void vmx_leave_nested(struct kvm_vcpu *vcpu) 10561{ 10562 if (is_guest_mode(vcpu)) 10563 nested_vmx_vmexit(vcpu, -1, 0, 0); 10564 free_nested(to_vmx(vcpu)); 10565} 10566 10567/* 10568 * L1's failure to enter L2 is a subset of a normal exit, as explained in 10569 * 23.7 "VM-entry failures during or after loading guest state" (this also 10570 * lists the acceptable exit-reason and exit-qualification parameters). 10571 * It should only be called before L2 actually succeeded to run, and when 10572 * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss). 10573 */ 10574static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, 10575 struct vmcs12 *vmcs12, 10576 u32 reason, unsigned long qualification) 10577{ 10578 load_vmcs12_host_state(vcpu, vmcs12); 10579 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 10580 vmcs12->exit_qualification = qualification; 10581 nested_vmx_succeed(vcpu); 10582 if (enable_shadow_vmcs) 10583 to_vmx(vcpu)->nested.sync_shadow_vmcs = true; 10584} 10585 10586static int vmx_check_intercept(struct kvm_vcpu *vcpu, 10587 struct x86_instruction_info *info, 10588 enum x86_intercept_stage stage) 10589{ 10590 return X86EMUL_CONTINUE; 10591} 10592 10593static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) 10594{ 10595 if (ple_gap) 10596 shrink_ple_window(vcpu); 10597} 10598 10599static void vmx_slot_enable_log_dirty(struct kvm *kvm, 10600 struct kvm_memory_slot *slot) 10601{ 10602 kvm_mmu_slot_leaf_clear_dirty(kvm, slot); 10603 kvm_mmu_slot_largepage_remove_write_access(kvm, slot); 10604} 10605 10606static void vmx_slot_disable_log_dirty(struct kvm *kvm, 10607 struct kvm_memory_slot *slot) 10608{ 10609 kvm_mmu_slot_set_dirty(kvm, slot); 10610} 10611 10612static void vmx_flush_log_dirty(struct kvm *kvm) 10613{ 10614 kvm_flush_pml_buffers(kvm); 10615} 10616 10617static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, 10618 struct kvm_memory_slot *memslot, 10619 gfn_t offset, unsigned long mask) 10620{ 10621 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); 10622} 10623 10624/* 10625 * This routine does the following things for vCPU which is going 10626 * to be blocked if VT-d PI is enabled. 10627 * - Store the vCPU to the wakeup list, so when interrupts happen 10628 * we can find the right vCPU to wake up. 10629 * - Change the Posted-interrupt descriptor as below: 10630 * 'NDST' <-- vcpu->pre_pcpu 10631 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR 10632 * - If 'ON' is set during this process, which means at least one 10633 * interrupt is posted for this vCPU, we cannot block it, in 10634 * this case, return 1, otherwise, return 0. 10635 * 10636 */ 10637static int vmx_pre_block(struct kvm_vcpu *vcpu) 10638{ 10639 unsigned long flags; 10640 unsigned int dest; 10641 struct pi_desc old, new; 10642 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 10643 10644 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 10645 !irq_remapping_cap(IRQ_POSTING_CAP)) 10646 return 0; 10647 10648 vcpu->pre_pcpu = vcpu->cpu; 10649 spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, 10650 vcpu->pre_pcpu), flags); 10651 list_add_tail(&vcpu->blocked_vcpu_list, 10652 &per_cpu(blocked_vcpu_on_cpu, 10653 vcpu->pre_pcpu)); 10654 spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, 10655 vcpu->pre_pcpu), flags); 10656 10657 do { 10658 old.control = new.control = pi_desc->control; 10659 10660 /* 10661 * We should not block the vCPU if 10662 * an interrupt is posted for it. 
10663 */ 10664 if (pi_test_on(pi_desc) == 1) { 10665 spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, 10666 vcpu->pre_pcpu), flags); 10667 list_del(&vcpu->blocked_vcpu_list); 10668 spin_unlock_irqrestore( 10669 &per_cpu(blocked_vcpu_on_cpu_lock, 10670 vcpu->pre_pcpu), flags); 10671 vcpu->pre_pcpu = -1; 10672 10673 return 1; 10674 } 10675 10676 WARN((pi_desc->sn == 1), 10677 "Warning: SN field of posted-interrupts " 10678 "is set before blocking\n"); 10679 10680 /* 10681 * Since vCPU can be preempted during this process, 10682 * vcpu->cpu could be different with pre_pcpu, we 10683 * need to set pre_pcpu as the destination of wakeup 10684 * notification event, then we can find the right vCPU 10685 * to wakeup in wakeup handler if interrupts happen 10686 * when the vCPU is in blocked state. 10687 */ 10688 dest = cpu_physical_id(vcpu->pre_pcpu); 10689 10690 if (x2apic_enabled()) 10691 new.ndst = dest; 10692 else 10693 new.ndst = (dest << 8) & 0xFF00; 10694 10695 /* set 'NV' to 'wakeup vector' */ 10696 new.nv = POSTED_INTR_WAKEUP_VECTOR; 10697 } while (cmpxchg(&pi_desc->control, old.control, 10698 new.control) != old.control); 10699 10700 return 0; 10701} 10702 10703static void vmx_post_block(struct kvm_vcpu *vcpu) 10704{ 10705 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 10706 struct pi_desc old, new; 10707 unsigned int dest; 10708 unsigned long flags; 10709 10710 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 10711 !irq_remapping_cap(IRQ_POSTING_CAP)) 10712 return; 10713 10714 do { 10715 old.control = new.control = pi_desc->control; 10716 10717 dest = cpu_physical_id(vcpu->cpu); 10718 10719 if (x2apic_enabled()) 10720 new.ndst = dest; 10721 else 10722 new.ndst = (dest << 8) & 0xFF00; 10723 10724 /* Allow posting non-urgent interrupts */ 10725 new.sn = 0; 10726 10727 /* set 'NV' to 'notification vector' */ 10728 new.nv = POSTED_INTR_VECTOR; 10729 } while (cmpxchg(&pi_desc->control, old.control, 10730 new.control) != old.control); 10731 10732 if(vcpu->pre_pcpu != -1) { 10733 spin_lock_irqsave( 10734 &per_cpu(blocked_vcpu_on_cpu_lock, 10735 vcpu->pre_pcpu), flags); 10736 list_del(&vcpu->blocked_vcpu_list); 10737 spin_unlock_irqrestore( 10738 &per_cpu(blocked_vcpu_on_cpu_lock, 10739 vcpu->pre_pcpu), flags); 10740 vcpu->pre_pcpu = -1; 10741 } 10742} 10743 10744/* 10745 * vmx_update_pi_irte - set IRTE for Posted-Interrupts 10746 * 10747 * @kvm: kvm 10748 * @host_irq: host irq of the interrupt 10749 * @guest_irq: gsi of the interrupt 10750 * @set: set or unset PI 10751 * returns 0 on success, < 0 on failure 10752 */ 10753static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, 10754 uint32_t guest_irq, bool set) 10755{ 10756 struct kvm_kernel_irq_routing_entry *e; 10757 struct kvm_irq_routing_table *irq_rt; 10758 struct kvm_lapic_irq irq; 10759 struct kvm_vcpu *vcpu; 10760 struct vcpu_data vcpu_info; 10761 int idx, ret = -EINVAL; 10762 10763 if (!kvm_arch_has_assigned_device(kvm) || 10764 !irq_remapping_cap(IRQ_POSTING_CAP)) 10765 return 0; 10766 10767 idx = srcu_read_lock(&kvm->irq_srcu); 10768 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 10769 BUG_ON(guest_irq >= irq_rt->nr_rt_entries); 10770 10771 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 10772 if (e->type != KVM_IRQ_ROUTING_MSI) 10773 continue; 10774 /* 10775 * VT-d PI cannot support posting multicast/broadcast 10776 * interrupts to a vCPU, we still use interrupt remapping 10777 * for these kind of interrupts. 
10778 * 10779 * For lowest-priority interrupts, we only support 10780 * those with single CPU as the destination, e.g. user 10781 * configures the interrupts via /proc/irq or uses 10782 * irqbalance to make the interrupts single-CPU. 10783 * 10784 * We will support full lowest-priority interrupt later. 10785 */ 10786 10787 kvm_set_msi_irq(e, &irq); 10788 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) 10789 continue; 10790 10791 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); 10792 vcpu_info.vector = irq.vector; 10793 10794 trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, 10795 vcpu_info.vector, vcpu_info.pi_desc_addr, set); 10796 10797 if (set) 10798 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); 10799 else { 10800 /* suppress notification event before unposting */ 10801 pi_set_sn(vcpu_to_pi_desc(vcpu)); 10802 ret = irq_set_vcpu_affinity(host_irq, NULL); 10803 pi_clear_sn(vcpu_to_pi_desc(vcpu)); 10804 } 10805 10806 if (ret < 0) { 10807 printk(KERN_INFO "%s: failed to update PI IRTE\n", 10808 __func__); 10809 goto out; 10810 } 10811 } 10812 10813 ret = 0; 10814out: 10815 srcu_read_unlock(&kvm->irq_srcu, idx); 10816 return ret; 10817} 10818 10819static struct kvm_x86_ops vmx_x86_ops = { 10820 .cpu_has_kvm_support = cpu_has_kvm_support, 10821 .disabled_by_bios = vmx_disabled_by_bios, 10822 .hardware_setup = hardware_setup, 10823 .hardware_unsetup = hardware_unsetup, 10824 .check_processor_compatibility = vmx_check_processor_compat, 10825 .hardware_enable = hardware_enable, 10826 .hardware_disable = hardware_disable, 10827 .cpu_has_accelerated_tpr = report_flexpriority, 10828 .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, 10829 10830 .vcpu_create = vmx_create_vcpu, 10831 .vcpu_free = vmx_free_vcpu, 10832 .vcpu_reset = vmx_vcpu_reset, 10833 10834 .prepare_guest_switch = vmx_save_host_state, 10835 .vcpu_load = vmx_vcpu_load, 10836 .vcpu_put = vmx_vcpu_put, 10837 10838 .update_bp_intercept = update_exception_bitmap, 10839 .get_msr = vmx_get_msr, 10840 .set_msr = vmx_set_msr, 10841 .get_segment_base = vmx_get_segment_base, 10842 .get_segment = vmx_get_segment, 10843 .set_segment = vmx_set_segment, 10844 .get_cpl = vmx_get_cpl, 10845 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 10846 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 10847 .decache_cr3 = vmx_decache_cr3, 10848 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 10849 .set_cr0 = vmx_set_cr0, 10850 .set_cr3 = vmx_set_cr3, 10851 .set_cr4 = vmx_set_cr4, 10852 .set_efer = vmx_set_efer, 10853 .get_idt = vmx_get_idt, 10854 .set_idt = vmx_set_idt, 10855 .get_gdt = vmx_get_gdt, 10856 .set_gdt = vmx_set_gdt, 10857 .get_dr6 = vmx_get_dr6, 10858 .set_dr6 = vmx_set_dr6, 10859 .set_dr7 = vmx_set_dr7, 10860 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 10861 .cache_reg = vmx_cache_reg, 10862 .get_rflags = vmx_get_rflags, 10863 .set_rflags = vmx_set_rflags, 10864 .fpu_activate = vmx_fpu_activate, 10865 .fpu_deactivate = vmx_fpu_deactivate, 10866 10867 .tlb_flush = vmx_flush_tlb, 10868 10869 .run = vmx_vcpu_run, 10870 .handle_exit = vmx_handle_exit, 10871 .skip_emulated_instruction = skip_emulated_instruction, 10872 .set_interrupt_shadow = vmx_set_interrupt_shadow, 10873 .get_interrupt_shadow = vmx_get_interrupt_shadow, 10874 .patch_hypercall = vmx_patch_hypercall, 10875 .set_irq = vmx_inject_irq, 10876 .set_nmi = vmx_inject_nmi, 10877 .queue_exception = vmx_queue_exception, 10878 .cancel_injection = vmx_cancel_injection, 10879 .interrupt_allowed = vmx_interrupt_allowed, 10880 .nmi_allowed = vmx_nmi_allowed, 
10881 .get_nmi_mask = vmx_get_nmi_mask, 10882 .set_nmi_mask = vmx_set_nmi_mask, 10883 .enable_nmi_window = enable_nmi_window, 10884 .enable_irq_window = enable_irq_window, 10885 .update_cr8_intercept = update_cr8_intercept, 10886 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, 10887 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 10888 .get_enable_apicv = vmx_get_enable_apicv, 10889 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, 10890 .load_eoi_exitmap = vmx_load_eoi_exitmap, 10891 .hwapic_irr_update = vmx_hwapic_irr_update, 10892 .hwapic_isr_update = vmx_hwapic_isr_update, 10893 .sync_pir_to_irr = vmx_sync_pir_to_irr, 10894 .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 10895 10896 .set_tss_addr = vmx_set_tss_addr, 10897 .get_tdp_level = get_ept_level, 10898 .get_mt_mask = vmx_get_mt_mask, 10899 10900 .get_exit_info = vmx_get_exit_info, 10901 10902 .get_lpage_level = vmx_get_lpage_level, 10903 10904 .cpuid_update = vmx_cpuid_update, 10905 10906 .rdtscp_supported = vmx_rdtscp_supported, 10907 .invpcid_supported = vmx_invpcid_supported, 10908 10909 .set_supported_cpuid = vmx_set_supported_cpuid, 10910 10911 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 10912 10913 .read_tsc_offset = vmx_read_tsc_offset, 10914 .write_tsc_offset = vmx_write_tsc_offset, 10915 .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, 10916 .read_l1_tsc = vmx_read_l1_tsc, 10917 10918 .set_tdp_cr3 = vmx_set_cr3, 10919 10920 .check_intercept = vmx_check_intercept, 10921 .handle_external_intr = vmx_handle_external_intr, 10922 .mpx_supported = vmx_mpx_supported, 10923 .xsaves_supported = vmx_xsaves_supported, 10924 10925 .check_nested_events = vmx_check_nested_events, 10926 10927 .sched_in = vmx_sched_in, 10928 10929 .slot_enable_log_dirty = vmx_slot_enable_log_dirty, 10930 .slot_disable_log_dirty = vmx_slot_disable_log_dirty, 10931 .flush_log_dirty = vmx_flush_log_dirty, 10932 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 10933 10934 .pre_block = vmx_pre_block, 10935 .post_block = vmx_post_block, 10936 10937 .pmu_ops = &intel_pmu_ops, 10938 10939 .update_pi_irte = vmx_update_pi_irte, 10940}; 10941 10942static int __init vmx_init(void) 10943{ 10944 int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 10945 __alignof__(struct vcpu_vmx), THIS_MODULE); 10946 if (r) 10947 return r; 10948 10949#ifdef CONFIG_KEXEC_CORE 10950 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 10951 crash_vmclear_local_loaded_vmcss); 10952#endif 10953 10954 return 0; 10955} 10956 10957static void __exit vmx_exit(void) 10958{ 10959#ifdef CONFIG_KEXEC_CORE 10960 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 10961 synchronize_rcu(); 10962#endif 10963 10964 kvm_exit(); 10965} 10966 10967module_init(vmx_init) 10968module_exit(vmx_exit)
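/*
 * Overview of the nested VMX path implemented above, in rough order: a
 * VMLAUNCH/VMRESUME by L1 reaches nested_vmx_run(), which validates vmcs12
 * (nested_vmx_check_msr_bitmap_controls(), nested_vmx_check_apicv_controls(),
 * nested_vmx_check_msr_switch_controls(), plus the control, host-state and
 * guest-state checks), pins the guest pages it references
 * (nested_get_vmcs12_pages()), builds vmcs02 via prepare_vmcs02() and
 * processes the VM-entry MSR-load list with nested_vmx_load_msr(). When an L2
 * exit must be reflected to L1, nested_vmx_vmexit() fills in vmcs12 via
 * prepare_vmcs12(), handles the VM-exit MSR lists, switches back to vmcs01
 * and restores L1's state with load_vmcs12_host_state() before reporting
 * success or failure of the original VMLAUNCH/VMRESUME to L1.
 */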